## Initialization

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [4]:
model_accuracies = {'KNN':1, 'LogReg':1, 'DT':1, 'RF':1, 'NB':1, 'LinearSVC':1, 'KernelSVC':1}

## Importing the Data

In [5]:
df = pd.read_csv('yeast.data', header = None, delimiter = r"\s+")
df.shape

(1484, 10)

In [6]:
df.columns = ['Name', 'mcg', 'gvh', 'alm', 'mit', 'erl', 'pox', 'vac', 'nuc', 'Class']

In [7]:
df.head()

Unnamed: 0,Name,mcg,gvh,alm,mit,erl,pox,vac,nuc,Class
0,ADT1_YEAST,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22,MIT
1,ADT2_YEAST,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,MIT
2,ADT3_YEAST,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,MIT
3,AAR2_YEAST,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22,NUC
4,AATM_YEAST,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22,MIT


## Creating X and Y

In [8]:
Y = df.iloc[:, 9].values
Y.shape

(1484,)

In [9]:
Y

array(['MIT', 'MIT', 'MIT', ..., 'ME2', 'NUC', 'CYT'], dtype=object)

In [10]:
X = df.iloc[:, 1:9].values
X.shape

(1484, 8)

In [11]:
X

array([[ 0.58,  0.61,  0.47, ...,  0.  ,  0.48,  0.22],
       [ 0.43,  0.67,  0.48, ...,  0.  ,  0.53,  0.22],
       [ 0.64,  0.62,  0.49, ...,  0.  ,  0.53,  0.22],
       ..., 
       [ 0.67,  0.57,  0.36, ...,  0.  ,  0.56,  0.22],
       [ 0.43,  0.4 ,  0.6 , ...,  0.  ,  0.53,  0.39],
       [ 0.65,  0.54,  0.54, ...,  0.  ,  0.53,  0.22]])

## Preprocess the Data

In [12]:
sc_X = StandardScaler()

In [13]:
X = sc_X.fit_transform(X)
X

array([[ 0.58198136,  0.88848148, -0.3466451 , ..., -0.0991314 ,
        -0.34417514, -0.5279193 ],
       [-0.51089067,  1.37281104, -0.23122636, ..., -0.0991314 ,
         0.52121948, -0.5279193 ],
       [ 1.01913017,  0.96920307, -0.11580762, ..., -0.0991314 ,
         0.52121948, -0.5279193 ],
       ..., 
       [ 1.23770457,  0.5655951 , -1.61625127, ..., -0.0991314 ,
         1.04045625, -0.5279193 ],
       [-0.51089067, -0.80667199,  1.15379854, ..., -0.0991314 ,
         0.52121948,  1.06900494],
       [ 1.0919883 ,  0.32343032,  0.46128609, ..., -0.0991314 ,
         0.52121948, -0.5279193 ]])

In [14]:
le_Y = LabelEncoder()

In [15]:
Y = le_Y.fit_transform(Y)
Y

array([6, 6, 6, ..., 4, 7, 0], dtype=int64)

## Create Train and Test data

In [16]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 4)

In [17]:
X_train.shape

(1187, 8)

In [18]:
X_test.shape

(297, 8)

In [19]:
Y_train.shape

(1187,)

In [20]:
Y_test.shape

(297,)

In [21]:
pd.DataFrame(pd.DataFrame(Y_train)[0].value_counts())

Unnamed: 0,0
0,370
7,350
6,205
5,127
4,39
3,29
9,26
2,24
8,13
1,4


In [22]:
pd.DataFrame(pd.DataFrame(Y_test)[0].value_counts())

Unnamed: 0,0
0,93
7,79
6,39
5,36
3,15
4,12
2,11
8,7
9,4
1,1


## Decision Tree Classifier

In [23]:
clf_dt = DecisionTreeClassifier(criterion = 'entropy')

In [24]:
clf_dt.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [25]:
Y_pred_dt = clf_dt.predict(X_test)

In [26]:
cm_dt = confusion_matrix(Y_pred_dt, Y_test)
cm_dt

array([[42,  0,  0,  0,  0,  3,  9, 21,  3,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  4,  2,  1,  0,  1,  0,  0,  0],
       [ 0,  0,  2, 12,  4,  0,  2,  0,  0,  0],
       [ 2,  0,  2,  0,  3,  1,  0,  1,  0,  1],
       [ 1,  0,  0,  0,  2, 27,  3,  3,  0,  2],
       [12,  0,  3,  0,  1,  1, 13, 12,  1,  0],
       [27,  0,  0,  0,  0,  4,  8, 39,  1,  1],
       [ 6,  0,  0,  0,  1,  0,  1,  2,  2,  0],
       [ 3,  1,  0,  1,  0,  0,  2,  1,  0,  0]], dtype=int64)

## Random Forest Classifier

In [27]:
clf_rf = RandomForestClassifier(n_estimators = 10, criterion = 'entropy')

In [28]:
clf_rf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [29]:
Y_pred_rf = clf_rf.predict(X_test)

In [30]:
cm_rf = confusion_matrix(Y_pred_rf, Y_test)
cm_rf

array([[57,  0,  1,  0,  0,  3, 11, 30,  4,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 1,  0,  6,  1,  1,  0,  0,  0,  0,  0],
       [ 0,  0,  2, 11,  4,  0,  1,  0,  0,  1],
       [ 0,  0,  2,  2,  4,  1,  0,  0,  0,  0],
       [ 2,  0,  0,  0,  1, 27,  3,  5,  0,  2],
       [ 8,  0,  0,  0,  0,  1, 17,  7,  1,  0],
       [24,  0,  0,  0,  1,  3,  7, 37,  1,  1],
       [ 1,  0,  0,  0,  0,  0,  0,  0,  1,  0],
       [ 0,  1,  0,  1,  1,  1,  0,  0,  0,  0]], dtype=int64)

## Naive Bayes Classifier

In [31]:
clf_nb = GaussianNB()

In [32]:
clf_nb.fit(X_train, Y_train)

GaussianNB(priors=None)

In [33]:
Y_pred_nb = clf_nb.predict(X_test)

In [34]:
cm_nb = confusion_matrix(Y_pred_nb, Y_test)
cm_nb

array([[ 3,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  1,  0,  0,  0,  0,  0,  0,  0,  0],
       [11,  0,  7,  3,  2,  0,  5,  3,  1,  1],
       [ 0,  0,  3, 11,  6,  1,  2,  0,  0,  1],
       [ 1,  0,  0,  0,  0,  0,  0,  1,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 1,  0,  0,  0,  0,  0,  6,  0,  0,  0],
       [ 2,  0,  0,  0,  0,  3,  1, 10,  0,  0],
       [ 1,  0,  0,  0,  0,  0,  0,  0,  3,  0],
       [74,  0,  1,  1,  4, 32, 25, 65,  3,  2]], dtype=int64)

## KNN Classifier

In [35]:
clf_knn = KNeighborsClassifier(n_neighbors = 5)

In [36]:
clf_knn.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [37]:
Y_pred_knn = clf_knn.predict(X_test)

In [38]:
cm_knn = confusion_matrix(Y_pred_knn, Y_test)
cm_knn

array([[65,  0,  2,  0,  2,  5, 13, 38,  2,  0],
       [ 0,  1,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  6,  1,  1,  0,  0,  1,  0,  0],
       [ 0,  0,  2, 12,  3,  1,  1,  1,  0,  0],
       [ 1,  0,  1,  2,  3,  0,  1,  0,  0,  0],
       [ 1,  0,  0,  0,  2, 25,  3,  1,  0,  2],
       [ 9,  0,  0,  0,  0,  0, 18,  6,  2,  1],
       [16,  0,  0,  0,  1,  5,  3, 32,  0,  1],
       [ 1,  0,  0,  0,  0,  0,  0,  0,  3,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0]], dtype=int64)

## Logistic Regression

In [39]:
clf_lr = LogisticRegression()

In [40]:
clf_lr.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [41]:
Y_pred_lr = clf_lr.predict(X_test)

In [42]:
cm_lr = confusion_matrix(Y_pred_lr, Y_test)
cm_lr

array([[62,  0,  1,  0,  1,  5, 11, 36,  3,  0],
       [ 0,  1,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  4,  1,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  2,  8,  3,  0,  1,  0,  0,  0],
       [ 0,  0,  2,  0,  2,  0,  1,  1,  0,  0],
       [ 0,  0,  0,  2,  1, 25,  3,  4,  0,  3],
       [12,  0,  2,  4,  4,  0, 18,  7,  2,  0],
       [18,  0,  0,  0,  1,  6,  5, 31,  0,  1],
       [ 1,  0,  0,  0,  0,  0,  0,  0,  2,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0]], dtype=int64)

## SVC Linear

In [43]:
clf_lsvc = SVC(kernel = "linear")

In [44]:
clf_lsvc.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [45]:
Y_pred_lsvc = clf_lsvc.predict(X_test)

In [46]:
cm_lsvc = confusion_matrix(Y_pred_lsvc, Y_test)
cm_lsvc

array([[65,  0,  1,  0,  1,  4, 13, 45,  3,  1],
       [ 0,  1,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 1,  0,  5,  0,  1,  0,  0,  1,  0,  0],
       [ 0,  0,  3, 11,  4,  0,  1,  0,  0,  0],
       [ 0,  0,  2,  2,  2,  0,  1,  1,  0,  0],
       [ 0,  0,  0,  1,  1, 28,  3,  4,  0,  3],
       [14,  0,  0,  1,  3,  0, 19,  5,  1,  0],
       [12,  0,  0,  0,  0,  4,  2, 23,  0,  0],
       [ 1,  0,  0,  0,  0,  0,  0,  0,  3,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0]], dtype=int64)

## SVC Kernel

In [47]:
clf_ksvc = SVC(kernel = "rbf")

In [48]:
clf_ksvc.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [49]:
Y_pred_ksvc = clf_ksvc.predict(X_test)

In [50]:
cm_ksvc = confusion_matrix(Y_pred_ksvc, Y_test)
cm_ksvc

array([[63,  0,  1,  0,  1,  3, 12, 34,  3,  0],
       [ 0,  1,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  6,  0,  1,  0,  0,  0,  0,  0],
       [ 0,  0,  2, 12,  4,  1,  1,  0,  0,  0],
       [ 0,  0,  2,  2,  2,  0,  1,  1,  0,  1],
       [ 2,  0,  0,  0,  1, 26,  3,  4,  0,  2],
       [ 8,  0,  0,  0,  2,  0, 18,  5,  2,  0],
       [20,  0,  0,  1,  1,  6,  4, 35,  0,  1],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  2,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0]], dtype=int64)

## Accuracy of Various Classifiers

In [51]:
model_accuracies['DT'] = accuracy_score(Y_pred_dt, Y_test)
model_accuracies['KNN'] = accuracy_score(Y_pred_knn, Y_test)
model_accuracies['KernelSVC'] = accuracy_score(Y_pred_ksvc, Y_test)
model_accuracies['LinearSVC'] = accuracy_score(Y_pred_lsvc, Y_test)
model_accuracies['LogReg'] = accuracy_score(Y_pred_lr, Y_test)
model_accuracies['NB'] = accuracy_score(Y_pred_nb, Y_test)
model_accuracies['RF'] = accuracy_score(Y_pred_rf, Y_test)
model_accuracies

{'DT': 0.4781144781144781,
 'KNN': 0.55555555555555558,
 'KernelSVC': 0.55555555555555558,
 'LinearSVC': 0.52861952861952866,
 'LogReg': 0.51515151515151514,
 'NB': 0.14478114478114479,
 'RF': 0.53872053872053871}