## Initialization

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [4]:
model_accuracies = {'KNN':1, 'LogReg':1, 'DT':1, 'RF':1, 'NB':1, 'LinearSVC':1, 'KernelSVC':1}

## Importing the Data

In [5]:
df = pd.read_excel('BreastTissue.xls', sheetname = 1)
df.shape

(106, 11)

In [6]:
df.head()

Unnamed: 0,Case #,Class,I0,PA500,HFS,DA,Area,A/DA,Max IP,DR,P
0,1,car,524.794072,0.187448,0.032114,228.800228,6843.598481,29.910803,60.20488,220.737212,556.828334
1,2,car,330.0,0.226893,0.26529,121.154201,3163.239472,26.109202,69.717361,99.084964,400.225776
2,3,car,551.879287,0.232478,0.06353,264.804935,11888.391827,44.894903,77.793297,253.7853,656.769449
3,4,car,380.0,0.240855,0.286234,137.640111,5402.17118,39.248524,88.758446,105.198568,493.701814
4,5,car,362.831266,0.200713,0.244346,124.912559,3290.462446,26.342127,69.389389,103.866552,424.796503


## Create X and Y

In [9]:
Y = df.iloc[:, 1].values
Y.shape

(106,)

In [10]:
Y

array(['car', 'car', 'car', 'car', 'car', 'car', 'car', 'car', 'car',
       'car', 'car', 'car', 'car', 'car', 'car', 'car', 'car', 'car',
       'car', 'car', 'car', 'fad', 'fad', 'fad', 'fad', 'fad', 'fad',
       'fad', 'fad', 'fad', 'fad', 'fad', 'fad', 'fad', 'fad', 'fad',
       'mas', 'mas', 'mas', 'mas', 'mas', 'mas', 'mas', 'mas', 'mas',
       'mas', 'mas', 'mas', 'mas', 'mas', 'mas', 'mas', 'mas', 'mas',
       'gla', 'gla', 'gla', 'gla', 'gla', 'gla', 'gla', 'gla', 'gla',
       'gla', 'gla', 'gla', 'gla', 'gla', 'gla', 'gla', 'con', 'con',
       'con', 'con', 'con', 'con', 'con', 'con', 'con', 'con', 'con',
       'con', 'con', 'con', 'adi', 'adi', 'adi', 'adi', 'adi', 'adi',
       'adi', 'adi', 'adi', 'adi', 'adi', 'adi', 'adi', 'adi', 'adi',
       'adi', 'adi', 'adi', 'adi', 'adi', 'adi', 'adi'], dtype=object)

In [14]:
X = df.iloc[:, 2:].values
X.shape

(106, 9)

In [15]:
X

array([[  5.24794072e+02,   1.87448362e-01,   3.21140582e-02,
          2.28800228e+02,   6.84359848e+03,   2.99108027e+01,
          6.02048798e+01,   2.20737212e+02,   5.56828334e+02],
       [  3.30000000e+02,   2.26892803e-01,   2.65290046e-01,
          1.21154201e+02,   3.16323947e+03,   2.61092018e+01,
          6.97173615e+01,   9.90849640e+01,   4.00225776e+02],
       [  5.51879287e+02,   2.32477856e-01,   6.35299848e-02,
          2.64804935e+02,   1.18883918e+04,   4.48949028e+01,
          7.77932968e+01,   2.53785300e+02,   6.56769449e+02],
       [  3.80000000e+02,   2.40855437e-01,   2.86233997e-01,
          1.37640111e+02,   5.40217118e+03,   3.92485239e+01,
          8.87584457e+01,   1.05198568e+02,   4.93701814e+02],
       [  3.62831266e+02,   2.00712864e-01,   2.44346095e-01,
          1.24912559e+02,   3.29046245e+03,   2.63421265e+01,
          6.93893890e+01,   1.03866552e+02,   4.24796503e+02],
       [  3.89872978e+02,   1.50098316e-01,   9.77384381e-02,
   

## Preprocess the Data

In [16]:
sc_X = StandardScaler()

In [17]:
X = sc_X.fit_transform(X)

In [18]:
le_Y = LabelEncoder()

In [19]:
Y = le_Y.fit_transform(Y)

In [20]:
X

array([[ -3.45765773e-01,   9.85995757e-01,  -8.18663989e-01,
          2.01325569e-01,  -2.65814600e-02,   2.76929535e-01,
         -1.87452440e-01,   2.99395634e-01,  -3.34219072e-01],
       [ -6.05357848e-01,   1.56375157e+00,   1.49303791e+00,
         -3.65532879e-01,  -2.25600861e-01,   1.13379349e-01,
         -6.99581450e-02,  -3.74756121e-01,  -5.40434762e-01],
       [ -3.09670694e-01,   1.64555771e+00,  -5.07207146e-01,
          3.90924509e-01,   2.46221206e-01,   9.21566460e-01,
          2.97925154e-02,   4.82535910e-01,  -2.02615691e-01],
       [ -5.38725411e-01,   1.76826691e+00,   1.70067581e+00,
         -2.78718923e-01,  -1.04528203e-01,   6.78651350e-01,
          1.65229563e-01,  -3.40876789e-01,  -4.17344655e-01],
       [ -5.61605302e-01,   1.18028532e+00,   1.28540002e+00,
         -3.45741555e-01,  -2.18721141e-01,   1.23400098e-01,
         -7.40091265e-02,  -3.48258330e-01,  -5.08079802e-01],
       [ -5.25568199e-01,   4.38917241e-01,  -1.68065250e-01,
   

In [21]:
Y

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

## Create Train and Test data

In [22]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 4)

In [23]:
X_train.shape

(84, 9)

In [24]:
X_test.shape

(22, 9)

In [25]:
Y_train.shape

(84,)

In [26]:
Y_test.shape

(22,)

In [27]:
pd.DataFrame(pd.DataFrame(Y_train)[0].value_counts())

Unnamed: 0,0
0,19
4,15
1,14
5,13
3,12
2,11


In [28]:
pd.DataFrame(pd.DataFrame(Y_test)[0].value_counts())

Unnamed: 0,0
1,7
5,5
3,3
2,3
0,3
4,1


## Decision Tree Classifier

In [29]:
clf_dt = DecisionTreeClassifier(criterion = 'entropy')

In [30]:
clf_dt.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [31]:
Y_pred_dt = clf_dt.predict(X_test)

In [32]:
cm_dt = confusion_matrix(Y_pred_dt, Y_test)
cm_dt

array([[3, 0, 0, 0, 0, 0],
       [0, 5, 0, 0, 0, 0],
       [0, 0, 3, 0, 0, 0],
       [0, 1, 0, 3, 0, 1],
       [0, 1, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 3]], dtype=int64)

## Random Forest Classifier

In [33]:
clf_rf = RandomForestClassifier(n_estimators = 10, criterion = 'entropy')

In [34]:
clf_rf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [35]:
Y_pred_rf = clf_rf.predict(X_test)

In [36]:
cm_rf = confusion_matrix(Y_pred_rf, Y_test)
cm_rf

array([[3, 0, 0, 0, 0, 0],
       [0, 5, 0, 0, 0, 0],
       [0, 0, 3, 0, 0, 0],
       [0, 1, 0, 2, 0, 1],
       [0, 1, 0, 0, 1, 3],
       [0, 0, 0, 1, 0, 1]], dtype=int64)

## Naive Bayes Classifier

In [37]:
clf_nb = GaussianNB()

In [38]:
clf_nb.fit(X_train, Y_train)

GaussianNB(priors=None)

In [39]:
Y_pred_nb = clf_nb.predict(X_test)

In [40]:
cm_nb = confusion_matrix(Y_pred_nb, Y_test)
cm_nb

array([[2, 0, 0, 0, 0, 0],
       [0, 5, 0, 0, 0, 0],
       [1, 0, 3, 0, 0, 0],
       [0, 0, 0, 1, 0, 1],
       [0, 0, 0, 0, 1, 2],
       [0, 2, 0, 2, 0, 2]], dtype=int64)

## KNN Classifier

In [41]:
clf_knn = KNeighborsClassifier(n_neighbors = 5)

In [42]:
clf_knn.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [43]:
Y_pred_knn = clf_knn.predict(X_test)

In [44]:
cm_knn = confusion_matrix(Y_pred_knn, Y_test)
cm_knn

array([[2, 0, 0, 0, 0, 0],
       [0, 6, 0, 0, 0, 0],
       [1, 0, 3, 0, 0, 0],
       [0, 0, 0, 2, 0, 2],
       [0, 0, 0, 1, 1, 1],
       [0, 1, 0, 0, 0, 2]], dtype=int64)

## Logistic Regression

In [45]:
clf_lr = LogisticRegression()

In [46]:
clf_lr.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [47]:
Y_pred_lr = clf_lr.predict(X_test)

In [48]:
cm_lr = confusion_matrix(Y_pred_lr, Y_test)
cm_lr

array([[3, 0, 0, 0, 0, 0],
       [0, 7, 0, 0, 0, 0],
       [0, 0, 3, 0, 0, 0],
       [0, 0, 0, 2, 0, 1],
       [0, 0, 0, 1, 1, 4],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

## SVC Linear

In [49]:
clf_lsvc = SVC(kernel = "linear")

In [50]:
clf_lsvc.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [51]:
Y_pred_lsvc = clf_lsvc.predict(X_test)

In [52]:
cm_lsvc = confusion_matrix(Y_pred_lsvc, Y_test)
cm_lsvc

array([[3, 0, 0, 0, 0, 0],
       [0, 6, 0, 0, 0, 0],
       [0, 0, 3, 0, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 1, 0, 0, 1, 3],
       [0, 0, 0, 2, 0, 2]], dtype=int64)

## SVC Kernel

In [53]:
clf_ksvc = SVC(kernel = "rbf")

In [54]:
clf_ksvc.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [55]:
Y_pred_ksvc = clf_ksvc.predict(X_test)

In [56]:
cm_ksvc = confusion_matrix(Y_pred_ksvc, Y_test)
cm_ksvc

array([[3, 0, 0, 0, 0, 0],
       [0, 6, 0, 0, 0, 0],
       [0, 0, 3, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 1, 0, 3, 1, 5],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

## Accuracy of Various Classifiers

In [57]:
model_accuracies['DT'] = accuracy_score(Y_pred_dt, Y_test)
model_accuracies['KNN'] = accuracy_score(Y_pred_knn, Y_test)
model_accuracies['KernelSVC'] = accuracy_score(Y_pred_ksvc, Y_test)
model_accuracies['LinearSVC'] = accuracy_score(Y_pred_lsvc, Y_test)
model_accuracies['LogReg'] = accuracy_score(Y_pred_lr, Y_test)
model_accuracies['NB'] = accuracy_score(Y_pred_nb, Y_test)
model_accuracies['RF'] = accuracy_score(Y_pred_rf, Y_test)
model_accuracies

{'DT': 0.81818181818181823,
 'KNN': 0.72727272727272729,
 'KernelSVC': 0.59090909090909094,
 'LinearSVC': 0.72727272727272729,
 'LogReg': 0.72727272727272729,
 'NB': 0.63636363636363635,
 'RF': 0.68181818181818177}