## Initialization

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [3]:
model_accuracies = {'LogReg':1, 'DT':1, 'RF':1, 'LinearSVC':1, 'KernelSVC':1, 'NB':1, 'KNN':1}

## Importing the data

In [4]:
dataset = pd.read_csv('ThoraricSurgery.csv', header = None)
dataset.shape

(470, 17)

In [5]:
dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,DGN2,2.88,2.16,PRZ1,F,F,F,T,T,OC14,F,F,F,T,F,60,F
1,DGN3,3.4,1.88,PRZ0,F,F,F,F,F,OC12,F,F,F,T,F,51,F
2,DGN3,2.76,2.08,PRZ1,F,F,F,T,F,OC11,F,F,F,T,F,59,F
3,DGN3,3.68,3.04,PRZ0,F,F,F,F,F,OC11,F,F,F,F,F,54,F
4,DGN3,2.44,0.96,PRZ2,F,T,F,T,T,OC11,F,F,F,T,F,73,T


## Create X and Y

In [6]:
X = dataset.iloc[:, 0:16].values
Y = dataset.iloc[:, 16].values

In [7]:
X.shape

(470, 16)

In [8]:
Y.shape

(470,)

In [9]:
X

array([['DGN2', 2.88, 2.16, ..., 'T', 'F', 60],
       ['DGN3', 3.4, 1.88, ..., 'T', 'F', 51],
       ['DGN3', 2.76, 2.08, ..., 'T', 'F', 59],
       ..., 
       ['DGN3', 3.04, 2.08, ..., 'F', 'F', 52],
       ['DGN3', 1.96, 1.68, ..., 'T', 'F', 79],
       ['DGN3', 4.72, 3.56, ..., 'T', 'F', 51]], dtype=object)

In [10]:
Y

array(['F', 'F', 'F', 'F', 'T', 'F', 'T', 'T', 'F', 'F', 'F', 'F', 'F',
       'T', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'T', 'F',
       'F', 'T', 'F', 'F', 'T', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F',
       'F', 'T', 'T', 'F', 'T', 'F', 'F', 'F', 'T', 'F', 'F', 'F', 'F',
       'F', 'F', 'F', 'F', 'F', 'T', 'F', 'F', 'T', 'F', 'F', 'F', 'F',
       'F', 'F', 'T', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'T', 'F', 'T',
       'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F',
       'F', 'T', 'F', 'F', 'F', 'T', 'F', 'F', 'T', 'F', 'F', 'F', 'F',
       'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'T', 'F',
       'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'T', 'T', 'F',
       'F', 'F', 'T', 'F', 'F', 'F', 'F', 'T', 'F', 'F', 'F', 'F', 'F',
       'T', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'T', 'T', 'F', 'F', 'F',
       'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F',
       'T', 'F', 'F', 'F', 'T', 'F', 'T', 'F', 'F', 'F', 'F', 'F

## Preprocess the Data

In [11]:
le_Y = LabelEncoder()

In [12]:
Y = le_Y.fit_transform(Y)
Y

array([0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0,

In [13]:
def enocder_X(index):
    le_X = LabelEncoder()
    X[:, index] = le_X.fit_transform(X[:, index])

In [14]:
X_indices = [0] + list(range(3,15))
X_indices

[0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]

In [15]:
for x in X_indices:
    enocder_X(x)

In [16]:
X

array([[1, 2.88, 2.16, ..., 1, 0, 60],
       [2, 3.4, 1.88, ..., 1, 0, 51],
       [2, 2.76, 2.08, ..., 1, 0, 59],
       ..., 
       [2, 3.04, 2.08, ..., 0, 0, 52],
       [2, 1.96, 1.68, ..., 1, 0, 79],
       [2, 4.72, 3.56, ..., 1, 0, 51]], dtype=object)

In [17]:
ohe_X = OneHotEncoder(categorical_features = [9])

In [18]:
X = ohe_X.fit_transform(X).toarray()
X.shape

(470, 19)

In [19]:
X = X[:, 1:]
X.shape

(470, 18)

In [20]:
ohe_X = OneHotEncoder(categorical_features = [6])

In [21]:
X = ohe_X.fit_transform(X).toarray()
X.shape

(470, 20)

In [22]:
X = X[:, 1:]
X.shape

(470, 19)

In [23]:
ohe_X = OneHotEncoder(categorical_features = [5])

In [24]:
X = ohe_X.fit_transform(X).toarray()
X.shape

(470, 25)

In [25]:
X = X[:, 1:]
X.shape

(470, 24)

In [26]:
sc_X = StandardScaler()

In [27]:
X = sc_X.fit_transform(X)
X

array([[ 2.83521808, -1.69832197, -0.33333333, ...,  0.46649392,
        -0.06537205, -0.29134848],
       [-0.35270655,  0.5888165 , -0.33333333, ...,  0.46649392,
        -0.06537205, -1.32611263],
       [-0.35270655,  0.5888165 , -0.33333333, ...,  0.46649392,
        -0.06537205, -0.40632228],
       ..., 
       [-0.35270655,  0.5888165 , -0.33333333, ..., -2.14365065,
        -0.06537205, -1.21113883],
       [-0.35270655,  0.5888165 , -0.33333333, ...,  0.46649392,
        -0.06537205,  1.89315359],
       [-0.35270655,  0.5888165 , -0.33333333, ...,  0.46649392,
        -0.06537205, -1.32611263]])

## Create Train and Test Data

In [28]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 4)

In [29]:
X_train.shape

(376, 24)

In [30]:
X_test.shape

(94, 24)

In [31]:
Y_train.shape

(376,)

In [32]:
Y_test.shape

(94,)

In [33]:
pd.DataFrame(pd.DataFrame(Y_train)[0].value_counts())

Unnamed: 0,0
0,324
1,52


In [34]:
pd.DataFrame(pd.DataFrame(Y_test)[0].value_counts())

Unnamed: 0,0
0,76
1,18


## DecisionTree

In [35]:
clf_dt = DecisionTreeClassifier(criterion = 'entropy')

In [36]:
clf_dt.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [37]:
Y_pred_dt = clf_dt.predict(X_test)

In [38]:
confusion_matrix(Y_pred_dt, Y_test)

array([[57, 14],
       [19,  4]], dtype=int64)

## Random Forest

In [39]:
clf_rf = RandomForestClassifier(n_estimators = 10, criterion = 'entropy')

In [40]:
clf_rf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [41]:
Y_pred_rf = clf_rf.predict(X_test)

In [42]:
confusion_matrix(Y_pred_rf, Y_test)

array([[75, 17],
       [ 1,  1]], dtype=int64)

## Naive Bayes

In [43]:
clf_nb = GaussianNB()

In [44]:
clf_nb.fit(X_train, Y_train)

GaussianNB(priors=None)

In [45]:
Y_pred_nb = clf_nb.predict(X_test)

In [46]:
confusion_matrix(Y_pred_nb, Y_test)

array([[ 1,  1],
       [75, 17]], dtype=int64)

## KNN

In [47]:
clf_knn = KNeighborsClassifier(n_neighbors = 5)

In [48]:
clf_knn.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [49]:
Y_pred_knn = clf_knn.predict(X_test)

In [50]:
confusion_matrix(Y_pred_knn, Y_test)

array([[75, 18],
       [ 1,  0]], dtype=int64)

## Logistic Regression

In [51]:
clf_lr = LogisticRegression()

In [52]:
clf_lr.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [53]:
Y_pred_lr = clf_lr.predict(X_test)

In [54]:
confusion_matrix(Y_pred_lr, Y_test)

array([[74, 17],
       [ 2,  1]], dtype=int64)

## Linear SVC

In [55]:
clf_lsvc = SVC(kernel = 'linear')

In [56]:
clf_lsvc.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [57]:
Y_pred_lsvc = clf_lsvc.predict(X_test)

In [58]:
confusion_matrix(Y_pred_lsvc, Y_test)

array([[76, 18],
       [ 0,  0]], dtype=int64)

## Kernel SVC

In [59]:
clf_ksvc = SVC(kernel = 'rbf')

In [60]:
clf_ksvc.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [61]:
Y_pred_ksvc = clf_ksvc.predict(X_test)

In [62]:
confusion_matrix(Y_pred_ksvc, Y_test)

array([[76, 18],
       [ 0,  0]], dtype=int64)

## Accuracy of Various Models

In [63]:
model_accuracies['DT'] = accuracy_score(Y_pred_dt, Y_test)
model_accuracies['KNN'] = accuracy_score(Y_pred_knn, Y_test)
model_accuracies['KernelSVC'] = accuracy_score(Y_pred_ksvc, Y_test)
model_accuracies['LinearSVC'] = accuracy_score(Y_pred_lsvc, Y_test)
model_accuracies['LogReg'] = accuracy_score(Y_pred_lr, Y_test)
model_accuracies['NB'] = accuracy_score(Y_pred_nb, Y_test)
model_accuracies['RF'] = accuracy_score(Y_pred_rf, Y_test)
model_accuracies

{'DT': 0.64893617021276595,
 'KNN': 0.7978723404255319,
 'KernelSVC': 0.80851063829787229,
 'LinearSVC': 0.80851063829787229,
 'LogReg': 0.7978723404255319,
 'NB': 0.19148936170212766,
 'RF': 0.80851063829787229}