## Initialization

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [4]:
model_accuracies = {'KNN':1, 'LogReg':1, 'DT':1, 'RF':1, 'NB':1, 'LinearSVC':1, 'KernelSVC':1}

## Importing the Data

In [5]:
df = pd.read_csv('tae.data', header = None)
df.shape

(151, 6)

In [6]:
df.head()

Unnamed: 0,0,1,2,3,4,5
0,1,23,3,1,19,3
1,2,15,3,1,17,3
2,1,23,3,2,49,3
3,1,5,2,2,33,3
4,2,7,11,2,55,3


## Creating X and Y

In [7]:
X = df.iloc[:, 0:5].values
Y = df.iloc[:, 5].values

In [8]:
Y.shape

(151,)

In [9]:
X.shape

(151, 5)

In [10]:
X.shape[1]

5

In [11]:
X

array([[ 1, 23,  3,  1, 19],
       [ 2, 15,  3,  1, 17],
       [ 1, 23,  3,  2, 49],
       [ 1,  5,  2,  2, 33],
       [ 2,  7, 11,  2, 55],
       [ 2, 23,  3,  1, 20],
       [ 2,  9,  5,  2, 19],
       [ 2, 10,  3,  2, 27],
       [ 1, 22,  3,  1, 58],
       [ 2, 15,  3,  1, 20],
       [ 2, 10, 22,  2,  9],
       [ 2, 13,  1,  2, 30],
       [ 2, 18, 21,  2, 29],
       [ 2,  6, 17,  2, 39],
       [ 2,  6, 17,  2, 42],
       [ 2,  6, 17,  2, 43],
       [ 2,  7, 11,  2, 10],
       [ 2, 22,  3,  2, 46],
       [ 2, 13,  3,  1, 10],
       [ 2,  7, 25,  2, 42],
       [ 2, 25,  7,  2, 27],
       [ 2, 25,  7,  2, 23],
       [ 2,  2,  9,  2, 31],
       [ 2,  1, 15,  1, 22],
       [ 2, 15, 13,  2, 37],
       [ 2,  7, 11,  2, 13],
       [ 2,  8,  3,  2, 24],
       [ 2, 14, 15,  2, 38],
       [ 2, 21,  2,  2, 42],
       [ 2, 22,  3,  2, 28],
       [ 2, 11,  1,  2, 51],
       [ 2, 18,  5,  2, 19],
       [ 2, 13,  1,  2, 31],
       [ 1, 13,  3,  1, 13],
       [ 2,  5

In [12]:
Y

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

## Preprocess the Data

In [13]:
le_Y = LabelEncoder()

In [14]:
Y = le_Y.fit_transform(Y)
Y

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [15]:
def encoder_X(index):
    le_X = LabelEncoder()
    X[:, index] = le_X.fit_transform(X[:, index])

In [16]:
for x in range(0, X.shape[1] - 1):
    encoder_X(x)

In [17]:
X

array([[ 0, 22,  2,  0, 19],
       [ 1, 14,  2,  0, 17],
       [ 0, 22,  2,  1, 49],
       [ 0,  4,  1,  1, 33],
       [ 1,  6, 10,  1, 55],
       [ 1, 22,  2,  0, 20],
       [ 1,  8,  4,  1, 19],
       [ 1,  9,  2,  1, 27],
       [ 0, 21,  2,  0, 58],
       [ 1, 14,  2,  0, 20],
       [ 1,  9, 21,  1,  9],
       [ 1, 12,  0,  1, 30],
       [ 1, 17, 20,  1, 29],
       [ 1,  5, 16,  1, 39],
       [ 1,  5, 16,  1, 42],
       [ 1,  5, 16,  1, 43],
       [ 1,  6, 10,  1, 10],
       [ 1, 21,  2,  1, 46],
       [ 1, 12,  2,  0, 10],
       [ 1,  6, 24,  1, 42],
       [ 1, 24,  6,  1, 27],
       [ 1, 24,  6,  1, 23],
       [ 1,  1,  8,  1, 31],
       [ 1,  0, 14,  0, 22],
       [ 1, 14, 12,  1, 37],
       [ 1,  6, 10,  1, 13],
       [ 1,  7,  2,  1, 24],
       [ 1, 13, 14,  1, 38],
       [ 1, 20,  1,  1, 42],
       [ 1, 21,  2,  1, 28],
       [ 1, 10,  0,  1, 51],
       [ 1, 17,  4,  1, 19],
       [ 1, 12,  0,  1, 31],
       [ 0, 12,  2,  0, 13],
       [ 1,  4

In [18]:
ohe_X = OneHotEncoder(categorical_features = [2])

In [19]:
X = ohe_X.fit_transform(X).toarray()
X = X[:, 1:]
X.shape

(151, 29)

In [20]:
pd.DataFrame(pd.DataFrame(X[:, 26])[0].value_counts())

Unnamed: 0,0
22.0,17
12.0,14
21.0,12
6.0,11
14.0,8
8.0,8
9.0,8
13.0,8
17.0,8
5.0,8


In [21]:
ohe_X = OneHotEncoder(categorical_features = [26])

In [22]:
X = ohe_X.fit_transform(X).toarray()
X = X[:, 1:]
X.shape

(151, 52)

In [59]:
sc_X = StandardScaler()

In [60]:
X = sc_X.fit_transform(X)
X

array([[-0.16495722, -0.11585689, -0.11585689, ..., -2.05107205,
        -2.3590713 , -0.69002837],
       [-0.16495722, -0.11585689, -0.11585689, ...,  0.48754991,
        -2.3590713 , -0.84565836],
       [-0.16495722, -0.11585689, -0.11585689, ..., -2.05107205,
         0.42389562,  1.6444216 ],
       ..., 
       [-0.16495722, -0.11585689, -0.11585689, ..., -2.05107205,
         0.42389562,  1.5666066 ],
       [-0.16495722, -0.11585689, -0.11585689, ...,  0.48754991,
         0.42389562,  1.80005159],
       [ 6.06217783, -0.11585689, -0.11585689, ...,  0.48754991,
         0.42389562, -0.06750838]])

## Create Train and Test data

In [61]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

In [62]:
X_train.shape

(120, 52)

In [63]:
X_test.shape

(31, 52)

In [64]:
Y_train.shape

(120,)

In [65]:
Y_test.shape

(31,)

In [66]:
pd.DataFrame(pd.DataFrame(Y_train)[0].value_counts())

Unnamed: 0,0
0,42
2,40
1,38


In [67]:
pd.DataFrame(pd.DataFrame(Y_test)[0].value_counts())

Unnamed: 0,0
2,12
1,12
0,7


## Decision Tree Classifier

In [68]:
clf_dt = DecisionTreeClassifier(criterion = 'entropy')

In [69]:
clf_dt.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [70]:
Y_pred_dt = clf_dt.predict(X_test)

In [71]:
cm_dt = confusion_matrix(Y_pred_dt, Y_test)
cm_dt

array([[5, 5, 3],
       [1, 6, 5],
       [1, 1, 4]], dtype=int64)

## Random Forest Classifier

In [72]:
clf_rf = RandomForestClassifier(n_estimators = 10, criterion = 'entropy')

In [73]:
clf_rf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [74]:
Y_pred_rf = clf_rf.predict(X_test)

In [75]:
cm_rf = confusion_matrix(Y_pred_rf, Y_test)
cm_rf

array([[7, 4, 0],
       [0, 7, 4],
       [0, 1, 8]], dtype=int64)

## Naive Bayes Classifier

In [76]:
clf_nb = GaussianNB()

In [77]:
clf_nb.fit(X_train, Y_train)

GaussianNB(priors=None)

In [78]:
Y_pred_nb = clf_nb.predict(X_test)

In [79]:
cm_nb = confusion_matrix(Y_pred_nb, Y_test)
cm_nb

array([[ 0,  0,  0],
       [ 1,  2,  3],
       [ 6, 10,  9]], dtype=int64)

## KNN Classifier

In [80]:
clf_knn = KNeighborsClassifier(n_neighbors = 5)

In [81]:
clf_knn.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [82]:
Y_pred_knn = clf_knn.predict(X_test)

In [83]:
cm_knn = confusion_matrix(Y_pred_knn, Y_test)
cm_knn

array([[2, 3, 4],
       [2, 6, 3],
       [3, 3, 5]], dtype=int64)

## Logistic Regression

In [84]:
clf_lr = LogisticRegression()

In [85]:
clf_lr.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [86]:
Y_pred_lr = clf_lr.predict(X_test)

In [87]:
cm_lr = confusion_matrix(Y_pred_lr, Y_test)
cm_lr

array([[3, 6, 2],
       [1, 6, 3],
       [3, 0, 7]], dtype=int64)

## SVC Linear

In [88]:
clf_lsvc = SVC(kernel = "linear")

In [89]:
clf_lsvc.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [90]:
Y_pred_lsvc = clf_lsvc.predict(X_test)

In [91]:
cm_lsvc = confusion_matrix(Y_pred_lsvc, Y_test)
cm_lsvc

array([[2, 5, 2],
       [1, 6, 4],
       [4, 1, 6]], dtype=int64)

## SVC Kernel

In [92]:
clf_ksvc = SVC(kernel = "rbf")

In [93]:
clf_ksvc.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [94]:
Y_pred_ksvc = clf_ksvc.predict(X_test)

In [95]:
cm_ksvc = confusion_matrix(Y_pred_ksvc, Y_test)
cm_ksvc

array([[0, 7, 3],
       [2, 5, 2],
       [5, 0, 7]], dtype=int64)

## Accuracy of Various Classifiers

In [96]:
model_accuracies['DT'] = accuracy_score(Y_pred_dt, Y_test)
model_accuracies['KNN'] = accuracy_score(Y_pred_knn, Y_test)
model_accuracies['KernelSVC'] = accuracy_score(Y_pred_ksvc, Y_test)
model_accuracies['LinearSVC'] = accuracy_score(Y_pred_lsvc, Y_test)
model_accuracies['LogReg'] = accuracy_score(Y_pred_lr, Y_test)
model_accuracies['NB'] = accuracy_score(Y_pred_nb, Y_test)
model_accuracies['RF'] = accuracy_score(Y_pred_rf, Y_test)
model_accuracies

{'DT': 0.4838709677419355,
 'KNN': 0.41935483870967744,
 'KernelSVC': 0.38709677419354838,
 'LinearSVC': 0.45161290322580644,
 'LogReg': 0.5161290322580645,
 'NB': 0.35483870967741937,
 'RF': 0.70967741935483875}