## Initialization

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [3]:
model_accuracies = {'LogReg':1, 'DT':1, 'RF':1, 'LinearSVC':1, 'KernelSVC':1, 'NB':1, 'KNN':1}

## Importing the data

In [4]:
dataset = pd.read_csv('semeion.data', header = None, delimiter = r"\s+")
dataset.shape

(1593, 266)

In [5]:
dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,256,257,258,259,260,261,262,263,264,265
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,...,1,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,...,1,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1,0,0,0,0,0,0,0,0,0


## Create X and Y

In [6]:
X = dataset.iloc[:, 0:256].values
Y = dataset.iloc[:, 256:].values

In [7]:
X.shape

(1593, 256)

In [8]:
Y.shape

(1593, 10)

In [9]:
X[0]

array([ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  1.,
        1.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,
        1.,  1.,  1.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  1.,
        1.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  0.,
        0.,  0.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,
        1.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  1.,  1.,
        1.,  1.,  1.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,
        1.,  1.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,
        0.,  0.,  1.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        1.,  1.,  0.,  0.,  0.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,
        0.,  1.,  1.,  1.,  0.,  0.,  0.,  1.,  1.,  1.,  0.,  0

In [10]:
Y[0]

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [11]:
Y = np.argmax(Y, axis = 1)
Y.shape

(1593,)

In [12]:
Y[0]

0

## Create Train and Test Data

In [13]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 4)

In [14]:
X_train.shape

(1274, 256)

In [15]:
X_test.shape

(319, 256)

In [16]:
Y_train.shape

(1274,)

In [17]:
Y_test.shape

(319,)

In [18]:
pd.DataFrame(pd.DataFrame(Y_train)[0].value_counts())

Unnamed: 0,0
7,134
2,134
5,130
1,129
6,127
3,126
9,125
0,125
8,122
4,122


In [19]:
pd.DataFrame(pd.DataFrame(Y_test)[0].value_counts())

Unnamed: 0,0
4,39
0,36
6,34
9,33
8,33
3,33
1,33
5,29
2,25
7,24


## DecisionTree

In [20]:
clf_dt = DecisionTreeClassifier(criterion = 'entropy')

In [21]:
clf_dt.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [22]:
Y_pred_dt = clf_dt.predict(X_test)

In [23]:
confusion_matrix(Y_test, Y_pred_dt)

array([[32,  0,  0,  0,  0,  1,  0,  0,  1,  2],
       [ 0, 25,  2,  0,  3,  1,  0,  2,  0,  0],
       [ 0,  3, 14,  3,  2,  0,  2,  0,  1,  0],
       [ 0,  0,  0, 28,  0,  1,  0,  0,  0,  4],
       [ 0,  2,  2,  0, 29,  1,  2,  1,  1,  1],
       [ 0,  1,  0,  2,  1, 21,  0,  0,  2,  2],
       [ 0,  0,  3,  0,  2,  2, 27,  0,  0,  0],
       [ 0,  3,  0,  1,  1,  2,  0, 17,  0,  0],
       [ 1,  1,  2,  3,  2,  0,  1,  0, 21,  2],
       [ 2,  4,  0,  3,  1,  3,  0,  1,  3, 16]], dtype=int64)

## Random Forest

In [24]:
clf_rf = RandomForestClassifier(n_estimators = 10, criterion = 'entropy')

In [25]:
clf_rf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [26]:
Y_pred_rf = clf_rf.predict(X_test)

In [27]:
confusion_matrix(Y_test, Y_pred_rf)

array([[36,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 33,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0, 23,  0,  1,  0,  0,  1,  0,  0],
       [ 0,  2,  0, 29,  0,  1,  0,  0,  0,  1],
       [ 2,  2,  0,  0, 32,  1,  2,  0,  0,  0],
       [ 0,  0,  1,  0,  0, 26,  0,  0,  0,  2],
       [ 0,  0,  1,  0,  2,  0, 31,  0,  0,  0],
       [ 0,  0,  0,  1,  1,  0,  0, 21,  1,  0],
       [ 0,  1,  4,  0,  0,  0,  0,  3, 24,  1],
       [ 1,  0,  1,  1,  1,  1,  0,  2,  1, 25]], dtype=int64)

## Naive Bayes

In [28]:
clf_nb = GaussianNB()

In [29]:
clf_nb.fit(X_train, Y_train)

GaussianNB(priors=None)

In [30]:
Y_pred_nb = clf_nb.predict(X_test)

In [31]:
confusion_matrix(Y_test, Y_pred_nb)

array([[34,  0,  0,  0,  1,  0,  0,  0,  0,  1],
       [ 0, 31,  0,  0,  0,  0,  0,  2,  0,  0],
       [ 0,  8, 14,  0,  1,  1,  1,  0,  0,  0],
       [ 0,  4,  0, 28,  0,  1,  0,  0,  0,  0],
       [ 1,  5,  0,  0, 26,  1,  5,  1,  0,  0],
       [ 1,  2,  0,  0,  0, 23,  1,  1,  0,  1],
       [ 0,  3,  0,  0,  1,  0, 30,  0,  0,  0],
       [ 0,  0,  0,  1,  0,  0,  0, 23,  0,  0],
       [ 0, 10,  0,  0,  0,  0,  0,  0, 23,  0],
       [ 0,  6,  0,  1,  0,  0,  0,  0,  3, 23]], dtype=int64)

## KNN

In [32]:
clf_knn = KNeighborsClassifier(n_neighbors = 5)

In [33]:
clf_knn.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [34]:
Y_pred_knn = clf_knn.predict(X_test)

In [35]:
confusion_matrix(Y_test, Y_pred_knn)

array([[36,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 33,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0, 25,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  1,  0, 32,  0,  0,  0,  0,  0,  0],
       [ 0,  3,  0,  0, 36,  0,  0,  0,  0,  0],
       [ 0,  2,  0,  0,  0, 26,  1,  0,  0,  0],
       [ 0,  0,  0,  0,  1,  0, 33,  0,  0,  0],
       [ 0,  2,  0,  0,  0,  0,  0, 22,  0,  0],
       [ 0,  0,  1,  2,  0,  1,  0,  0, 29,  0],
       [ 0,  2,  0,  3,  1,  5,  1,  0,  0, 21]], dtype=int64)

## Logistic Regression

In [36]:
clf_lr = LogisticRegression()

In [37]:
clf_lr.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [38]:
Y_pred_lr = clf_lr.predict(X_test)

In [39]:
confusion_matrix(Y_test, Y_pred_lr)

array([[36,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 29,  1,  0,  1,  0,  0,  2,  0,  0],
       [ 0,  0, 23,  0,  0,  0,  1,  0,  0,  1],
       [ 0,  1,  0, 30,  0,  1,  0,  0,  0,  1],
       [ 0,  1,  0,  0, 33,  1,  3,  0,  0,  1],
       [ 0,  0,  0,  1,  0, 25,  1,  0,  0,  2],
       [ 0,  0,  0,  0,  1,  0, 33,  0,  0,  0],
       [ 0,  0,  0,  0,  1,  0,  0, 23,  0,  0],
       [ 0,  1,  1,  0,  0,  0,  0,  0, 29,  2],
       [ 0,  1,  0,  2,  1,  0,  0,  0,  4, 25]], dtype=int64)

## Linear SVC

In [40]:
clf_lsvc = SVC(kernel = 'linear')

In [41]:
clf_lsvc.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [42]:
Y_pred_lsvc = clf_lsvc.predict(X_test)

In [43]:
confusion_matrix(Y_test, Y_pred_lsvc)

array([[36,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 32,  0,  0,  0,  0,  0,  1,  0,  0],
       [ 0,  0, 25,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  1,  0, 31,  0,  1,  0,  0,  0,  0],
       [ 0,  0,  0,  0, 39,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  2,  0, 27,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0, 34,  0,  0,  0],
       [ 0,  1,  0,  0,  0,  0,  0, 23,  0,  0],
       [ 0,  0,  0,  1,  0,  0,  0,  0, 31,  1],
       [ 0,  3,  0,  1,  0,  0,  0,  0,  1, 28]], dtype=int64)

## Kernel SVC

In [44]:
clf_ksvc = SVC(kernel = 'rbf')

In [45]:
clf_ksvc.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [46]:
Y_pred_ksvc = clf_ksvc.predict(X_test)

In [47]:
confusion_matrix(Y_test, Y_pred_ksvc)

array([[36,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 32,  0,  0,  0,  0,  0,  1,  0,  0],
       [ 0,  0, 25,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  2,  0, 30,  0,  1,  0,  0,  0,  0],
       [ 0,  0,  0,  0, 37,  0,  2,  0,  0,  0],
       [ 0,  0,  0,  1,  0, 28,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  1,  0, 33,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0, 24,  0,  0],
       [ 0,  1,  1,  0,  0,  0,  0,  0, 31,  0],
       [ 0,  1,  0,  2,  1,  0,  0,  0,  2, 27]], dtype=int64)

## Accuracy of Various Models

In [48]:
model_accuracies['DT'] = accuracy_score(Y_test, Y_pred_dt)
model_accuracies['KNN'] = accuracy_score(Y_test, Y_pred_knn)
model_accuracies['KernelSVC'] = accuracy_score(Y_test, Y_pred_ksvc)
model_accuracies['LinearSVC'] = accuracy_score(Y_test, Y_pred_lsvc)
model_accuracies['LogReg'] = accuracy_score(Y_test, Y_pred_lr)
model_accuracies['NB'] = accuracy_score(Y_test, Y_pred_nb)
model_accuracies['RF'] = accuracy_score(Y_test, Y_pred_rf)
model_accuracies

{'DT': 0.72100313479623823,
 'KNN': 0.91849529780564265,
 'KernelSVC': 0.94984326018808773,
 'LinearSVC': 0.95924764890282133,
 'LogReg': 0.89655172413793105,
 'NB': 0.79937304075235105,
 'RF': 0.87774294670846398}