## Initialization

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [3]:
model_accuracies = {'LogReg':1, 'DT':1, 'RF':1, 'LinearSVC':1, 'KernelSVC':1, 'NB':1, 'KNN':1}

## Importing the data

In [4]:
dataset = pd.read_csv('Dataset.txt', header = None, delimiter = r'\s+')
dataset.shape

(846, 19)

In [5]:
dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,95,48,83,178,72,10,162,42,20,159,176,379,184,70,6,16,187,197,van
1,91,41,84,141,57,9,149,45,19,143,170,330,158,72,9,14,189,199,van
2,104,50,106,209,66,10,207,32,23,158,223,635,220,73,14,9,188,196,saab
3,93,41,82,159,63,9,144,46,19,143,160,309,127,63,6,10,199,207,van
4,85,44,70,205,103,52,149,45,19,144,241,325,188,127,9,11,180,183,bus


## Create X and Y

In [6]:
X = dataset.iloc[:, 0:18].values
Y = dataset.iloc[:, 18].values

In [7]:
X.shape

(846, 18)

In [8]:
Y.shape

(846,)

In [9]:
X

array([[ 95,  48,  83, ...,  16, 187, 197],
       [ 91,  41,  84, ...,  14, 189, 199],
       [104,  50, 106, ...,   9, 188, 196],
       ..., 
       [106,  54, 101, ...,   4, 187, 201],
       [ 86,  36,  78, ...,  25, 190, 195],
       [ 85,  36,  66, ...,  18, 186, 190]], dtype=int64)

In [10]:
Y

array(['van', 'van', 'saab', 'van', 'bus', 'bus', 'bus', 'van', 'van',
       'saab', 'van', 'saab', 'bus', 'van', 'bus', 'opel', 'van', 'bus',
       'saab', 'opel', 'bus', 'van', 'bus', 'bus', 'saab', 'van', 'saab',
       'saab', 'bus', 'saab', 'van', 'saab', 'opel', 'opel', 'opel', 'van',
       'bus', 'van', 'saab', 'bus', 'opel', 'van', 'van', 'saab', 'saab',
       'van', 'van', 'bus', 'van', 'saab', 'saab', 'saab', 'opel', 'bus',
       'bus', 'van', 'saab', 'van', 'opel', 'van', 'opel', 'opel', 'van',
       'bus', 'bus', 'opel', 'bus', 'opel', 'van', 'bus', 'opel', 'opel',
       'opel', 'opel', 'van', 'opel', 'saab', 'saab', 'bus', 'bus', 'bus',
       'bus', 'van', 'opel', 'bus', 'bus', 'van', 'van', 'bus', 'opel',
       'saab', 'opel', 'saab', 'van', 'bus', 'opel', 'saab', 'bus', 'opel',
       'bus', 'bus', 'van', 'van', 'van', 'bus', 'saab', 'opel', 'opel',
       'bus', 'bus', 'van', 'van', 'opel', 'opel', 'van', 'van', 'opel',
       'saab', 'bus', 'bus', 'saab', 'van

## Preprocess the Data

In [11]:
le_Y = LabelEncoder()

In [12]:
Y = le_Y.fit_transform(Y)

In [13]:
Y

array([3, 3, 2, 3, 0, 0, 0, 3, 3, 2, 3, 2, 0, 3, 0, 1, 3, 0, 2, 1, 0, 3, 0,
       0, 2, 3, 2, 2, 0, 2, 3, 2, 1, 1, 1, 3, 0, 3, 2, 0, 1, 3, 3, 2, 2, 3,
       3, 0, 3, 2, 2, 2, 1, 0, 0, 3, 2, 3, 1, 3, 1, 1, 3, 0, 0, 1, 0, 1, 3,
       0, 1, 1, 1, 1, 3, 1, 2, 2, 0, 0, 0, 0, 3, 1, 0, 0, 3, 3, 0, 1, 2, 1,
       2, 3, 0, 1, 2, 0, 1, 0, 0, 3, 3, 3, 0, 2, 1, 1, 0, 0, 3, 3, 1, 1, 3,
       3, 1, 2, 0, 0, 2, 3, 3, 2, 3, 3, 0, 0, 3, 0, 2, 2, 2, 3, 1, 3, 3, 3,
       2, 3, 2, 0, 1, 0, 1, 1, 3, 0, 2, 3, 2, 0, 1, 2, 3, 0, 1, 3, 2, 1, 1,
       1, 2, 2, 1, 1, 2, 2, 0, 3, 1, 0, 3, 0, 3, 0, 1, 0, 0, 3, 2, 1, 0, 2,
       2, 0, 0, 1, 1, 1, 1, 3, 2, 0, 2, 0, 2, 0, 0, 0, 3, 2, 1, 1, 3, 3, 1,
       0, 0, 1, 1, 3, 0, 0, 1, 1, 2, 1, 0, 3, 1, 3, 0, 0, 2, 3, 2, 3, 2, 2,
       3, 2, 0, 2, 3, 3, 0, 0, 1, 0, 1, 1, 0, 2, 3, 3, 0, 2, 1, 2, 3, 1, 3,
       0, 3, 2, 2, 1, 2, 1, 2, 2, 3, 3, 2, 0, 0, 2, 3, 3, 0, 2, 3, 3, 0, 0,
       0, 1, 2, 0, 1, 0, 0, 2, 3, 2, 0, 3, 1, 2, 3, 3, 0, 3, 0, 0, 0, 2, 2,
       0, 2,

In [14]:
sc_X = StandardScaler()

In [15]:
X = sc_X.fit_transform(X)



In [16]:
X

array([[ 0.16058035,  0.5089502 ,  0.05781852, ...,  0.38099086,
        -0.31372213,  0.18395733],
       [-0.32546965, -0.62626753,  0.1212614 , ...,  0.15692533,
         0.0109371 ,  0.45297703],
       [ 1.25419283,  0.83329812,  1.51700488, ..., -0.4032385 ,
        -0.15139252,  0.04944748],
       ..., 
       [ 1.49721783,  1.48199396,  1.19979045, ..., -0.96340232,
        -0.31372213,  0.72199673],
       [-0.93303214, -1.43713733, -0.25939591, ...,  1.38928574,
         0.17326672, -0.08506238],
       [-1.05454464, -1.43713733, -1.02071053, ...,  0.60505639,
        -0.47605175, -0.75761164]])

## Create Train and Test Data

In [17]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 4)

In [18]:
X_train.shape

(676, 18)

In [19]:
X_test.shape

(170, 18)

In [20]:
Y_train.shape

(676,)

In [21]:
Y_test.shape

(170,)

In [22]:
pd.DataFrame(pd.DataFrame(Y_train)[0].value_counts())

Unnamed: 0,0
2,178
0,176
1,163
3,159


In [23]:
pd.DataFrame(pd.DataFrame(Y_test)[0].value_counts())

Unnamed: 0,0
1,49
0,42
3,40
2,39


## DecisionTree

In [24]:
clf_dt = DecisionTreeClassifier(criterion = 'entropy')

In [25]:
clf_dt.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [26]:
Y_pred_dt = clf_dt.predict(X_test)

In [27]:
confusion_matrix(Y_pred_dt, Y_test)

array([[40,  0,  1,  2],
       [ 0, 22, 19,  0],
       [ 1, 26, 17,  1],
       [ 1,  1,  2, 37]], dtype=int64)

## Random Forest

In [28]:
clf_rf = RandomForestClassifier(n_estimators = 10, criterion = 'entropy')

In [29]:
clf_rf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [30]:
Y_pred_rf = clf_rf.predict(X_test)

In [31]:
confusion_matrix(Y_pred_rf, Y_test)

array([[42,  1,  2,  2],
       [ 0, 25, 16,  0],
       [ 0, 21, 20,  0],
       [ 0,  2,  1, 38]], dtype=int64)

## Naive Bayes

In [32]:
clf_nb = GaussianNB()

In [33]:
clf_nb.fit(X_train, Y_train)

GaussianNB(priors=None)

In [34]:
Y_pred_nb = clf_nb.predict(X_test)

In [35]:
confusion_matrix(Y_pred_nb, Y_test)

array([[ 9,  0,  0,  3],
       [ 6, 20,  8,  0],
       [ 3, 14, 17,  2],
       [24, 15, 14, 35]], dtype=int64)

## KNN

In [36]:
clf_knn = KNeighborsClassifier(n_neighbors = 5)

In [37]:
clf_knn.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [38]:
Y_pred_knn = clf_knn.predict(X_test)

In [39]:
confusion_matrix(Y_pred_knn, Y_test)

array([[41,  2,  2,  5],
       [ 0, 22, 15,  0],
       [ 0, 20, 22,  1],
       [ 1,  5,  0, 34]], dtype=int64)

## Logistic Regression

In [40]:
clf_lr = LogisticRegression()

In [41]:
clf_lr.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [42]:
Y_pred_lr = clf_lr.predict(X_test)

In [43]:
confusion_matrix(Y_pred_lr, Y_test)

array([[39,  2,  4,  0],
       [ 1, 27, 10,  0],
       [ 2, 18, 24,  0],
       [ 0,  2,  1, 40]], dtype=int64)

## Linear SVC

In [44]:
clf_lsvc = SVC(kernel = 'linear')

In [45]:
clf_lsvc.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [46]:
Y_pred_lsvc = clf_lsvc.predict(X_test)

In [47]:
confusion_matrix(Y_pred_lsvc, Y_test)

array([[39,  1,  4,  0],
       [ 0, 30,  9,  0],
       [ 2, 18, 26,  0],
       [ 1,  0,  0, 40]], dtype=int64)

## Kernel SVC

In [48]:
clf_ksvc = SVC(kernel = 'rbf')

In [49]:
clf_ksvc.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [50]:
Y_pred_ksvc = clf_ksvc.predict(X_test)

In [51]:
confusion_matrix(Y_pred_ksvc, Y_test)

array([[41,  1,  1,  1],
       [ 0, 22, 14,  0],
       [ 0, 22, 23,  0],
       [ 1,  4,  1, 39]], dtype=int64)

## Accuracy of Various Models

In [52]:
model_accuracies['DT'] = accuracy_score(Y_pred_dt, Y_test)
model_accuracies['KNN'] = accuracy_score(Y_pred_knn, Y_test)
model_accuracies['KernelSVC'] = accuracy_score(Y_pred_ksvc, Y_test)
model_accuracies['LinearSVC'] = accuracy_score(Y_pred_lsvc, Y_test)
model_accuracies['LogReg'] = accuracy_score(Y_pred_lr, Y_test)
model_accuracies['NB'] = accuracy_score(Y_pred_nb, Y_test)
model_accuracies['RF'] = accuracy_score(Y_pred_rf, Y_test)
model_accuracies

{'DT': 0.68235294117647061,
 'KNN': 0.69999999999999996,
 'KernelSVC': 0.73529411764705888,
 'LinearSVC': 0.79411764705882348,
 'LogReg': 0.76470588235294112,
 'NB': 0.47647058823529409,
 'RF': 0.73529411764705888}