## Initialization

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [4]:
model_accuracies = {'KNN':1, 'LogReg':1, 'DT':1, 'RF':1, 'NB':1, 'LinearSVC':1, 'KernelSVC':1}

## Importing the Data

In [5]:
df = pd.read_csv('Car.data', header = None)
df.shape

(1728, 7)

In [6]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


## Create X and Y

In [7]:
X = df.iloc[:, 0:6].values
Y = df.iloc[:, 6].values

In [8]:
X.shape

(1728, 6)

In [9]:
Y.shape

(1728,)

In [10]:
X[0]

array(['vhigh', 'vhigh', '2', '2', 'small', 'low'], dtype=object)

In [11]:
Y[0]

'unacc'

## Preprocess the data

In [12]:
le_Y = LabelEncoder()
Y = le_Y.fit_transform(Y)

def encoder(index):
    le = LabelEncoder()
    X[:, index] = le.fit_transform(X[:, index])

In [13]:
for i in range(0, 6):
    encoder(i)

In [14]:
X[0]

array([3, 3, 0, 0, 2, 1], dtype=object)

In [15]:
Y[0]

2

In [16]:
ohe_X = OneHotEncoder(categorical_features = [5])
X = ohe_X.fit_transform(X).toarray()
X = X[:, 1:]

In [17]:
ohe_X = OneHotEncoder(categorical_features = [6])
X = ohe_X.fit_transform(X).toarray()
X = X[:, 1:]

In [18]:
ohe_X = OneHotEncoder(categorical_features = [7])
X = ohe_X.fit_transform(X).toarray()
X = X[:, 1:]

In [19]:
ohe_X = OneHotEncoder(categorical_features = [8])
X = ohe_X.fit_transform(X).toarray()
X = X[:, 1:]

In [20]:
ohe_X = OneHotEncoder(categorical_features = [10])
X = ohe_X.fit_transform(X).toarray()
X = X[:, 1:]

In [21]:
ohe_X = OneHotEncoder(categorical_features = [12])
X = ohe_X.fit_transform(X).toarray()
X = X[:, 1:]

In [22]:
X.shape

(1728, 15)

## Create Train and Test data

In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 4)

In [24]:
X_train.shape

(1382, 15)

In [25]:
X_test.shape

(346, 15)

In [26]:
Y_train.shape

(1382,)

In [27]:
Y_test.shape

(346,)

In [28]:
pd.DataFrame(pd.DataFrame(Y_train)[0].value_counts())

Unnamed: 0,0
2,969
0,301
3,56
1,56


In [29]:
pd.DataFrame(pd.DataFrame(Y_test)[0].value_counts())

Unnamed: 0,0
2,241
0,83
1,13
3,9


## Decision Tree Classifier

In [30]:
clf_dt = DecisionTreeClassifier(criterion = 'entropy')

In [31]:
clf_dt.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [32]:
Y_pred_dt = clf_dt.predict(X_test)

In [33]:
confusion_matrix(Y_test, Y_pred_dt)

array([[ 67,   1,  14,   1],
       [  3,   7,   2,   1],
       [  5,   1, 235,   0],
       [  1,   0,   0,   8]], dtype=int64)

## Random Forest Classifier

In [34]:
clf_rf = RandomForestClassifier(n_estimators = 10, criterion = 'entropy')

In [35]:
clf_rf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [36]:
Y_pred_rf = clf_rf.predict(X_test)

In [37]:
confusion_matrix(Y_test, Y_pred_rf)

array([[ 62,   1,  19,   1],
       [  6,   4,   2,   1],
       [ 11,   2, 228,   0],
       [  4,   1,   0,   4]], dtype=int64)

## Naive Bayes Classifier

In [38]:
clf_nb = GaussianNB()

In [39]:
clf_nb.fit(X_train, Y_train)

GaussianNB(priors=None)

In [40]:
Y_pred_nb = clf_nb.predict(X_test)

In [41]:
confusion_matrix(Y_test, Y_pred_nb)

array([[ 35,  29,   0,  19],
       [  0,  10,   0,   3],
       [ 71,  41, 117,  12],
       [  0,   0,   0,   9]], dtype=int64)

## KNN Classifier

In [42]:
clf_knn = KNeighborsClassifier(n_neighbors = 5)

In [43]:
clf_knn.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [44]:
Y_pred_knn = clf_knn.predict(X_test)

In [45]:
confusion_matrix(Y_test, Y_pred_knn)

array([[ 60,   3,  20,   0],
       [  9,   2,   2,   0],
       [  8,   1, 232,   0],
       [  2,   2,   5,   0]], dtype=int64)

## Logistic Regression

In [46]:
clf_lr = LogisticRegression()

In [47]:
clf_lr.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [48]:
Y_pred_lr = clf_lr.predict(X_test)

In [49]:
confusion_matrix(Y_test, Y_pred_lr)

array([[ 66,   2,  15,   0],
       [  9,   3,   0,   1],
       [  8,   0, 233,   0],
       [  7,   0,   0,   2]], dtype=int64)

## SVC Linear

In [50]:
clf_lsvc = SVC(kernel = "linear")

In [51]:
clf_lsvc.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [52]:
Y_pred_lsvc = clf_lsvc.predict(X_test)

In [53]:
confusion_matrix(Y_test, Y_pred_lsvc)

array([[ 74,   3,   5,   1],
       [  4,   6,   0,   3],
       [  7,   1, 233,   0],
       [  0,   0,   0,   9]], dtype=int64)

## SVC Kernel

In [54]:
clf_ksvc = SVC(kernel = "rbf")

In [55]:
clf_ksvc.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [56]:
Y_pred_ksvc = clf_ksvc.predict(X_test)

In [57]:
confusion_matrix(Y_test, Y_pred_ksvc)

array([[ 81,   0,   2,   0],
       [ 11,   0,   0,   2],
       [ 12,   0, 229,   0],
       [  6,   0,   0,   3]], dtype=int64)

## Accuracy of Various Classifiers

In [58]:
model_accuracies['DT'] = accuracy_score(Y_test, Y_pred_dt)
model_accuracies['KNN'] = accuracy_score(Y_test, Y_pred_knn)
model_accuracies['KernelSVC'] = accuracy_score(Y_test, Y_pred_ksvc)
model_accuracies['LinearSVC'] = accuracy_score(Y_test, Y_pred_lsvc)
model_accuracies['LogReg'] = accuracy_score(Y_test, Y_pred_lr)
model_accuracies['NB'] = accuracy_score(Y_test, Y_pred_nb)
model_accuracies['RF'] = accuracy_score(Y_test, Y_pred_rf)
model_accuracies

{'DT': 0.91618497109826591,
 'KNN': 0.8497109826589595,
 'KernelSVC': 0.90462427745664742,
 'LinearSVC': 0.93063583815028905,
 'LogReg': 0.87861271676300579,
 'NB': 0.49421965317919075,
 'RF': 0.86127167630057799}