In [1]:
import numpy as np
from collections import Counter
import pandas as pd
from sklearn.metrics import accuracy_score

In [2]:
col_names=['v_high','v_high.1','2','2.1','small','low','unacc']
data = pd.read_csv("car_evaluation.csv",skiprows=1,header=None,names=col_names)
data.head(10)

Unnamed: 0,v_high,v_high.1,2,2.1,small,low,unacc
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc
5,vhigh,vhigh,2,2,big,low,unacc
6,vhigh,vhigh,2,2,big,med,unacc
7,vhigh,vhigh,2,2,big,high,unacc
8,vhigh,vhigh,2,4,small,low,unacc
9,vhigh,vhigh,2,4,small,med,unacc


In [3]:
from sklearn.preprocessing import LabelEncoder
le_v_high = LabelEncoder()
le_v_high_1 = LabelEncoder()
le_small = LabelEncoder()
le_low = LabelEncoder()
le_unacc = LabelEncoder()
le_2 = LabelEncoder()
le_21 = LabelEncoder()

In [4]:
inputs = data.drop('unacc',axis='columns')
target = data['unacc']

In [5]:
inputs['le_v_high'] = le_v_high.fit_transform(inputs['v_high'])
inputs['le_v_high_1'] = le_v_high_1.fit_transform(inputs['v_high.1'])
inputs['le_small'] = le_small.fit_transform(inputs['small'])
inputs['le_low'] = le_low.fit_transform(inputs['low'])
inputs['le_2'] = le_2.fit_transform(inputs['2'])
inputs['le_21'] = le_21.fit_transform(inputs['2.1'])
target = le_unacc.fit_transform(target)

In [6]:
inputs

Unnamed: 0,v_high,v_high.1,2,2.1,small,low,le_v_high,le_v_high_1,le_small,le_low,le_2,le_21
0,vhigh,vhigh,2,2,small,med,3,3,2,2,0,0
1,vhigh,vhigh,2,2,small,high,3,3,2,0,0,0
2,vhigh,vhigh,2,2,med,low,3,3,1,1,0,0
3,vhigh,vhigh,2,2,med,med,3,3,1,2,0,0
4,vhigh,vhigh,2,2,med,high,3,3,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1722,low,low,5more,more,med,med,1,1,1,2,3,2
1723,low,low,5more,more,med,high,1,1,1,0,3,2
1724,low,low,5more,more,big,low,1,1,0,1,3,2
1725,low,low,5more,more,big,med,1,1,0,2,3,2


In [7]:
inputs_n = inputs.drop(['v_high','v_high.1','small','low','2','2.1'],axis='columns')

In [8]:
inputs_n = inputs_n.to_numpy()
# target = target.to_numpy()

In [9]:
inputs_n

array([[3, 3, 2, 2, 0, 0],
       [3, 3, 2, 0, 0, 0],
       [3, 3, 1, 1, 0, 0],
       ...,
       [1, 1, 0, 1, 3, 2],
       [1, 1, 0, 2, 3, 2],
       [1, 1, 0, 0, 3, 2]])

In [10]:
def euclidean_distance(x1, x2):
    distance = np.sqrt(np.sum((x1-x2)**2))
    return distance

In [11]:
class KNN:
    def __init__(self,k=3):
        self.k = k
    def fit(self,X,y):
        self.X_train = X
        self.y_train = y
    def predict(self,X):
        predictions = [self._predict(x) for x in X]
        return predictions
    def _predict(self,x):
        #compute the distance
        distances =[euclidean_distance(x,x_train) for x_train in self.X_train]

        #get the closest k
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels=[self.y_train[i] for i in k_indices]

        #majority vote
        most_common = Counter(k_nearest_labels).most_common()
        return most_common[0][0]
    

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(inputs_n, target, test_size=.1, random_state=0)

In [13]:
X_train

array([[0, 3, 0, 0, 3, 1],
       [2, 2, 1, 0, 2, 1],
       [3, 0, 0, 1, 0, 0],
       ...,
       [1, 1, 0, 2, 1, 0],
       [0, 0, 2, 0, 0, 2],
       [0, 2, 2, 2, 1, 1]])

In [14]:
classifier=KNN(k=10)
classifier.fit(X_train,Y_train)
Y_pred = classifier.predict(X_test)
acc=accuracy_score(Y_test, Y_pred)*100

In [15]:
acc

92.48554913294798

In [16]:
from sklearn.model_selection import KFold
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=0)
accuracy_scores = []
classifiers=[]
for train_index, val_index in kf.split(inputs_n):
    X_train, X_val = inputs_n[train_index], inputs_n[val_index]
    Y_train, Y_val = target[train_index], target[val_index]
    classifier = KNN(k=5)
    classifier.fit(X_train, Y_train)
    Y_pred = classifier.predict(X_val)
    acc = accuracy_score(Y_val, Y_pred) * 100
    accuracy_scores.append(acc)
    classifiers.append(classifier)

classifier=classifiers[np.argmax(accuracy_scores)]

In [17]:
accuracy_scores

[92.1965317919075,
 89.30635838150289,
 91.59420289855072,
 90.43478260869566,
 93.33333333333333]

In [18]:
y__pred=classifier.predict(X_test)
acc=accuracy_score(Y_test, y__pred)*100

In [19]:
acc

97.10982658959537