In [14]:
import sklearn
from sklearn.utils import shuffle
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model,preprocessing
import pandas as pd
import numpy as np

In [15]:
# the class label values : unacceptable - (unacc) , acceptable - (acc), good , very good - (vgood)
data = pd.read_csv('car.data')


In [16]:
# pandas read the first row as column labels, here buying, maint(maintenance), doors, persons..etc are features  

data.head()


Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


### Removing features with non numerical data, data preprocessing 

In [9]:
num_labels = preprocessing.LabelEncoder()

In [24]:
# this will convert all the categorical features of the buying column into suitable numerical list values, do this for all column labels
# (buying,maint,doors,persons,lug_boot are all features i.e X)
# the fit_transform methods only works when the input is in list, so we need to convert the column labels into list 

buying = num_labels.fit_transform(list(data["buying"]))
maint = num_labels.fit_transform(list(data["maint"]))
doors = num_labels.fit_transform(list(data["doors"]))
persons = num_labels.fit_transform(list(data["persons"]))
lug_boot = num_labels.fit_transform(list(data["lug_boot"]))
safety = num_labels.fit_transform(list(data["safety"]))
# now the class variable (cls) which converts categorical values like unacceptable, acceptable, good and very good into numerical values
# this is a numpy array cls
cls = num_labels.fit_transform(list(data["class"]))



In [25]:
# here the columns are converted into a list with suitable numerical values using sklearn preprocessing technique
buying,maint

(array([3, 3, 3, ..., 1, 1, 1], dtype=int64),
 array([3, 3, 3, ..., 1, 1, 1], dtype=int64))

In [27]:
# Creating X & Y list ( X is the feature and Y is the class label )
# zippping all the features like buying,persons,lug_boot..etc
# zip creates a tuple object with given input lists

X = list(zip(buying,maint,doors,persons,lug_boot,safety))

# since Y is the class label i.e cls, we will just put the converted values of cls into a list

Y = list(cls)


In [28]:
# test size is the size of test data
x_train,x_test,y_train,y_test = sklearn.model_selection.train_test_split(X,Y, test_size = 0.1)

In [30]:
# x_train,y_test

In [57]:
# Creating the KNN classifier, which takes in amount of neighbours/How many neighbours we want as a parameter 
# n_neighbors is the number of nearest neighbors, we can adjust this to get better results 

clf = KNeighborsClassifier(n_neighbors=9)

In [58]:
clf.fit(x_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=9, p=2,
           weights='uniform')

In [59]:
# getting the accuracy values 

acc = clf.score(x_test,y_test)

In [60]:
acc

0.930635838150289

### Observations !!!


### with n_neighbors = 3 we got accuracy of 0.83456
### with n_neighbors = 5 we got accuracy of 0.87861
### with n_neighbors = 7 we got accuracy of 0.92485

### with n_neighbors = 9 we got the higest accuracy value of 0.930635 

### Actual class results vs the predicted results

In [62]:
# looping through the test data and observing the real values and the predicted ones 

In [63]:
predicted = clf.predict(x_test)

In [88]:
# This is done in order to labels the converted numerical feature values into suitable names for better interpretation 
ratings = ["unacceptable","acceptable","good","very good"]

### Comparing the predicted ratings vs the actual ratings

In [89]:
for i in range(len(predicted)):
    print(f'The predicted rating: {ratings[predicted[i]]}, actual rating: {ratings[y_test[i]]}, Data : {x_test[i]}')
    # here Actual Ratings [0,1,2,3] --> 0 = unacc , 1 = acc, 2 = good, 3 = very Good

The predicted rating: unacceptable, actual rating: unacceptable, Data : (3, 2, 0, 2, 0, 0)
The predicted rating: unacceptable, actual rating: unacceptable, Data : (2, 1, 1, 2, 2, 2)
The predicted rating: good, actual rating: good, Data : (1, 0, 1, 0, 1, 0)
The predicted rating: unacceptable, actual rating: unacceptable, Data : (0, 1, 1, 2, 1, 0)
The predicted rating: very good, actual rating: very good, Data : (1, 2, 3, 1, 1, 0)
The predicted rating: good, actual rating: good, Data : (1, 1, 2, 0, 2, 2)
The predicted rating: unacceptable, actual rating: unacceptable, Data : (2, 2, 0, 2, 1, 0)
The predicted rating: good, actual rating: good, Data : (1, 2, 2, 0, 2, 0)
The predicted rating: unacceptable, actual rating: unacceptable, Data : (3, 1, 2, 1, 2, 0)
The predicted rating: unacceptable, actual rating: unacceptable, Data : (0, 2, 2, 1, 1, 0)
The predicted rating: unacceptable, actual rating: unacceptable, Data : (0, 1, 1, 1, 1, 0)
The predicted rating: good, actual rating: good, Data