# KNN Classification

In [1]:
# Using car evaluation data from UCI
# https://archive.ics.uci.edu/ml/datasets/Car+Evaluation

In [2]:
# adding feature names to the data file as pandas use the first line as the name of the attributes/features in the dataset
# feature names are: "buying,maint,door,persons,lug_boot,safety,class"
# more info about the features is given on the link from where the dataset is taken

In [3]:
# importing the essential files from sklearn
import sklearn
from sklearn.utils import shuffle
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np
from sklearn import linear_model,preprocessing

In [4]:
# importing data from car.data file using pandas read csv function as the .data file is also comma separated
cardata=pd.read_csv("C:\\Users\\SID\\pythonpractice\\MLcourse\\data\\car.data")

In [5]:
# viewing the top 5 data lines
cardata.head()

Unnamed: 0,buying,maint,door,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [6]:
cardata["buying"]

0       vhigh
1       vhigh
2       vhigh
3       vhigh
4       vhigh
        ...  
1723      low
1724      low
1725      low
1726      low
1727      low
Name: buying, Length: 1728, dtype: object

In [7]:
# converting non numeric data to numeric data and changing vhigh,small etc to some relevant numeric values
# label encoding using sklearn's built in preprocessing mechanism called labelEncoder object and saving it to a new variable "le" same way as we save pandas as pd
le=preprocessing.LabelEncoder()

In [8]:
le

LabelEncoder()

In [9]:
# fit_transform() method takes list of column and return an array from the original pandas dataframe
tr_buying=le.fit_transform(list(cardata["buying"])) # transforming buying column into a list then transform them using fit_tranform method

In [10]:
# data is segregated into the categories that exist in the cardata, vhigh is replaced by  3, low is replaced by 1 etc i.e. some integer value for all the classes
tr_buying

array([3, 3, 3, ..., 1, 1, 1], dtype=int64)

In [11]:
cardata["buying"]

0       vhigh
1       vhigh
2       vhigh
3       vhigh
4       vhigh
        ...  
1723      low
1724      low
1725      low
1726      low
1727      low
Name: buying, Length: 1728, dtype: object

In [12]:
# transforming other columns into relevant numeric data
tr_maint = le.fit_transform(list(cardata["maint"]))
tr_door = le.fit_transform(list(cardata["door"]))
tr_persons = le.fit_transform(list(cardata["persons"]))
tr_lug_boot = le.fit_transform(list(cardata["lug_boot"]))
tr_safety = le.fit_transform(list(cardata["safety"]))
tr_cls = le.fit_transform(list(cardata["class"]))

In [13]:
# combining the features together using python's built in function zip which joins list into tuple which further has to be transformed to list or dict (possible if only 2 element are zipped)
X=list(zip(tr_buying,tr_maint,tr_door,tr_persons,tr_lug_boot,tr_safety)) # features i.e. the attributes used to predict the labels
y=list(tr_cls) # labels i.e. the values which are to be predicted/classified

In [14]:
# dividing the data into training and testing data
x_train,x_test,y_train,y_test=sklearn.model_selection.train_test_split(X,y,test_size=0.1)

In [15]:
len(x_train)

1555

In [16]:
len(x_test)

173

In [35]:
model=KNeighborsClassifier(n_neighbors=7) # using generic name "model" as it is of not much use to pickle this model

In [36]:
model.fit(x_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                     weights='uniform')

In [37]:
# accuracy score changes as per the K hyperparamters for 5 it is close to 90 but for 3 it is around 80 and for 7 it is around 93 although it depends on the train test split
acc=model.score(x_test,y_test)

In [38]:
acc

0.9364161849710982

In [39]:
# making predictions/classification the test data with KNNClassifier with 7 closest neighbor which we finalize after checking for a few times could be better but that depends on the train test split
predicted = model.predict(x_test)

In [40]:
predicted

array([2, 2, 2, 0, 2, 2, 2, 0, 2, 0, 2, 0, 2, 0, 2, 2, 2, 0, 3, 0, 2, 2,
       2, 2, 2, 2, 0, 2, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2,
       2, 0, 1, 0, 2, 2, 0, 2, 2, 2, 3, 2, 2, 2, 0, 2, 2, 2, 0, 0, 2, 0,
       0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2,
       2, 0, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2,
       0, 2, 2, 0, 2, 0, 2, 2, 0, 1, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2,
       2, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 0, 0, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2],
      dtype=int64)

In [41]:
# to  use actual labels instead of numbers 
names=["unacc","acc","good","vgood"]

In [42]:
# to view predicted values, the data and actual values on test data in transformed form
for i in range(len(predicted)):
    print(f"Predicted: {predicted[i]} Data: {x_test[i]} Actual: {y_test[i]}")

Predicted: 2 Data: (0, 0, 2, 1, 2, 2) Actual: 2
Predicted: 2 Data: (2, 1, 2, 0, 0, 1) Actual: 2
Predicted: 2 Data: (0, 2, 1, 0, 1, 1) Actual: 2
Predicted: 0 Data: (2, 0, 1, 2, 0, 0) Actual: 0
Predicted: 2 Data: (0, 2, 3, 0, 2, 1) Actual: 2
Predicted: 2 Data: (3, 2, 3, 2, 1, 1) Actual: 2
Predicted: 2 Data: (3, 0, 3, 2, 1, 0) Actual: 2
Predicted: 0 Data: (1, 0, 1, 1, 1, 2) Actual: 0
Predicted: 2 Data: (3, 1, 0, 1, 2, 2) Actual: 2
Predicted: 0 Data: (0, 1, 1, 1, 0, 0) Actual: 0
Predicted: 2 Data: (3, 2, 0, 1, 2, 1) Actual: 2
Predicted: 0 Data: (2, 2, 2, 2, 2, 0) Actual: 0
Predicted: 2 Data: (3, 1, 0, 0, 0, 2) Actual: 2
Predicted: 0 Data: (3, 1, 1, 2, 0, 2) Actual: 0
Predicted: 2 Data: (0, 2, 3, 2, 2, 1) Actual: 2
Predicted: 2 Data: (1, 3, 2, 0, 1, 2) Actual: 2
Predicted: 2 Data: (0, 1, 3, 1, 1, 1) Actual: 2
Predicted: 0 Data: (2, 2, 0, 2, 0, 0) Actual: 3
Predicted: 3 Data: (1, 1, 2, 2, 0, 0) Actual: 3
Predicted: 0 Data: (1, 3, 1, 2, 0, 0) Actual: 0
Predicted: 2 Data: (0, 0, 3, 2, 0, 1) Ac

In [43]:
# using the names
for i in range(len(predicted)):
    print(f"Predicted: {names[predicted[i]]}       Actual: {names[y_test[i]]}")

Predicted: good       Actual: good
Predicted: good       Actual: good
Predicted: good       Actual: good
Predicted: unacc       Actual: unacc
Predicted: good       Actual: good
Predicted: good       Actual: good
Predicted: good       Actual: good
Predicted: unacc       Actual: unacc
Predicted: good       Actual: good
Predicted: unacc       Actual: unacc
Predicted: good       Actual: good
Predicted: unacc       Actual: unacc
Predicted: good       Actual: good
Predicted: unacc       Actual: unacc
Predicted: good       Actual: good
Predicted: good       Actual: good
Predicted: good       Actual: good
Predicted: unacc       Actual: vgood
Predicted: vgood       Actual: vgood
Predicted: unacc       Actual: unacc
Predicted: good       Actual: good
Predicted: good       Actual: good
Predicted: good       Actual: good
Predicted: good       Actual: good
Predicted: good       Actual: good
Predicted: good       Actual: good
Predicted: unacc       Actual: acc
Predicted: good       Actual: good
Pred

In [44]:
# printing the neighbors with distances so see how it predicted those values
#  ie if in test set the value to be predicted is actually "good" then the neighbors around it should be "good" within some distance to view those neighbor and distances we can use sklearn's inbuilt

In [45]:
for i in range(len(predicted)):
    print(f"Predicted: {names[predicted[i]]}       Actual: {names[y_test[i]]}")
    n=model.kneighbors([x_test[i]],5,True)
    print(f"n {n}")

Predicted: good       Actual: good
n (array([[1., 1., 1., 1., 1.]]), array([[  46,  228,  713, 1465,  111]], dtype=int64))
Predicted: good       Actual: good
n (array([[1., 1., 1., 1., 1.]]), array([[1094, 1543,   35,  250, 1130]], dtype=int64))
Predicted: good       Actual: good
n (array([[1., 1., 1., 1., 1.]]), array([[ 289,  351,   22, 1547,  558]], dtype=int64))
Predicted: unacc       Actual: unacc
n (array([[1., 1., 1., 1., 1.]]), array([[ 531, 1185,  450,  536,  624]], dtype=int64))
Predicted: good       Actual: good
n (array([[1., 1., 1., 1., 1.]]), array([[ 85, 354, 520, 900, 663]], dtype=int64))
Predicted: good       Actual: good
n (array([[1., 1., 1., 1., 1.]]), array([[1308,  901,  135,  241,  125]], dtype=int64))
Predicted: good       Actual: good
n (array([[1., 1., 1., 1., 1.]]), array([[ 304,  424, 1311,  542,  473]], dtype=int64))
Predicted: unacc       Actual: unacc
n (array([[1., 1., 1., 1., 1.]]), array([[ 329, 1162,  395,  939, 1193]], dtype=int64))
Predicted: good  

n (array([[1., 1., 1., 1., 1.]]), array([[ 538,   72, 1258,  618,  552]], dtype=int64))
Predicted: good       Actual: good
n (array([[1., 1., 1., 1., 1.]]), array([[504, 650, 222, 627, 550]], dtype=int64))
Predicted: good       Actual: good
n (array([[1., 1., 1., 1., 1.]]), array([[ 515,  506,  529, 1391,  109]], dtype=int64))
Predicted: unacc       Actual: unacc
n (array([[1., 1., 1., 1., 1.]]), array([[  40, 1259,   99,   32,  836]], dtype=int64))
Predicted: good       Actual: good
n (array([[1., 1., 1., 1., 1.]]), array([[ 566, 1477,  550, 1548,  441]], dtype=int64))
Predicted: unacc       Actual: unacc
n (array([[1., 1., 1., 1., 1.]]), array([[1030,  805,  942,  316, 1092]], dtype=int64))
Predicted: good       Actual: good
n (array([[1., 1., 1., 1., 1.]]), array([[492, 388, 444, 413, 756]], dtype=int64))
Predicted: good       Actual: good
n (array([[1., 1., 1., 1., 1.]]), array([[1227,  224, 1338, 1357,   52]], dtype=int64))
Predicted: unacc       Actual: unacc
n (array([[1., 1., 1

In [46]:
# we can create a map but that is not of much use so skipping it 
# sklearn documentation can help with it