In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import bs4 as bs
import requests

import keras as ks
import cv2 as cv
import sklearn as sk
from sklearn import neighbors,metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import re
import os
import datetime
import random
import time
import missingno as msno

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.2f' % x)
sns.set(style='whitegrid', color_codes=True, font_scale=1.5, palette='Blues', font='Arial', rc={'figure.figsize':(8,6)})

In [2]:
df = pd.read_csv("car.data")

In [3]:
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [5]:
df.columns

Index(['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class'], dtype='object')

In [6]:
df.describe().T

Unnamed: 0,count,unique,top,freq
buying,1728,4,vhigh,432
maint,1728,4,vhigh,432
doors,1728,4,2,432
persons,1728,3,2,576
lug_boot,1728,3,small,576
safety,1728,3,low,576
class,1728,4,unacc,1210


In [7]:
df.shape

(1728, 7)

In [8]:
# Labels
X = df[['buying','maint','safety']].values
y= df['class']

In [9]:
X

array([['vhigh', 'vhigh', 'low'],
       ['vhigh', 'vhigh', 'med'],
       ['vhigh', 'vhigh', 'high'],
       ...,
       ['low', 'low', 'low'],
       ['low', 'low', 'med'],
       ['low', 'low', 'high']], dtype=object)

In [10]:
y

0       unacc
1       unacc
2       unacc
3       unacc
4       unacc
        ...  
1723     good
1724    vgood
1725    unacc
1726     good
1727    vgood
Name: class, Length: 1728, dtype: object

In [11]:
# Converting the labels using Label Encoder
le = LabelEncoder()
for i in range(len(X[0])):
    X[:,i] = le.fit_transform(X[:,i])

In [27]:
X # Encoded labels 3 - good, 2 - med, 1 - bad
# Label encoding - process of converting the labels(categorical variables) into numerical values

array([[3, 3, 1],
       [3, 3, 2],
       [3, 3, 0],
       ...,
       [1, 1, 1],
       [1, 1, 2],
       [1, 1, 0]], dtype=object)

In [13]:
y

0       unacc
1       unacc
2       unacc
3       unacc
4       unacc
        ...  
1723     good
1724    vgood
1725    unacc
1726     good
1727    vgood
Name: class, Length: 1728, dtype: object

In [14]:
y.unique()

array(['unacc', 'acc', 'vgood', 'good'], dtype=object)

In [15]:
dict_mapping = {'unacc':0, 'acc':1, 'good':2, 'vgood':3}

In [16]:
y = np.array(y.map(dict_mapping))

In [17]:
y

array([0, 0, 0, ..., 0, 2, 3], dtype=int64)

In [18]:
# Model Creation KNN

In [19]:
print(X,y)

[[3 3 1]
 [3 3 2]
 [3 3 0]
 ...
 [1 1 1]
 [1 1 2]
 [1 1 0]] [0 0 0 ... 0 2 3]


In [20]:
knn = neighbors.KNeighborsClassifier(n_neighbors=15,weights='distance')

# knn.fit(X,y)
# Split in train and test data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [21]:
print(len(X_train),len(X_test),len(y_train),len(y_test))

1382 346 1382 346


In [22]:
knn.fit(X_train,y_train)

KNeighborsClassifier(n_neighbors=15, weights='distance')

In [23]:
predictions = knn.predict(X_test)

In [30]:
predictions


array([0, 0, 0, 2, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 3, 0, 0, 0, 2, 1, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 3, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       2, 0, 1, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       3, 3, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 1, 3, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0,

In [31]:
accuracy = metrics.accuracy_score(y_test,predictions)

In [32]:
accuracy

0.7427745664739884

In [41]:
knn2 = neighbors.KNeighborsClassifier(n_neighbors=10)
knn2.fit(X_train,y_train)
predictions2 = knn2.predict(X_test)
accuracy2 = metrics.accuracy_score(y_test,predictions2)
accuracy2

0.7023121387283237

In [42]:
print("Accuracy using KNN with n_neighbors=15 and weights='distance' is: ",accuracy)
print("Accuracy using KNN with n_neighbors=10 is: ",accuracy2)

Accuracy using KNN with n_neighbors=15 and weights='distance' is:  0.7427745664739884
Accuracy using KNN with n_neighbors=10 is:  0.7023121387283237


In [43]:
# So use KNN with n_neighbors=15 and weights='distance' !

In [49]:
print("actual value",y[0:10])
print("predicted value",predictions[0:10])
countOfwrongPredictions = 0
for i in range(len(y[0:10])):
    countOfwrongPredictions += int(y_test[i] != predictions[i])
print("Number of wrong predictions: ",countOfwrongPredictions)

actual value [0 0 0 0 0 0 0 0 0 0]
predicted value [0 0 0 2 1 0 0 0 1 0]
Number of wrong predictions:  1


In [50]:
# KNN Basics done
