# Using K-Nearest Neighbors to predict the safety metric of an arbitrary car #

### We use a 5-featured set of over 1700 cars consisting of the number of doors, lug boot quality, maintenance and price to predict our target: safety ###

### The K-Nearest Neighbours learning algorithim classifies points based on its "k" nearest neighbors and their respective classes. ###

In [102]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import sklearn 
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model, preprocessing
## I did not add this: 
from sklearn.utils import shuffle


In [103]:
df = pd.read_csv("car.data", sep=",")

car_data = df

#Encode target labels with value between 0 and n-1 (n classes) 
le = preprocessing.LabelEncoder()  
#We do this for each column to ensure everything is encoded. We cannot perform computations on string data, must be numerical in this case: 

buying = le.fit_transform(list(car_data["buying"]))
maint = le.fit_transform(list(car_data["maint"]))
door = le.fit_transform(list(car_data["door"]))
persons = le.fit_transform(list(car_data["persons"]))
lug_boot = le.fit_transform(list(car_data["lug_boot"]))
safety = le.fit_transform(list(car_data["safety"]))
cls = le.fit_transform(list(car_data["class"]))


predict = "class"
#data:
X = list(zip(buying,maint,door,persons,lug_boot,safety))
y = list(cls)

x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.1)

In [104]:
#Visualizing shapes
print(f" Training x is a {len(x_train)} x {len(x_train[0])} matrix\n")
print(f" Training y is a {len(y_train)} vector\n")
print(f" Testing x is a {len(x_test)} x {len(x_test[0])} matrix\n")
print(f" Testing y is a {len(y_test)} vector\n")

 Training x is a 1555 x 6 matrix

 Training y is a 1555 vector

 Testing x is a 173 x 6 matrix

 Testing y is a 173 vector



In [105]:
# Define the Model:
model = KNeighborsClassifier(n_neighbors=7)
#Fit the model to some training set: 
model.fit(x_train, y_train)
#Test model on the test sets 
acc = model.score(x_test, y_test)
print(acc)


0.976878612716763


In [106]:
# predict data 

predicted = model.predict(x_test)
names =["unacceptable","acceptable","good","very good"]


for i in range(len(predicted)): 
    print(f"Predicted: {names[predicted[i]]} Data: {x_test[i]}, Actual: {names[y_test[i]]}\n")
    n = model.kneighbors([x_test[i]],7,True)
    print(f"Distances:{n[0]}\n Indices: {n[1]}\n")

Predicted: good Data: (np.int64(3), np.int64(3), np.int64(3), np.int64(1), np.int64(1), np.int64(2)), Actual: good

Distances:[[1. 1. 1. 1. 1. 1. 1.]]
 Indices: [[ 231  908 1435 1150 1243  861  447]]

Predicted: unacceptable Data: (np.int64(2), np.int64(3), np.int64(3), np.int64(1), np.int64(0), np.int64(2)), Actual: unacceptable

Distances:[[1. 1. 1. 1. 1. 1. 1.]]
 Indices: [[ 861  912  858 1150  679  178  101]]

Predicted: unacceptable Data: (np.int64(1), np.int64(0), np.int64(2), np.int64(1), np.int64(2), np.int64(2)), Actual: unacceptable

Distances:[[1. 1. 1. 1. 1. 1. 1.]]
 Indices: [[ 176   47 1349  983  778  469  623]]

Predicted: good Data: (np.int64(3), np.int64(1), np.int64(0), np.int64(0), np.int64(0), np.int64(0)), Actual: good

Distances:[[1.         1.         1.         1.         1.         1.
  1.41421356]]
 Indices: [[1330 1449  859  904 1375 1185 1160]]

Predicted: good Data: (np.int64(3), np.int64(2), np.int64(0), np.int64(0), np.int64(0), np.int64(0)), Actual: good