In [2]:
import numpy as np
import pandas as pd
from sklearn import neighbors, metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [4]:
data = pd.read_csv("car.data")
print(data.head())

  buying  maint doors persons lug_boot safety  class
0  vhigh  vhigh     2       2    small    low  unacc
1  vhigh  vhigh     2       2    small    med  unacc
2  vhigh  vhigh     2       2    small   high  unacc
3  vhigh  vhigh     2       2      med    low  unacc
4  vhigh  vhigh     2       2      med    med  unacc


In [9]:
X = data[[
    'buying',
    'maint',
    'safety'
]].values

y = data[['class']]

print(X, y)

[['vhigh' 'vhigh' 'low']
 ['vhigh' 'vhigh' 'med']
 ['vhigh' 'vhigh' 'high']
 ...
 ['low' 'low' 'low']
 ['low' 'low' 'med']
 ['low' 'low' 'high']]       class
0     unacc
1     unacc
2     unacc
3     unacc
4     unacc
...     ...
1723   good
1724  vgood
1725  unacc
1726   good
1727  vgood

[1728 rows x 1 columns]


In [10]:
# We have strings like 'low', 'unacc' and we need to convert these into numbers to feed into the machine learning algorithm
# The following is one kind of encoding, which uses a label encoder
Le = LabelEncoder()
for i in range(len(X[0])):
    X[:, i] = Le.fit_transform(X[:, i])

print(X)

[[3 3 1]
 [3 3 2]
 [3 3 0]
 ...
 [1 1 1]
 [1 1 2]
 [1 1 0]]


In [11]:
# For y values, we will use a different type of conversion from the label encoder
# We use mapping
label_mapping = {
    'unacc' : 0,
    'acc' : 1,
    'good' : 2,
    'vgood' : 3
}

y['class'] = y['class'].map(label_mapping)

# Convert to a numpy array
y = np.array(y)
print(y)

[[0]
 [0]
 [0]
 ...
 [0]
 [2]
 [3]]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['class'] = y['class'].map(label_mapping)


In [12]:
# Now, data is in the correct form. We can start creating the model
knn = neighbors.KNeighborsClassifier(n_neighbors=25, weights='uniform')

In [14]:
# Create training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1382, 3)
(346, 3)
(1382, 1)
(346, 1)


In [15]:
# Train the model
knn.fit(X_train, y_train)

  knn.fit(X_train, y_train)


KNeighborsClassifier(n_neighbors=25)

In [17]:
# Make a predictions
prediction = knn.predict(X_test)
print(prediction)

[0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 3 1 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 2 0 1 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0
 0 0 0 3 0 0 0 2 0 0 0 0 1 1 2 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 0 0
 0 1 1 0 1 0 0 0 0 0 0 3 1 1 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 2 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 2 1 0 0 1 0 0 1 0 0 0 0 1 0 0 0 1
 0 0 0 0 2 1 0 3 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 2 0 0 0 0 0
 0 0 0 0 1 1 1 1 0 3 0 0 2 0 0 1 0 1 0 0 0 0 1 0 0 0 1 0 0 1 0 0 3 0 0 0 1
 0 0 0 0 0 0 0 0 0 3 1 0 1 0 0 0 1 0 3 3 0 1 0 0 1 0 0 0 3 0 0 0 1 0 0 0 1
 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0
 0 0 1 0 1 0 1 0 0 0 0 0 0]


In [18]:
# Check the accuracy of the predictions
accuracy = metrics.accuracy_score(y_test, prediction)
print(accuracy)

0.7456647398843931


In [20]:
# Testing the prediction
print("Actual value: ", y[20])
print("Predicted value: ", knn.predict(X)[20])

Actual value:  [0]
Predicted value:  0
