# Import packages

In [1]:
from pathlib import Path
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
import matplotlib.pylab as plt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics import mean_squared_error
from math import sqrt
import seaborn as sns
from sklearn.decomposition import PCA
import numpy as np

# Read the data, use only the first 1436 records and select columns for KNN

In [2]:
# Load only the first 1436 records. Limiting the read with `nrows` (instead of
# reading everything and slicing with .iloc afterwards) makes car_df an
# independent frame rather than a slice of a larger one, so columns can be
# added to it later without a chained-assignment warning.
car_df = pd.read_csv('TrainingDataToyotaCorolla.csv', nrows=1436)

# Add index and present the 1436 data records

In [3]:
# Number the records starting from 1 (the frame's index starts at 0),
# then display the records.
record_numbers = car_df.index + 1
car_df['Number'] = record_numbers
car_df.head(1436)

Unnamed: 0,Price,Age_08_04,HP,Quarterly_Tax,Automatic_airco,Number
0,13500,23,90,210,0,1
1,13750,23,90,210,0,2
2,13950,24,90,210,0,3
3,14950,26,90,210,0,4
4,13750,30,90,210,0,5
...,...,...,...,...,...,...
1431,7500,69,86,69,0,1432
1432,10845,72,86,69,0,1433
1433,8500,71,86,69,0,1434
1434,7250,70,86,69,0,1435


# Splitting the data into training (60%) and validation (40%) sets

In [4]:
# Hold out 40% of the records for validation; the fixed random_state makes
# the partition reproducible between runs.
partitions = train_test_split(car_df, test_size=0.4, random_state=1437)
trainData, validData = partitions
print(trainData.shape, validData.shape)

(861, 6) (575, 6)


# Establish Testing data to be classified by the model

In [5]:
# Two identical query cars described only by the four predictor columns
# (no Price); built from one record repeated twice.
new_car = {'Age_08_04': 23, 'HP': 90, 'Quarterly_Tax': 210, 'Automatic_airco': 0}
car_df1 = pd.DataFrame([new_car, new_car])
car_df1

Unnamed: 0,Age_08_04,HP,Quarterly_Tax,Automatic_airco
0,23,90,210,0
1,23,90,210,0


# The predictor variables (Age_08_04, HP, Quarterly_Tax and Automatic_airco) have different ranges, so they must be normalized to z-scores.
# Fit the scaler on the training data, then transform the full dataset and the new records

In [6]:
# Standardise the four predictors to z-scores. The scaler is fitted on the
# training partition only and then applied to the full dataset and to the
# new query records.
feature_cols = ['Age_08_04', 'HP', 'Quarterly_Tax', 'Automatic_airco']
z_cols = ['z' + col for col in feature_cols]

scaler = preprocessing.StandardScaler()
scaler.fit(trainData[feature_cols])

# scaler.transform returns a bare array, so the z-score frame is given
# car_df's index explicitly; the column-wise concat with Price/Number then
# lines rows up by label regardless of what that index looks like.
carNorm = pd.concat(
    [pd.DataFrame(scaler.transform(car_df[feature_cols]),
                  columns=z_cols, index=car_df.index),
     car_df[['Price', 'Number']]],
    axis=1)

# trainData.index / validData.index hold index *labels*, so select with the
# label-based .loc. (.iloc is positional and only gives the same rows while
# the index happens to be the default 0..n-1.)
trainNorm = carNorm.loc[trainData.index]
validNorm = carNorm.loc[validData.index]

# Select the predictors by name so the new records reach the scaler in the
# same column order it was fitted with.
newcarNorm = pd.DataFrame(scaler.transform(car_df1[feature_cols]), columns=z_cols)

# Use k-nearest neighbour against the normalized training data with K=5

In [7]:
# Find the 5 training records closest to the new cars in z-score space.
predictor_cols = ['zAge_08_04', 'zHP', 'zQuarterly_Tax', 'zAutomatic_airco']
knn = NearestNeighbors(n_neighbors=5)
knn.fit(trainNorm[predictor_cols])
distances, indices = knn.kneighbors(newcarNorm)
# kneighbors returns one row of neighbour positions per query record;
# only the neighbours of the first query record are shown.
nearest_rows = trainNorm.iloc[indices[0]]
print(nearest_rows)

   zAge_08_04       zHP  zQuarterly_Tax  zAutomatic_airco  Price  Number
0   -1.764844 -0.777545        3.067661         -0.266262  13500       1
1   -1.764844 -0.777545        3.067661         -0.266262  13750       2
2   -1.711624 -0.777545        3.067661         -0.266262  13950       3
3   -1.605185 -0.777545        3.067661         -0.266262  14950       4
7   -1.392306 -0.777545        3.067661         -0.266262  18600       8


# Use k-nearest neighbour against the normalized training data with K=10

In [8]:
# Find the 10 training records closest to the new cars in z-score space.
predictor_cols = ['zAge_08_04', 'zHP', 'zQuarterly_Tax', 'zAutomatic_airco']
knn = NearestNeighbors(n_neighbors=10)
knn.fit(trainNorm[predictor_cols])
distances, indices = knn.kneighbors(newcarNorm)
# kneighbors returns one row of neighbour positions per query record;
# only the neighbours of the first query record are shown.
nearest_rows = trainNorm.iloc[indices[0]]
print(nearest_rows)

     zAge_08_04       zHP  zQuarterly_Tax  zAutomatic_airco  Price  Number
0     -1.764844 -0.777545        3.067661         -0.266262  13500       1
1     -1.764844 -0.777545        3.067661         -0.266262  13750       2
2     -1.711624 -0.777545        3.067661         -0.266262  13950       3
3     -1.605185 -0.777545        3.067661         -0.266262  14950       4
7     -1.392306 -0.777545        3.067661         -0.266262  18600       8
5     -1.285866 -0.777545        3.067661         -0.266262  12950       6
243   -1.232646 -0.777545        3.067661         -0.266262  13500     244
214   -1.232646 -0.777545        3.067661         -0.266262  13500     215
48    -1.818064 -0.777545        3.662879         -0.266262  17950      49
118   -1.924503 -0.777545        3.662879         -0.266262  19250     119


# To check which K gives the best results, let's initialize a data frame with two columns: `k` and `accuracy`

In [9]:
# Split the normalised partitions into predictors (X) and target (y).
predictor_cols = ['zAge_08_04', 'zHP', 'zQuarterly_Tax', 'zAutomatic_airco']
train_X = trainNorm[predictor_cols]
train_y = trainNorm['Price']
valid_X = validNorm[predictor_cols]
valid_y = validNorm['Price']

# NOTE(review): Price is used here as a class label, so accuracy counts only
# exact price matches on the validation set. If a price *estimate* is the
# goal, KNeighborsRegressor scored with RMSE may be more appropriate -
# confirm the intent before changing it.

# Largest k to try, derived from the training-set size instead of a
# hard-coded 861 so the sweep stays valid if the split changes
# (k cannot exceed the number of training records).
max_k = len(train_X) - 1

# Train one classifier per k and record its validation accuracy.
results = []
for k in range(1, max_k + 1):
    knn = KNeighborsClassifier(n_neighbors=k).fit(train_X, train_y)
    results.append({'k': k, 'accuracy': accuracy_score(valid_y, knn.predict(valid_X))})

# Convert results to a pandas data frame
results = pd.DataFrame(results)
print(results)

       k  accuracy
0      1  0.073043
1      2  0.050435
2      3  0.046957
3      4  0.043478
4      5  0.057391
..   ...       ...
855  856  0.067826
856  857  0.067826
857  858  0.067826
858  859  0.067826
859  860  0.067826

[860 rows x 2 columns]


In [10]:
# Lift pandas' display limits (rows, columns, line width, cell width) so
# frames are printed in full instead of being truncated.
for option_name in ('display.max_rows',
                    'display.max_columns',
                    'display.width',
                    'display.max_colwidth'):
    pd.set_option(option_name, None)