# K Nearest Neighbors Classifier

**Basic steps:**

1. Import the learning algorithm
2. Instantiate the model (choose hyper-parameters)
3. Learn the model
4. Predict the response

In [1]:
import pandas as pd


# Get Example Data

In [9]:
# Load the example data set. The target column is InMichelin: whether or not
# a restaurant is in the Michelin guide.
MICHELIN_CSV_URL = "http://gattonweb.uky.edu/sheather/book/docs/datasets/MichelinNY.csv"
data = pd.read_csv(MICHELIN_CSV_URL, encoding="latin_1")
data.head()

Unnamed: 0,InMichelin,Restaurant Name,Food,Decor,Service,Price
0,0,14 Wall Street,19,20,19,50
1,0,212,17,17,16,43
2,0,26 Seats,23,17,21,35
3,1,44,19,23,16,52
4,0,A,23,12,19,24


In [10]:
# Remove the restaurant name: it is the one extra variable that is not
# continuous. errors="ignore" keeps this cell safe to re-run after the
# column has already been removed.
data = data.drop(columns=["Restaurant Name"], errors="ignore")

data.head()

Unnamed: 0,InMichelin,Food,Decor,Service,Price
0,0,19,20,19,50
1,0,17,17,16,43
2,0,23,17,21,35
3,1,19,23,16,52
4,0,23,12,19,24


# Separate the target (y) from the features (X) to create the train/test split

In [12]:
# Target vector: the InMichelin column.
y = data['InMichelin']
# Feature matrix: every remaining column.
X = data.drop(columns='InMichelin')

# Show the first five target values, then the first feature rows.
print(y.iloc[:5])
X.head()

0    0
1    0
2    0
3    1
4    0
Name: InMichelin, dtype: int64


Unnamed: 0,Food,Decor,Service,Price
0,19,20,19,50
1,17,17,16,43
2,23,17,21,35
3,19,23,16,52
4,23,12,19,24


# Train test split

In [13]:
from sklearn.model_selection import train_test_split

# Use train_test_split(X, y) to create four new data sets; the split defaults
# to .75/.25 (train/test).
# random_state is fixed so the shuffle -- and with it every accuracy score
# computed from this split -- is reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

X_train.head()

Unnamed: 0,Food,Decor,Service,Price
124,22,22,19,53
76,27,26,27,95
93,22,15,19,39
158,22,14,16,27
106,25,19,23,50


### Train model with k=5

In [14]:
from sklearn.neighbors import KNeighborsClassifier

# Instantiate the classifier with k=5 and learn the model from the training data.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# y_pred holds the model's predictions for the test set.
y_pred = knn.predict(X_test)

# Report test-set accuracy, rounded to two digits to the right of the decimal.
test_accuracy = knn.score(X_test, y_test)
print("accuracy: {:.2f}".format(test_accuracy))

accuracy: 0.83


### Train model with k=10

In [15]:
# Same workflow as before, this time with k=10 neighbors.
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, y_train)

# Predictions for the test set (overwrites any earlier y_pred).
y_pred = knn.predict(X_test)

# Report test-set accuracy, rounded to two digits to the right of the decimal.
test_accuracy = knn.score(X_test, y_test)
print("accuracy: {:.2f}".format(test_accuracy))


accuracy: 0.83


In [16]:
# View the predictions made for the test data.
y_pred

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1],
      dtype=int64)

## Using cross-validation for model evaluation

In [19]:
# Import cross-validation functions from scikit-learn

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import StratifiedKFold

from sklearn.model_selection import KFold

# Set up the splitters for the different cross-validation strategies.
# The two splitters that shuffle get a fixed random_state so the fold
# assignment -- and therefore the printed scores -- is the same on every run.
kfold = KFold(n_splits=5)  # no shuffling requested, so no seed is needed
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=42)  # 5 splits x 10 repeats = 50 scores

# n_neighbors=5 is spelled out for all three runs so it is obvious that the
# same model is being evaluated under each strategy.
print("KFold:\n{}".format(
    cross_val_score(KNeighborsClassifier(n_neighbors=5), X, y, cv=kfold)))

print("StratifiedKFold:\n{}".format(
    cross_val_score(KNeighborsClassifier(n_neighbors=5), X, y, cv=skfold)))

print("RepeatedKFold:\n{}".format(
    cross_val_score(KNeighborsClassifier(n_neighbors=5), X, y, cv=rkf)))


KFold:
[0.78787879 0.84848485 0.75757576 0.78787879 0.78125   ]
StratifiedKFold:
[0.81818182 0.87878788 0.81818182 0.75757576 0.78125   ]
RepeatedKFold:
[0.72727273 0.81818182 0.84848485 0.87878788 0.6875     0.81818182
 0.78787879 0.72727273 0.87878788 0.71875    0.81818182 0.84848485
 0.78787879 0.81818182 0.71875    0.78787879 0.78787879 0.6969697
 0.84848485 0.71875    0.75757576 0.75757576 0.81818182 0.6969697
 0.84375    0.6969697  0.81818182 0.84848485 0.84848485 0.8125
 0.84848485 0.75757576 0.78787879 0.75757576 0.875      0.72727273
 0.81818182 0.81818182 0.81818182 0.75       0.84848485 0.81818182
 0.78787879 0.84848485 0.75       0.6969697  0.72727273 0.90909091
 0.90909091 0.6875    ]


## Tuning models with grid search

In [20]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# Re-split the data, stratifying on y. random_state is fixed so the tuning
# results below can be reproduced (an unseeded split picks a different best k
# from one run to the next).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42)

# Parameter grid: a dictionary whose key is the knn parameter name
# 'n_neighbors' and whose values are the k values to build models for.
# np.arange(1, 15, 2) yields the odd values 1, 3, ..., 13.
param_grid = {'n_neighbors': np.arange(1, 15, 2)}

# 10-fold cross-validation for every candidate k.
# return_train_score=True makes cv_results_ include the train-score columns;
# recent scikit-learn versions leave them out by default.
grid = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid, cv=10,
                    return_train_score=True)

# The grid-search object is a meta-model: fit, score and predict are called
# on it just like on a single estimator.
grid.fit(X_train, y_train)

# Best score and parameters are exposed as "best_score_" and "best_params_".
print("best mean cross-validation score: {:.3f}".format(grid.best_score_))
print("best parameters: {}".format(grid.best_params_))
print("test-set score: {:.3f}".format(grid.score(X_test, y_test)))


best mean cross-validation score: 0.772
best parameters: {'n_neighbors': 13}
test-set score: 0.927


In [74]:
# Put the complete tuning results into a DataFrame and display it.
results = pd.DataFrame.from_dict(grid.cv_results_)
results


Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_n_neighbors,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,...,split7_test_score,split7_train_score,split8_test_score,split8_train_score,split9_test_score,split9_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.0009,0.0005,0.682927,1.0,1,{u'n_neighbors': 1},7,1.0,1.0,0.769231,...,0.416667,1.0,0.636364,1.0,0.909091,1.0,0.000539,0.0005,0.182205,0.0
1,0.0009,0.0005,0.739837,0.847301,3,{u'n_neighbors': 3},6,0.692308,0.827273,0.846154,...,0.583333,0.873874,0.818182,0.857143,0.727273,0.848214,0.0003,0.0005,0.124125,0.015701
2,0.0008,0.0005,0.764228,0.841911,5,{u'n_neighbors': 5},5,0.846154,0.818182,0.923077,...,0.5,0.873874,0.909091,0.8125,0.818182,0.857143,0.0004,0.0005,0.134918,0.019855
3,0.0007,0.0006,0.821138,0.822042,7,{u'n_neighbors': 7},1,1.0,0.809091,0.846154,...,0.666667,0.846847,0.909091,0.8125,0.818182,0.821429,0.000458,0.00049,0.110445,0.011399
4,0.0007,0.0003,0.796748,0.812123,9,{u'n_neighbors': 9},2,1.0,0.781818,0.846154,...,0.583333,0.846847,0.909091,0.794643,0.818182,0.803571,0.000458,0.000458,0.119982,0.018205
5,0.0006,0.0006,0.796748,0.803081,11,{u'n_neighbors': 11},2,1.0,0.772727,0.846154,...,0.583333,0.828829,0.909091,0.794643,0.818182,0.794643,0.00049,0.00049,0.119982,0.015396
6,0.0008,0.0004,0.780488,0.803982,13,{u'n_neighbors': 13},4,1.0,0.772727,0.846154,...,0.5,0.855856,0.818182,0.785714,0.818182,0.803571,0.0004,0.00049,0.131116,0.023481
