# K-Nearest Neighbors

In [29]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics, preprocessing

%cd "G:/Archive"

G:\Archive


## Data Prep

In [25]:
#load data
data = pd.read_csv("data/wine quality red.csv", header = 0)

#aggregate into binary classification problem
data["quality"] = pd.Categorical(np.where(data["quality"] > 5, "above_avg", "below_avg"))

#split data
X = data.iloc[:, :11]
y = data.iloc[:, 11]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1234)

## Searching Optimal Parameters
KNN is essentially measuring distance between sample points, so it is sensitive to numerical scales of variables.
### No scaling

In [27]:
#10-fold cv with accuracy as target metrics
grid1 = {"n_neighbors": range(1, 11)}
model1 = GridSearchCV(KNeighborsClassifier(), param_grid = grid1, cv = 10, scoring = "accuracy")
model1.fit(X_train, y_train)

#show tuning results
print("Best score: ", model1.best_score_)
print("Corresponding parms: ", model1.best_params_)
print("Model description: ", model1.best_estimator_)

#predict on test set
pred1 = model1.predict(X_test)

#show prediction performance
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, pred1)))
print("Recall: {0:.4f}".format(metrics.recall_score(y_test, pred1, pos_label = "above_avg", average = "binary")))
print("Precision: {0:.4f}".format(metrics.precision_score(y_test, pred1, pos_label = "above_avg", average = "binary")))
print("F1 score: {0:.4f}".format(metrics.f1_score(y_test, pred1, pos_label = "above_avg", average = "binary")))

Best score:  0.6935285433070866
Corresponding parms:  {'n_neighbors': 1}
Model description:  KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')
Accuracy: 0.7312
Recall: 0.7442
Precision: 0.7529
F1 score: 0.7485


### Standardization
Each variable is scaled to a distribution with 0 mean, 1 standard deviation.

In [35]:
#use same scaling to train and test sets
scale1 = preprocessing.StandardScaler().fit(X_train)
X_train_scale1 = scale1.transform(X_train)
X_test_scale1 = scale1.transform(X_test)

In [39]:
model2 = GridSearchCV(KNeighborsClassifier(), param_grid = grid1, cv = 10, scoring = "accuracy")
model2.fit(X_train_scale1, y_train)

#show tuning results
print("Best score: ", model2.best_score_)
print("Corresponding parms: ", model2.best_params_)
print("Model description: ", model2.best_estimator_)

#predict on test set
pred2 = model2.predict(X_test_scale1)

#show prediction performance
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, pred2)))
print("Recall: {0:.4f}".format(metrics.recall_score(y_test, pred2, pos_label = "above_avg", average = "binary")))
print("Precision: {0:.4f}".format(metrics.precision_score(y_test, pred2, pos_label = "above_avg", average = "binary")))
print("F1 score: {0:.4f}".format(metrics.f1_score(y_test, pred2, pos_label = "above_avg", average = "binary")))

Best score:  0.766197096456693
Corresponding parms:  {'n_neighbors': 1}
Model description:  KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')
Accuracy: 0.7562
Recall: 0.7616
Precision: 0.7798
F1 score: 0.7706


### Normalization
Each variable is scaled to a [0, 1] range.

In [37]:
scale2 = preprocessing.MinMaxScaler().fit(X_train)
X_train_scale2 = scale2.transform(X_train)
X_test_scale2 = scale2.transform(X_test)

In [41]:
model3 = GridSearchCV(KNeighborsClassifier(), param_grid = grid1, cv = 10, scoring = "accuracy")
model3.fit(X_train_scale2, y_train)

#show tuning results
print("Best score: ", model3.best_score_)
print("Corresponding parms: ", model3.best_params_)
print("Model description: ", model3.best_estimator_)

#predict on test set
pred3 = model3.predict(X_test)

#show prediction performance
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, pred3)))
print("Recall: {0:.4f}".format(metrics.recall_score(y_test, pred3, pos_label = "above_avg", average = "binary")))
print("Precision: {0:.4f}".format(metrics.precision_score(y_test, pred3, pos_label = "above_avg", average = "binary")))
print("F1 score: {0:.4f}".format(metrics.f1_score(y_test, pred3, pos_label = "above_avg", average = "binary")))

Best score:  0.7568343996062993
Corresponding parms:  {'n_neighbors': 1}
Model description:  KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')
Accuracy: 0.4906
Recall: 0.8953
Precision: 0.5151
F1 score: 0.6539


array([[154,  18],
       [145,   3]], dtype=int64)

## Visualization