# KNN

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier

from ml import *

# Wine Quality Dataset

In [None]:
# Preprocess dataset
wineDF = pd.read_csv("./data/winequality-red.csv")

# Binarize the target: a quality score above 6 is 'good', anything else 'bad'.
wineDF['label'] = wineDF['quality'].apply(lambda x: 'good' if x > 6 else 'bad')
# DataFrame.drop returns a new frame, so the result must be assigned back.
# Otherwise the raw 'quality' score (from which the label is derived) stays
# in the feature matrix and leaks the target into the model.
wineDF = wineDF.drop('quality', axis=1)

# Targets as an array of labels, features as every remaining column.
wine_y = wineDF['label'].values
wine_X = wineDF.drop('label', axis=1).values

In [None]:
# Standardize dataset: fit the scaler on the wine features, then replace the
# features with their scaled version.
# NOTE(review): the scaler is fitted on the full data set, before the
# train/test split done further down - confirm this is acceptable.
scaler = StandardScaler()
scaler.fit(wine_X)
wine_X = scaler.transform(wine_X)

In [None]:
# Evaluate KNN on a hold-out split for a range of neighbor counts
all_metrics = []
train_score = {}
test_score = {}

# Split once, with a fixed seed, so every neighbor count is scored on the
# same train/test partition. Re-drawing an unseeded split per iteration made
# the scores of different k values incomparable and the run unreproducible.
X_train, X_test, y_train, y_test = train_test_split(
    wine_X, wine_y, test_size=0.2, stratify=wine_y, random_state=42
)

n_neighbors = np.arange(2, 30, 1)
for i in n_neighbors:
    model = KNeighborsClassifier(n_neighbors=i)

    # fit_and_predict comes from the project's `ml` module; presumably it
    # fits `model` on the training data - model.score() below relies on that.
    metrics, report, cm = fit_and_predict(model, X_train, X_test, y_train, y_test)
    all_metrics.append(metrics)
    print(f"\n- Metrics for {i} neighbors in KNN:")
    print(report)

    # Accuracy on both partitions, keyed by the neighbor count.
    train_score[i] = model.score(X_train, y_train)
    test_score[i] = model.score(X_test, y_test)

print("\nAverage metrics:")
print_avg_metrics(all_metrics)

# Materialize the dict views as lists: a plain sequence is the input
# matplotlib handles reliably.
plt.plot(n_neighbors, list(train_score.values()), label="Train Accuracy")
plt.plot(n_neighbors, list(test_score.values()), label="Test Accuracy")
plt.xlabel("Number Of Neighbors")
plt.ylabel("Accuracy")
plt.title("KNN: Varying number of Neighbors")
plt.legend()
plt.xlim(0, 33)
plt.ylim(0.70, 1.00)
plt.grid()
plt.show()
    

In [None]:
# Finding the best parameters

# Report every neighbor count that reaches the highest hold-out accuracy
# (ties are all printed). The maximum is computed once instead of on every
# iteration; default=None keeps an empty score table a silent no-op.
best_test_accuracy = max(test_score.values(), default=None)
for key, value in test_score.items():
    if value == best_test_accuracy:
        print(f"Best parameter for KNN based on accuracy: amount of neighbors {key}")

# 5-fold grid search over the same range of neighbor counts, run on the
# training split left in scope by the evaluation cell above.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
parameter = {'n_neighbors': np.arange(2, 30, 1)}
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, param_grid=parameter, cv=kf, verbose=1)
knn_cv.fit(X_train, y_train)
print(f"Best parameter is: {knn_cv.best_params_}")

# Heart Disease Dataset

In [None]:
# Preprocess dataset
def _heart_label(flag):
    """Map a truthy target value to 'heart disease' and a falsy one to 'normal'."""
    if flag:
        return 'heart disease'
    return 'normal'


heartDF = pd.read_csv("./data/heart_statlog_cleveland_hungary_final.csv")

# Replace the target column with readable class labels.
heartDF['target'] = heartDF['target'].apply(_heart_label)

# Features are every column except the target; targets are the label array.
heart_X = heartDF.drop(columns='target').values
heart_y = heartDF['target'].values

In [None]:
# Standardize dataset: fit the scaler on the heart features, then replace the
# features with their scaled version.
# NOTE(review): the scaler is fitted on the full data set, before the
# train/test split done further down - confirm this is acceptable.
scaler = StandardScaler()
scaler.fit(heart_X)
heart_X = scaler.transform(heart_X)

In [None]:
# Evaluate KNN on a hold-out split for a range of neighbor counts
all_metrics = []
train_score = {}
test_score = {}

# Split once, with a fixed seed, so every neighbor count is scored on the
# same train/test partition. Re-drawing an unseeded split per iteration made
# the scores of different k values incomparable and the run unreproducible.
X_train, X_test, y_train, y_test = train_test_split(
    heart_X, heart_y, test_size=0.2, stratify=heart_y, random_state=42
)

n_neighbors = np.arange(2, 30, 1)
for i in n_neighbors:
    model = KNeighborsClassifier(n_neighbors=i)

    # fit_and_predict comes from the project's `ml` module; presumably it
    # fits `model` on the training data - model.score() below relies on that.
    metrics, report, cm = fit_and_predict(model, X_train, X_test, y_train, y_test)
    all_metrics.append(metrics)
    print(f"\n- Metrics for {i} neighbors in KNN:")
    print(report)

    # Accuracy on both partitions, keyed by the neighbor count.
    train_score[i] = model.score(X_train, y_train)
    test_score[i] = model.score(X_test, y_test)

print("\nAverage metrics:")
print_avg_metrics(all_metrics)

# Materialize the dict views as lists: a plain sequence is the input
# matplotlib handles reliably.
plt.plot(n_neighbors, list(train_score.values()), label="Train Accuracy")
plt.plot(n_neighbors, list(test_score.values()), label="Test Accuracy")
plt.xlabel("Number Of Neighbors")
plt.ylabel("Accuracy")
plt.title("KNN: Varying number of Neighbors")
plt.legend()
plt.xlim(0, 33)
plt.ylim(0.70, 1.00)
plt.grid()
plt.show()

In [None]:
# Finding the best parameters

# Report every neighbor count that reaches the highest hold-out accuracy
# (ties are all printed). The maximum is computed once instead of on every
# iteration; default=None keeps an empty score table a silent no-op.
best_test_accuracy = max(test_score.values(), default=None)
for key, value in test_score.items():
    if value == best_test_accuracy:
        print(f"Best parameter for KNN based on accuracy: amount of neighbors {key}")

# 5-fold grid search over the same range of neighbor counts, run on the
# training split left in scope by the evaluation cell above.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
parameter = {'n_neighbors': np.arange(2, 30, 1)}
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, param_grid=parameter, cv=kf, verbose=1)
knn_cv.fit(X_train, y_train)
print(f"Best parameter is: {knn_cv.best_params_}")