In [11]:
%run ./global.ipynb

# KNN Classification

[Reference](https://realpython.com/knn-python/#use-knn-to-predict-the-age-of-sea-slugs)

## K = 3

In [43]:
# Baseline model: a k-nearest-neighbors classifier with a hand-picked k.

from sklearn.neighbors import KNeighborsClassifier

k = 3
# fit() hands back the estimator itself, so construction and training chain.
model = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
model

## GridSearchCV with neighbors

In [58]:
# Create and fit a KNN classifier using grid search.
# The only searched hyperparameter is the number of neighbors (1 to 49).

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
parameters = {
    "n_neighbors": range(1, 50),
}
model = GridSearchCV(KNeighborsClassifier(), parameters)
# No `_ = ...` here: a notebook cell only echoes its last expression, and
# assigning to `_` stops IPython from updating its last-output variable.
model.fit(X_train, y_train)
# Last expression of the cell: shows the winning parameter combination.
model.best_params_

{'n_neighbors': 9}

## GridSearchCV with neighbors and weights

In [59]:
# Create and fit a KNN classifier using grid search.
# Searched hyperparameters: number of neighbors (1 to 49) and the
# neighbor weighting scheme ("uniform" or "distance").

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
parameters = {
    "n_neighbors": range(1, 50),
    "weights": ["uniform", "distance"],
}
model = GridSearchCV(KNeighborsClassifier(), parameters)
# No `_ = ...` here: a notebook cell only echoes its last expression, and
# assigning to `_` stops IPython from updating its last-output variable.
model.fit(X_train, y_train)
# Last expression of the cell: shows the winning parameter combination.
model.best_params_

{'n_neighbors': 9, 'weights': 'uniform'}

## Bagging with GridSearchCV

With neighbors and weights as GridSearchCV parameters.

In [63]:
# Grid-search the KNN hyperparameters (neighbors and weights); the winning
# combination is what configures the base estimator for bagging.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
parameters = {
    "n_neighbors": range(1, 50),
    "weights": ["uniform", "distance"],
}
gscv = GridSearchCV(KNeighborsClassifier(), parameters)
# No `_ = ...` here: a notebook cell only echoes its last expression, and
# assigning to `_` stops IPython from updating its last-output variable.
gscv.fit(X_train, y_train)
best_params = gscv.best_params_
best_params

# Build a bagging ensemble of 100 KNN classifiers, each configured with the
# hyperparameters selected by the grid search (`best_params`).
from sklearn.ensemble import BaggingClassifier

bagged_model = KNeighborsClassifier(**best_params)

# Fixed seed: bagging resamples the training data at random, so without it
# the measured accuracy differs from run to run and cannot be reproduced.
model = BaggingClassifier(bagged_model, n_estimators=100, random_state=42)
model.fit(X_train, y_train)

{'n_neighbors': 9, 'weights': 'uniform'}

In [62]:
# Predict on the train and test splits and report the accuracy of `model`.
# NOTE: `model` must already be fitted by one of the cells above; calling
# predict on an unfitted estimator raises NotFittedError.
import matplotlib.pyplot as plt
import numpy as np

# Comparing predictions with the labels marks each correct prediction;
# counting the truthy entries gives the number of hits (no `== True` needed).
train_preds = model.predict(X_train)
train_matches = train_preds == y_train
train_match_cnt = np.count_nonzero(train_matches)
train_cnt = len(train_matches)
train_accuracy = train_match_cnt / train_cnt

test_preds = model.predict(X_test)
test_matches = test_preds == y_test
test_match_cnt = np.count_nonzero(test_matches)
test_cnt = len(test_matches)
test_accuracy = test_match_cnt / test_cnt

# Print both lines explicitly: a cell only echoes its final expression, so a
# bare f-string for the train accuracy would be silently discarded.
# Formatting with `.2f` avoids float artifacts such as 93.85000000000001.
print(f"Train accuracy: {train_match_cnt}/{train_cnt} ({train_accuracy * 100:.2f} %)")
print(f"Test accuracy: {test_match_cnt}/{test_cnt} ({test_accuracy * 100:.2f} %)")

NotFittedError: This BaggingClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

## Comparison

GS = grid search.
GridSearchCV over neighbors and weights found the default weights (`uniform`) to be the best, so its scores match the neighbors-only search.
|     |k=3   |GS - k|GS - k, w|bagging|
|:---:|:---: |:---: |:---:    |:---:  |
|train|94,29%|93,85%|93,85%   |       |
|test |93,98%|93,86%|93,86%   |       |