### Check Version of the Relevant Libraries

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# Overfitting the test set

- Import `breast_cancer` dataset from Scikitlearn
- Scale the feature set
- Split data into first trainval and test, then split trainval into validation and train datasets
- Fit train dataset for `k=5` neighbors
- print the score on validation and test datasets

In [13]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.neighbors import KNeighborsClassifier

data = load_breast_cancer()
X = data.data
y = data.target

# is the data balanced?
print(np.unique(y, return_counts = True))

# Scale the feature set
X_scaled = scale(X)

# Split into trainval and test
X_trainval, X_test, y_trainval, y_test = train_test_split(X_scaled, y, stratify = y, random_state=0) # ensures the ratio of 0s and 1s in target is the SAME in both train and test dataset. Even though data quite balanced here worth always stratifying for classifcation problems

#split trainval into train and validation
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, stratify = y_trainval, random_state=0)

X_trainval.shape, X_train.shape, X_val.shape, X_test.shape, X.shape

# fit the train dataset for k=5 neighbors
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train, y_train)
print('Validation score is:', knn.score(X_val, y_val))
print('Test score is: ', knn.score(X_test, y_test))

(array([0, 1]), array([212, 357]))
Validation score is: 0.9719626168224299
Test score is:  0.958041958041958


- Create a loop and fit your train dataset and test your validation dataset for `k=1,15,2`.
- Print the best validation score out of three neighbors and determine the `k` value with the highest score, $best_n$
- Retrain your model with trainvalidation dataset for $k=best_n$
- Print the score for test dataset for the latest trained model

In [25]:
val_scores = []
nset = np.arange(1,15,2)
for n in nset:
  knn = KNeighborsClassifier(n_neighbors=n)
  knn.fit(X_train, y_train)
  val_scores.append(knn.score(X_val, y_val))

print(val_scores)
# Find the index of the highest validation score
best_index = np.argmax(val_scores)
print(best_index)

# Step 2: Get the corresponding n_neighbors value from nset
best_n = nset[best_index]
print(f'The best number of neighbors is {best_n}')

[0.9532710280373832, 0.9906542056074766, 0.9719626168224299, 0.9719626168224299, 0.9626168224299065, 0.9719626168224299, 0.9626168224299065]
1
The best number of neighbors is 3


In [27]:
# retrain model with trainval dataset for k = Bestn. For optimised model, it is best to train on all the training dataset  
knn_best = KNeighborsClassifier(n_neighbors = best_n)
knn_best.fit(X_trainval, y_trainval)
print(f'Optimised hyperparameter model performance is {knn_best.score(X_test, y_test)}')

Optimised hyperparameter model performance is 0.951048951048951


# Cross validation

- Import `cross_val_score` from `sklearn.model_selection`
- Split your data into train and test datasets
- For `neighbors=1, 15, 2`, compute cross validation scores for trainvalidation dataset with kfold=10.
- Print the best validation score out of three neighbors and determine the `k` value with the highest score, $best_n$
- Retrain your model with trainvalidation dataset for $k=best_n$
- Print the score for test dataset for the latest trained model

In [35]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state=0)

nset = np.arange(1,15,2)
cross_val_scores = []

for n in nset:
    print(f'n_neighbors: {n}')
    knn = KNeighborsClassifier(n_neighbors = n)
    scores = cross_val_score(knn, X_train, y_train, cv = 5) # Take your model and then train it 5 times on 5 different splits of the data
    print(scores)
    cross_val_scores.append(np.mean(scores))

print(cross_val_scores)

#best cross validation score value
best_cross = np.max(cross_val_scores)
# best cross validation score index in nrange(1,15,2)
best_index = np.argmax(best_cross)
print(best_index)
#best n_neighbors
best_neighbors = nset[best_index]

print(f'Best cross validation score is: {np.max(cross_val_scores)}')
print(f'Best n_neighbors in cross validation is: {best_neighbors}')

n_neighbors: 1
[0.95348837 0.92941176 0.90588235 0.94117647 0.90588235]
n_neighbors: 3
[0.94186047 0.95294118 0.89411765 0.96470588 0.91764706]
n_neighbors: 5
[0.94186047 0.96470588 0.90588235 0.95294118 0.92941176]
n_neighbors: 7
[0.94186047 0.96470588 0.90588235 0.95294118 0.92941176]
n_neighbors: 9
[0.94186047 0.95294118 0.91764706 0.96470588 0.92941176]
n_neighbors: 11
[0.94186047 0.94117647 0.90588235 0.96470588 0.92941176]
n_neighbors: 13
[0.94186047 0.95294118 0.90588235 0.95294118 0.92941176]
[np.float64(0.9271682626538988), np.float64(0.9342544459644323), np.float64(0.9389603283173734), np.float64(0.9389603283173734), np.float64(0.941313269493844), np.float64(0.9366073871409029), np.float64(0.9366073871409029)]
0
Best cross validation score is: 0.941313269493844
Best n_neighbors in cross validation is: 1


- Compute cross validation score for `KNeighborsClassifier()` when `cv=KFold(n_splits=5)` and `cv=StratifiedKFold(n_splits=5, shuffle=True)`

In [None]:
np.set_printoptions(precision=2)

# GridSearchCV

- Import `GridSearchCV` from `sklearn.model_selection`
- Split your data into train and test datasets
- For `neighbors=1 to 30`, compute `GridSearchCV` for train dataset with kfold=10.
- Print the best cross validation score
- Priont the best parameter
- Print the test score

In [40]:
from sklearn.model_selection import GridSearchCV
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state=0)
param_grid = {'n_neighbors': np.arange(1,31)}
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid, cv = StratifiedKFold(n_splits = 10), return_train_score=True) #return training data score as well as validation data scores
grid_search.fit(X_train, y_train)

In [42]:
print(grid_search.best_score_)
print(grid_search.best_params_)

0.9436323366555925
{'n_neighbors': np.int64(5)}
