In [24]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix

In [6]:
df = pd.read_csv('data/processed/mushrooms_pca.csv')
y = df['class']
X = df.drop(columns=['class']).values
X.view()

array([[ 0.24373845,  0.16963259,  0.18727096, ..., -0.2621439 ,
        -0.04811481,  0.10087257],
       [ 1.5713711 , -0.39656834,  0.45390158, ..., -0.11453773,
         0.41043087, -0.28486641],
       [ 0.01171017,  0.71284725, -0.7501287 , ...,  0.00313137,
        -0.25922659, -0.06733154],
       ...,
       [-0.89697748, -0.1697835 ,  0.3055329 , ...,  0.53583814,
         0.06813399, -0.06614521],
       [-1.88993255, -0.39236622, -0.58529202, ...,  0.1682014 ,
         0.37251319, -0.41726506],
       [ 0.66668421,  0.4509778 , -0.09989701, ..., -0.18299176,
        -0.0747118 , -0.36911983]])

## Grid Search

In [8]:
from sklearn.model_selection import GridSearchCV

In [12]:
parameters = {'n_neighbors': range(5,51,4)}
knn = KNeighborsClassifier()
clf = GridSearchCV(knn, parameters, scoring='f1', cv=10, n_jobs=3)
clf.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=3,
       param_grid={'n_neighbors': range(5, 51, 4)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1', verbose=0)

In [13]:
print(clf.best_estimator_)
print(clf.best_score_)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=25, p=2,
           weights='uniform')
0.840839337597152


In [11]:
X_train, X_test, y_train, y_test = \
        train_test_split(X, y.ravel(), test_size=.1, random_state=42)

In [14]:
import pickle
pickle.dump(clf, open("models/knn-high-res.p", "wb" ) )

In [16]:
results = pd.DataFrame(clf.cv_results_)
print(results.head())

   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       0.069572      0.005172         1.663486        0.121639   
1       0.066175      0.005642         1.572018        0.055929   
2       0.075498      0.018433         1.665307        0.112641   
3       0.065529      0.002129         1.563478        0.059176   
4       0.063630      0.004631         1.535010        0.008876   

  param_n_neighbors               params  split0_test_score  \
0                 5   {'n_neighbors': 5}           0.794760   
1                 9   {'n_neighbors': 9}           0.797080   
2                13  {'n_neighbors': 13}           0.800582   
3                17  {'n_neighbors': 17}           0.797654   
4                21  {'n_neighbors': 21}           0.808955   

   split1_test_score  split2_test_score  split3_test_score       ...         \
0           0.813953           0.836676           0.819103       ...          
1           0.809524           0.851312           0.840580  



In [21]:
neighbors = list(map(lambda x: x['n_neighbors'] ,results['params']))
f1scores = results['mean_test_score']
print(f1scores)
plt.plot(neighbors, f1scores)
plt.xlabel('number of neighbors')
plt.ylabel('f1 score')
plt.title('k-NN performance')
plt.savefig("C:/Users/user/Desktop/kto_ma_szkole_ten_ma_lzej/Erasm/mushrooms/reports/figures/knn-high-res.png")
plt.show()

0     0.816860
1     0.823958
2     0.833968
3     0.834367
4     0.837949
5     0.840839
6     0.837531
7     0.838565
8     0.838662
9     0.835975
10    0.832866
11    0.833785
Name: mean_test_score, dtype: float64


In [28]:
y_true, y_pred = y_test, clf.predict(X_test)
confusion_matrix(y_true, y_pred.ravel())


TypeError: 'numpy.ndarray' object is not callable