In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
import pickle

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import log_loss

from sklearn.model_selection import GridSearchCV

<h2>3000</h2>

In [27]:
X = pd.read_csv('conc_3000.csv', index_col=[0]).sample(frac=1).reset_index(drop=True)
Y = X.pop('LABEL').values
X_train, X_test, Y_train, Y_test = train_test_split(X.values, Y, test_size=0.2)

In [28]:
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train,Y_train)

Y_pred = knn.predict(X_test)

print('Accuracy %f' % knn.score(X_test,Y_test))
PRF = precision_recall_fscore_support(Y_test, Y_pred, average='macro')
print('Precision: %.2f' % PRF[0])
print('Recall: %.2f' % PRF[1])
print('F1: %.2f' % PRF[2])

print('LOGARITHMIC LOSS')
print('Logarithmic loss: %.2f' % log_loss(Y_test, Y_pred,normalize=True))

Accuracy 0.884167
Precision: 0.89
Recall: 0.89
F1: 0.88
LOGARITHMIC LOSS
Logarithmic loss: 4.00


In [29]:
# save the model to disk
filename = 'knn_3000.sav'
pickle.dump(knn, open(filename, 'wb'))

<h2>100</h2>

In [31]:
X = pd.read_csv('conc_100.csv', index_col=[0]).sample(frac=1).reset_index(drop=True)
Y = X.pop('LABEL').values
X_train, X_test, Y_train, Y_test = train_test_split(X.values, Y, test_size=0.2)

In [34]:
print('k-parameter tuning...')
param_grid = {'n_neighbors':np.arange(2,50)}
knn = KNeighborsClassifier()
knn_cv= GridSearchCV(knn,param_grid,cv=5)
knn_cv.fit(X_train,Y_train)
print('Best achieved score: %f' % knn_cv.best_score_)
print('k-parameter value: %i' % knn_cv.best_params_['n_neighbors'])

k-parameter tuning...
Best achieved score: 0.781250
k-parameter value: 2




In [15]:
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train,Y_train)

Y_pred = knn.predict(X_test)

print('Accuracy %f' % knn.score(X_test,Y_test))
PRF = precision_recall_fscore_support(Y_test, Y_pred, average='macro')
print('Precision: %.2f' % PRF[0])
print('Recall: %.2f' % PRF[1])
print('F1: %.2f' % PRF[2])

print('LOGARITHMIC LOSS')
print('Logarithmic loss: %.2f' % log_loss(Y_test, Y_pred,normalize=True))

Accuracy 0.775000
Precision: 0.77
Recall: 0.77
F1: 0.77
LOGARITHMIC LOSS
Logarithmic loss: 7.77


In [16]:
# save the model to disk
filename = 'knn_100.sav'
pickle.dump(knn, open(filename, 'wb'))

<h2>500</h2>

In [17]:
X = pd.read_csv('conc_500.csv', index_col=[0]).sample(frac=1).reset_index(drop=True)
Y = X.pop('LABEL').values
X_train, X_test, Y_train, Y_test = train_test_split(X.values, Y, test_size=0.2)

In [18]:
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train,Y_train)

Y_pred = knn.predict(X_test)

print('Accuracy %f' % knn.score(X_test,Y_test))
PRF = precision_recall_fscore_support(Y_test, Y_pred, average='macro')
print('Precision: %.2f' % PRF[0])
print('Recall: %.2f' % PRF[1])
print('F1: %.2f' % PRF[2])

print('LOGARITHMIC LOSS')
print('Logarithmic loss: %.2f' % log_loss(Y_test, Y_pred,normalize=True))

Accuracy 0.865000
Precision: 0.87
Recall: 0.87
F1: 0.86
LOGARITHMIC LOSS
Logarithmic loss: 4.66


In [19]:
# save the model to disk
filename = 'knn_500.sav'
pickle.dump(knn, open(filename, 'wb'))

<h2>1000</h2>

In [21]:
X = pd.read_csv('conc_1000.csv', index_col=[0]).sample(frac=1).reset_index(drop=True)
Y = X.pop('LABEL').values
X_train, X_test, Y_train, Y_test = train_test_split(X.values, Y, test_size=0.2)

In [22]:
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train,Y_train)

Y_pred = knn.predict(X_test)

print('Accuracy %f' % knn.score(X_test,Y_test))
PRF = precision_recall_fscore_support(Y_test, Y_pred, average='macro')
print('Precision: %.2f' % PRF[0])
print('Recall: %.2f' % PRF[1])
print('F1: %.2f' % PRF[2])

print('LOGARITHMIC LOSS')
print('Logarithmic loss: %.2f' % log_loss(Y_test, Y_pred,normalize=True))

Accuracy 0.875000
Precision: 0.88
Recall: 0.87
F1: 0.87
LOGARITHMIC LOSS
Logarithmic loss: 4.32


In [23]:
# save the model to disk
filename = 'knn_1000.sav'
pickle.dump(knn, open(filename, 'wb'))

<h2>2000</h2>

In [24]:
X = pd.read_csv('conc_2000.csv', index_col=[0]).sample(frac=1).reset_index(drop=True)
Y = X.pop('LABEL').values
X_train, X_test, Y_train, Y_test = train_test_split(X.values, Y, test_size=0.2)

In [25]:
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train,Y_train)

Y_pred = knn.predict(X_test)

print('Accuracy %f' % knn.score(X_test,Y_test))
PRF = precision_recall_fscore_support(Y_test, Y_pred, average='macro')
print('Precision: %.2f' % PRF[0])
print('Recall: %.2f' % PRF[1])
print('F1: %.2f' % PRF[2])

print('LOGARITHMIC LOSS')
print('Logarithmic loss: %.2f' % log_loss(Y_test, Y_pred,normalize=True))

Accuracy 0.885000
Precision: 0.89
Recall: 0.89
F1: 0.88
LOGARITHMIC LOSS
Logarithmic loss: 3.97


In [26]:
# save the model to disk
filename = 'knn_2000.sav'
pickle.dump(knn, open(filename, 'wb'))