In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
import pickle

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import log_loss

from sklearn.model_selection import GridSearchCV

import time

<h2>3000</h2>

In [31]:
X = pd.read_csv('conc_3000.csv', index_col=[0]).sample(frac=1).reset_index(drop=True)
Y = X.pop('LABEL').values
X_train, X_test, Y_train, Y_test = train_test_split(X.values, Y, test_size=0.2)

In [32]:
knn = KNeighborsClassifier(n_neighbors=2)
start = time.time()
knn.fit(X_train,Y_train)
end = time.time()
Y_pred = knn.predict(X_test)

print('Accuracy %.4f' % knn.score(X_test,Y_test))
PRF = precision_recall_fscore_support(Y_test, Y_pred, average='macro')
print('Precision: %.4f' % PRF[0])
print('Recall: %.4f' % PRF[1])
print('F1: %.4f' % PRF[2])

print(end - start)

Accuracy 0.8592
Precision: 0.8681
Recall: 0.8589
F1: 0.8582
0.17478632926940918


In [29]:
# save the model to disk
filename = 'knn_3000.sav'
pickle.dump(knn, open(filename, 'wb'))

<h2>100</h2>

In [34]:
X = pd.read_csv('conc_100.csv', index_col=[0]).sample(frac=1).reset_index(drop=True)
Y = X.pop('LABEL').values
X_train, X_test, Y_train, Y_test = train_test_split(X.values, Y, test_size=0.2)

In [34]:
print('k-parameter tuning...')
param_grid = {'n_neighbors':np.arange(2,50)}
knn = KNeighborsClassifier()
knn_cv= GridSearchCV(knn,param_grid,cv=5)
knn_cv.fit(X_train,Y_train)
print('Best achieved score: %f' % knn_cv.best_score_)
print('k-parameter value: %i' % knn_cv.best_params_['n_neighbors'])

k-parameter tuning...
Best achieved score: 0.781250
k-parameter value: 2




In [35]:
knn = KNeighborsClassifier(n_neighbors=2)
start = time.time()
knn.fit(X_train,Y_train)
end = time.time()

Y_pred = knn.predict(X_test)

print('Accuracy %.4f' % knn.score(X_test,Y_test))
PRF = precision_recall_fscore_support(Y_test, Y_pred, average='macro')
print('Precision: %.4f' % PRF[0])
print('Recall: %.4f' % PRF[1])
print('F1: %.4f' % PRF[2])
print(end - start)

Accuracy 0.8250
Precision: 0.8200
Recall: 0.8125
F1: 0.8157
0.003041505813598633


In [16]:
# save the model to disk
filename = 'knn_100.sav'
pickle.dump(knn, open(filename, 'wb'))

<h2>500</h2>

In [36]:
X = pd.read_csv('conc_500.csv', index_col=[0]).sample(frac=1).reset_index(drop=True)
Y = X.pop('LABEL').values
X_train, X_test, Y_train, Y_test = train_test_split(X.values, Y, test_size=0.2)

In [37]:
knn = KNeighborsClassifier(n_neighbors=2)
start = time.time()
knn.fit(X_train,Y_train)
end = time.time()

Y_pred = knn.predict(X_test)

print('Accuracy %.4f' % knn.score(X_test,Y_test))
PRF = precision_recall_fscore_support(Y_test, Y_pred, average='macro')
print('Precision: %.4f' % PRF[0])
print('Recall: %.4f' % PRF[1])
print('F1: %.4f' % PRF[2])
print(end - start)

Accuracy 0.8500
Precision: 0.8588
Recall: 0.8549
F1: 0.8499
0.01675868034362793


In [19]:
# save the model to disk
filename = 'knn_500.sav'
pickle.dump(knn, open(filename, 'wb'))

<h2>1000</h2>

In [22]:
X = pd.read_csv('conc_1000.csv', index_col=[0]).sample(frac=1).reset_index(drop=True)
Y = X.pop('LABEL').values
X_train, X_test, Y_train, Y_test = train_test_split(X.values, Y, test_size=0.2)

In [38]:
knn = KNeighborsClassifier(n_neighbors=2)
start = time.time()
knn.fit(X_train,Y_train)
end = time.time()

Y_pred = knn.predict(X_test)

print('Accuracy %.4f' % knn.score(X_test,Y_test))
PRF = precision_recall_fscore_support(Y_test, Y_pred, average='macro')
print('Precision: %.4f' % PRF[0])
print('Recall: %.4f' % PRF[1])
print('F1: %.4f' % PRF[2])

print('LOGARITHMIC LOSS')
print('Logarithmic loss: %.2f' % log_loss(Y_test, Y_pred,normalize=True))
print(end - start)

Accuracy 0.8500
Precision: 0.8588
Recall: 0.8549
F1: 0.8499
LOGARITHMIC LOSS
Logarithmic loss: 5.18
0.018171310424804688


In [23]:
# save the model to disk
filename = 'knn_1000.sav'
pickle.dump(knn, open(filename, 'wb'))

<h2>2000</h2>

In [39]:
X = pd.read_csv('conc_2000.csv', index_col=[0]).sample(frac=1).reset_index(drop=True)
Y = X.pop('LABEL').values
X_train, X_test, Y_train, Y_test = train_test_split(X.values, Y, test_size=0.2)

In [40]:
knn = KNeighborsClassifier(n_neighbors=2)
start = time.time()
knn.fit(X_train,Y_train)
end = time.time()

Y_pred = knn.predict(X_test)

print('Accuracy %.4f' % knn.score(X_test,Y_test))
PRF = precision_recall_fscore_support(Y_test, Y_pred, average='macro')
print('Precision: %.4f' % PRF[0])
print('Recall: %.4f' % PRF[1])
print('F1: %.4f' % PRF[2])
print(end - start)

Accuracy 0.8588
Precision: 0.8586
Recall: 0.8616
F1: 0.8584
0.12832021713256836


In [26]:
# save the model to disk
filename = 'knn_2000.sav'
pickle.dump(knn, open(filename, 'wb'))

In [41]:
X = pd.read_csv('restricted_conc_3000.csv', index_col=[0]).sample(frac=1).reset_index(drop=True)
Y = X.pop('LABEL').values
X_train, X_test, Y_train, Y_test = train_test_split(X.values, Y, test_size=0.2)
knn = KNeighborsClassifier(n_neighbors=2)
start = time.time()
knn.fit(X_train,Y_train)
end = time.time()

Y_pred = knn.predict(X_test)

print('Accuracy %.4f' % knn.score(X_test,Y_test))
PRF = precision_recall_fscore_support(Y_test, Y_pred, average='macro')
print('Precision: %.4f' % PRF[0])
print('Recall: %.4f' % PRF[1])
print('F1: %.4f' % PRF[2])
print(end - start)

Accuracy 0.4833
Precision: 0.6779
Recall: 0.5047
F1: 0.3355
0.11349320411682129
