In [1]:
import sys
sys.path.append('./models/')
import warnings
warnings.simplefilter("ignore")

import numpy as np
import pandas as pd
from KNN import KNN
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, accuracy_score, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier

In [2]:
def print_error_validation(accuracy, precission, roc_auc):
    """Print the three validation metrics, one per line, under a header.

    Args:
        accuracy: Accuracy value to display.
        precission: Precision value to display. The misspelled parameter name
            is kept so that existing callers are not broken.
        roc_auc: ROC AUC value to display.

    Returns:
        None. The values are written to standard output.
    """
    print('Ошибки на кросс валидации:')
    print(f'Accuracy  = {accuracy}')
    # Print the argument itself. The previous body referenced `precision`,
    # which is not a parameter, so it silently read a notebook-level variable
    # (or raised NameError when none existed).
    print(f'Precision = {precission}')
    print(f'Roc auc   = {roc_auc}')
    
def retrain(model, x_train, x_test, y_train, y_test):
    """Fit ``model`` on the training split and print train/test metrics.

    The training labels are appended to the training features as a last
    column, and that stacked matrix is what is passed to ``model.fit`` and,
    as the first argument, to ``model.predict``.

    NOTE(review): train predictions are requested for the stacked matrix
    (features + label column) while test predictions use the plain features;
    confirm that the model's ``predict`` handles both layouts as intended.

    Args:
        model: Object exposing ``fit(train_matrix, labels)`` and
            ``predict(train_matrix, query_points)``.
        x_train: Training features (array-like, converted with ``np.array``).
        x_test: Test features, passed to ``model.predict`` unchanged.
        y_train: Training labels (array-like, converted with ``np.array``).
        y_test: Test labels.

    Returns:
        None. Accuracy, micro-averaged precision and ROC AUC for both splits
        are printed.
    """

    def _scores(y_true, y_pred):
        """Return (accuracy, micro precision, ROC AUC) for one split."""
        # One-hot encode both vectors over the SAME class columns. Encoding
        # them independently breaks (or silently misaligns) the ROC AUC input
        # whenever some class is never predicted.
        true_onehot = pd.get_dummies(y_true).astype(int)
        pred_onehot = (
            pd.get_dummies(y_pred)
            .reindex(columns=true_onehot.columns, fill_value=0)
            .astype(int)
        )
        # scikit-learn metrics take (y_true, y_pred) in that order.
        # `pos_label` is omitted: it only applies to average='binary'.
        return (
            accuracy_score(y_true, y_pred),
            precision_score(y_true, y_pred, average='micro'),
            roc_auc_score(true_onehot, pred_onehot, multi_class='ovr'),
        )

    x_train = np.array(x_train)
    y_train = np.array(y_train)
    x_train = np.column_stack((x_train, y_train))

    model.fit(x_train, y_train)

    y_pred_test = model.predict(x_train, x_test)
    y_pred_train = model.predict(x_train, x_train)

    train_scores = _scores(y_train, y_pred_train)
    test_scores = _scores(y_test, y_pred_test)

    print('Ошибки на выборках')
    print('            Train                  Test')
    print(f'Accuracy  = {train_scores[0]}  |  {test_scores[0]}')
    print(f'Precision = {train_scores[1]}  |  {test_scores[1]}')
    print(f'Roc auc   = {train_scores[2]}  |  {test_scores[2]}')
    
def sklearn_retrain(model, x_train, x_test, y_train, y_test):
    """Fit a scikit-learn style estimator and print train/test metrics.

    Args:
        model: Object exposing ``fit(X, y)`` and ``predict(X)``.
        x_train: Training features.
        x_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        None. Accuracy, micro-averaged precision and ROC AUC for both splits
        are printed.
    """

    def _scores(y_true, y_pred):
        """Return (accuracy, micro precision, ROC AUC) for one split."""
        # One-hot encode both vectors over the SAME class columns. Encoding
        # them independently breaks (or silently misaligns) the ROC AUC input
        # whenever some class is never predicted.
        true_onehot = pd.get_dummies(y_true).astype(int)
        pred_onehot = (
            pd.get_dummies(y_pred)
            .reindex(columns=true_onehot.columns, fill_value=0)
            .astype(int)
        )
        # scikit-learn metrics take (y_true, y_pred) in that order.
        # `pos_label` is omitted: it only applies to average='binary'.
        return (
            accuracy_score(y_true, y_pred),
            precision_score(y_true, y_pred, average='micro'),
            roc_auc_score(true_onehot, pred_onehot, multi_class='ovr'),
        )

    model.fit(x_train, y_train)

    y_pred_test = model.predict(x_test)
    y_pred_train = model.predict(x_train)

    train_scores = _scores(y_train, y_pred_train)
    test_scores = _scores(y_test, y_pred_test)

    print('Ошибки на выборках')
    print('            Train                  Test')
    print(f'Accuracy  = {train_scores[0]}  |  {test_scores[0]}')
    print(f'Precision = {train_scores[1]}  |  {test_scores[1]}')
    print(f'Roc auc   = {train_scores[2]}  |  {test_scores[2]}')
    
def search_n_neighbors(x_train, y_train, x_test, y_test, n_neighbors_list):
    """Return the neighbour count that yields the highest ROC AUC.

    For every candidate a fresh ``KNN`` model is built and scored with this
    notebook's ``cross_val_score`` helper; the candidate with the strictly
    highest ROC AUC wins (the first one wins on ties).

    Args:
        x_train: Training features; must expose ``.values``.
        y_train: Training labels; must expose ``.values``.
        x_test: Evaluation features; must expose ``.values``.
        y_test: Evaluation labels; must expose ``.values``.
        n_neighbors_list: Iterable of candidate neighbour counts.

    Returns:
        The best candidate from ``n_neighbors_list``, or ``None`` when the
        iterable is empty.
    """
    best_n = None
    best_roc = float('-inf')
    for n_n in n_neighbors_list:
        # NOTE(review): the second KNN argument is presumably the number of
        # CV folds; confirm against models/KNN.
        model = KNN(n_n, 5)
        _, _, roc_auc = cross_val_score(
            model, x_train.values, y_train.values, x_test.values, y_test.values
        )
        if roc_auc > best_roc:
            best_n = n_n
            # Remember the winning score; without this every later candidate
            # was compared against the initial sentinel only.
            best_roc = roc_auc
    # The original fell off the end here and therefore always returned None.
    return best_n

In [3]:
def cross_val_score(model, x_train, y_train, x_test, y_test):
    """Fit ``model`` once and score its predictions on the evaluation split.

    Despite the name, this is not scikit-learn's ``cross_val_score``: it calls
    ``model.fit`` a single time, prints whatever scores ``fit`` returns
    (presumably per-fold CV accuracies; confirm against models/KNN) together
    with their mean, and then evaluates predictions for ``x_test``.

    The training labels are appended to the training features as a last
    column, and that stacked matrix is what is passed to ``model.fit`` and,
    as the first argument, to ``model.predict``.

    Args:
        model: Object exposing ``fit(train_matrix, labels)`` that returns a
            non-empty sequence of numbers, and
            ``predict(train_matrix, query_points)``.
        x_train: Training features (array-like).
        y_train: Training labels (array-like).
        x_test: Evaluation features.
        y_test: Evaluation labels.

    Returns:
        Tuple ``(accuracy, precision, roc_auc)`` of ``numpy.float64`` values
        computed on ``x_test`` / ``y_test``; precision is micro-averaged.
    """
    x_train = np.array(x_train)
    y_train = np.array(y_train)
    x_train = np.column_stack((x_train, y_train))

    sc = model.fit(x_train, y_train)

    print(f'CV scores: {sc}')
    mn = sum(sc) / float(len(sc))
    print(f'Train data accuracy: {mn}')

    y_pred = model.predict(x_train, x_test)

    # One-hot encode labels and predictions over the SAME class columns.
    # Encoding them independently breaks (or silently misaligns) the ROC AUC
    # input whenever some class is never predicted.
    true_onehot = pd.get_dummies(y_test).astype(int)
    pred_onehot = (
        pd.get_dummies(y_pred)
        .reindex(columns=true_onehot.columns, fill_value=0)
        .astype(int)
    )

    # `pos_label` is omitted: it only applies to average='binary'.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='micro')
    roc_auc = roc_auc_score(true_onehot, pred_onehot, multi_class='ovr')

    # Kept as numpy scalars, matching what np.mean over the former
    # single-element lists produced.
    return np.float64(accuracy), np.float64(precision), np.float64(roc_auc)

# Data splitting

In [4]:
# Read the dataset and separate the `price_range` target from the features.
mobile_data = pd.read_csv('models/clearDataset.csv')
Y = mobile_data['price_range']
X = mobile_data.drop(columns=['price_range'])

# Hold out 33% of the rows for testing; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=42,
)

# KNN Classifier implementation

### Searching for parameters

In [5]:
# Candidate neighbour counts are the odd values 3, 5, 7, 9 and 11.
n_neighbors = search_n_neighbors(
    x_train,
    y_train,
    x_test,
    y_test,
    n_neighbors_list=np.arange(3, 12, 2),
)
print(f'Best n: {n_neighbors}')

Best n: 5

### Evaluating model

In [6]:
# Build the custom KNN with 5 neighbours and score it on the held-out split.
model = KNN(5)
accuracy, precision, roc_auc = cross_val_score(
    model,
    x_train.values,
    y_train.values,
    x_test.values,
    y_test.values,
)
print_error_validation(accuracy, precision, roc_auc)
# The custom model is fitted and queried inside cross_val_score, which passes
# the stacked training matrix to both fit() and predict().

CV scores: [0.8951310861423221, 0.9026217228464419, 0.9213483146067416, 0.9213483146067416, 0.947565543071161]
Train data accuracy: 0.9176029962546817
Ошибки на кросс валидации:
Accuracy  = 0.9241274658573596
Precision = 0.9241274658573596
Roc auc   = 0.9495108095293339


In [7]:
# Refit the custom KNN and compare its metrics on the train and test splits.
retrain(
    model,
    x_train=x_train.values,
    x_test=x_test.values,
    y_train=y_train.values,
    y_test=y_test.values,
)

Ошибки на выборках
            Train                  Test
Accuracy  = 0.9520958083832335  |  0.9241274658573596
Precision = 0.9520958083832335  |  0.9241274658573596
Roc auc   = 0.9680700989877046  |  0.9495108095293339


# KNN with Sklearn

In [8]:
# Baseline: scikit-learn's k-NN classifier with 5 neighbours.
# KNeighborsClassifier is imported in the notebook's import cell at the top,
# so every dependency is visible in one place.
neigh = KNeighborsClassifier(n_neighbors=5)

neigh.fit(x_train, y_train)
y_pred = neigh.predict(x_test)

print(f'Ошибка на выборках:\nAccuracy = {accuracy_score(y_test, y_pred)}')

Ошибка на выборках:
Accuracy = 0.9241274658573596


In [9]:
# Refit the scikit-learn baseline and compare train vs. test metrics.
sklearn_retrain(
    neigh,
    x_train=x_train.values,
    x_test=x_test.values,
    y_train=y_train.values,
    y_test=y_test.values,
)

Ошибки на выборках
            Train                  Test
Accuracy  = 0.9520958083832335  |  0.9241274658573596
Precision = 0.9520958083832335  |  0.9241274658573596
Roc auc   = 0.9680700989877046  |  0.9495108095293339
