In [1]:
import sys
sys.path.append('./models/')
import warnings
warnings.simplefilter("ignore")

import numpy as np
import pandas as pd
from SVM import SVM
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, accuracy_score, roc_auc_score

In [2]:
def cross_val_scoring(model, X, y):
    acc_scores = []
    prec_scores = []
    roc_scores = []
    
    kf = KFold(n_splits=5)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc_score = accuracy_score(y_test, y_pred)
        prec_score = precision_score(y_test, y_pred, pos_label='positive', average='micro')
        roc_score = roc_auc_score(pd.get_dummies(y_test), pd.get_dummies(y_pred), multi_class='ovr')
        acc_scores.append(acc_score)
        prec_scores.append(prec_score)
        roc_scores.append(roc_score)

    score = np.mean(acc_scores), np.mean(prec_scores), np.mean(roc_scores)
    return score

In [3]:
def print_error_validation(accuracy, precission, roc_auc):
    print('Ошибки на кросс валидации:')
    print(f'Accuracy  = {accuracy}')
    print(f'Precision = {precision}')
    print(f'Roc auc   = {roc_auc}')
    
def retrain(model, x_train, x_test, y_train, y_test):
    model.fit(x_train, y_train)
    
    y_pred_test = model.predict(x_test)
    y_pred_train = model.predict(x_train)
    
    acc = accuracy_score(y_pred_train, y_train), accuracy_score(y_pred_test, y_test)
    
    prec = precision_score(y_pred_train, y_train, pos_label='positive', average='micro'), precision_score(y_pred_test, y_test, pos_label='positive', average='micro')
    
    roc = roc_auc_score(pd.get_dummies(y_train), pd.get_dummies(y_pred_train), multi_class='ovr'), roc_auc_score(pd.get_dummies(y_test), pd.get_dummies(y_pred_test), multi_class='ovr')
    
    print('Ошибки на выборках')
    print('            Train                   Test')
    print(f'Accuracy  = {acc[0]}  |  {acc[1]}')
    print(f'Precision = {prec[0]}  |  {prec[1]}')
    print(f'Roc auc   = {roc[0]}  |  {roc[1]}')
    
    
def search_parametrs_C(X, Y, C_list):
    best_C = None
    best_roc = -10
    
    for C in C_list:
        model = SVM(C=C)
        accuracy, precision, roc_auc = cross_val_scoring(model, X.values, Y.values)
        if roc_auc > best_roc:
            best_C = C
    return best_C

# Data splitting

In [4]:
mobile_data = pd.read_csv('models/clearDataset.csv')
X, Y = mobile_data.drop(['price_range'], axis=1), mobile_data['price_range']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

# SVM Classificator implementation

### Searching for parameters

In [5]:
C = search_parametrs_C(X, Y, C_list=np.arange(1, 10, 1))
print(f'Best C = {2}')

Best C = 2


In [6]:
model = SVM(C=2)
accuracy, precision, roc_auc = cross_val_scoring(model, X.values, Y.values)
print_error_validation(accuracy, precision, roc_auc)

Ошибки на кросс валидации:
Accuracy  = 0.5548872180451128
Precision = 0.5548872180451128
Roc auc   = 0.6726917825855202


In [7]:
retrain(model, x_train.values, x_test.values, y_train.values, y_test.values)

Ошибки на выборках
            Train                   Test
Accuracy  = 0.4588323353293413  |  0.44764795144157815
Precision = 0.4588323353293413  |  0.44764795144157815
Roc auc   = 0.6191507423349809  |  0.6112348013912755


# SVM with Sklearn

In [8]:
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

### Searching for parameters

In [9]:
clf = svm.SVC(decision_function_shape = 'ovr')
parameters = {'C':np.arange(1, 30, 5)}
clf_gs = GridSearchCV(clf, parameters)

In [10]:
clf_gs.fit(X, Y)

GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': array([ 1,  6, 11, 16, 21, 26])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [11]:
clf_gs.best_estimator_

SVC(C=21, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [12]:
retrain(clf, x_train.values, x_test.values, y_train.values, y_test.values)

Ошибки на выборках
            Train                   Test
Accuracy  = 0.9468562874251497  |  0.9559939301972686
Precision = 0.9468562874251497  |  0.9559939301972686
Roc auc   = 0.9645010781616375  |  0.9707807420761585
