In [1]:
import sys
sys.path.append('./models/')
import warnings
warnings.simplefilter("ignore")

import numpy as np
import pandas as pd
from logRegression import LogRegression

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, accuracy_score, roc_auc_score

In [2]:
def cross_val_score(model, X, y, n_splits=5):
    """Evaluate ``model`` with K-fold cross-validation.

    For every fold the model is re-fitted on the training part and scored on
    the held-out part; the per-fold scores are then averaged.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is re-fitted
        in place on every fold.
    X, y : array-like supporting integer-array indexing
        Features and class labels, indexed as ``X[index_array]``.
    n_splits : int, default 5
        Number of consecutive (unshuffled) folds passed to ``KFold``.

    Returns
    -------
    tuple of float
        ``(mean accuracy, mean micro-averaged precision, mean ROC AUC)``.
        ROC AUC is computed from one-hot encoded hard predictions, not from
        predicted probabilities.
    """
    acc_scores = []
    prec_scores = []
    roc_scores = []

    for train_index, test_index in KFold(n_splits=n_splits).split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        acc_scores.append(accuracy_score(y_test, y_pred))
        # `pos_label` is ignored when average != 'binary', so it is not passed.
        prec_scores.append(precision_score(y_test, y_pred, average='micro'))

        # One-hot encode both vectors against the SAME class list. Encoding
        # them independently yields columns that differ in number or meaning
        # whenever the predictions omit a class that occurs in y_test.
        fold_classes = np.unique(y_test)
        y_test_onehot = pd.get_dummies(pd.Categorical(y_test, categories=fold_classes))
        y_pred_onehot = pd.get_dummies(pd.Categorical(y_pred, categories=fold_classes))
        roc_scores.append(roc_auc_score(y_test_onehot, y_pred_onehot, multi_class='ovr'))

    return np.mean(acc_scores), np.mean(prec_scores), np.mean(roc_scores)

# Data splitting

In [3]:
# Load the dataset; `price_range` is the target, every other column is a feature.
mobile_data = pd.read_csv('models/clearDataset.csv')
Y = mobile_data['price_range']
X = mobile_data.drop(columns=['price_range'])

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=42,
)

In [4]:
def print_error_validation(accuracy, precission, roc_auc):
    """Print the three cross-validation metrics, one per line.

    Parameters
    ----------
    accuracy : float
        Value printed on the ``Accuracy`` line.
    precission : float
        Value printed on the ``Precision`` line. The parameter keeps its
        original spelling so existing keyword callers continue to work.
    roc_auc : float
        Value printed on the ``Roc auc`` line.
    """
    print('Ошибки на кросс валидации:')
    print(f'Accuracy  = {accuracy}')
    # Use the argument itself; the body previously read a differently spelled
    # name (`precision`), i.e. a notebook global rather than the value passed in.
    print(f'Precision = {precission}')
    print(f'Roc auc   = {roc_auc}')
    
def retrain(model, x_train, x_test, y_train, y_test):
    """Fit ``model`` on the training split and print train/test metrics.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; fitted in place.
    x_train, x_test : array-like
        Feature matrices of the training and test splits.
    y_train, y_test : array-like
        True class labels of the training and test splits.

    Returns
    -------
    None
        Accuracy, micro-averaged precision and ROC AUC are printed as a
        two-column (train | test) table. ROC AUC is computed from one-hot
        encoded hard predictions, not from predicted probabilities.
    """
    model.fit(x_train, y_train)

    y_pred_test = model.predict(x_test)
    y_pred_train = model.predict(x_train)

    def one_hot_pair(y_true, y_pred):
        # Encode both vectors against the classes of y_true so the indicator
        # columns line up even when the predictions omit a class.
        classes = np.unique(y_true)
        return (
            pd.get_dummies(pd.Categorical(y_true, categories=classes)),
            pd.get_dummies(pd.Categorical(y_pred, categories=classes)),
        )

    # scikit-learn metrics take (y_true, y_pred) in that order.
    acc = (
        accuracy_score(y_train, y_pred_train),
        accuracy_score(y_test, y_pred_test),
    )

    # `pos_label` is ignored when average != 'binary', so it is not passed.
    prec = (
        precision_score(y_train, y_pred_train, average='micro'),
        precision_score(y_test, y_pred_test, average='micro'),
    )

    roc = (
        roc_auc_score(*one_hot_pair(y_train, y_pred_train), multi_class='ovr'),
        roc_auc_score(*one_hot_pair(y_test, y_pred_test), multi_class='ovr'),
    )

    print('Ошибки на выборках')
    print('            Train                   Test')
    print(f'Accuracy  = {acc[0]}  |  {acc[1]}')
    print(f'Precision = {prec[0]}  |  {prec[1]}')
    print(f'Roc auc   = {roc[0]}  |  {roc[1]}')
    
    
def search_parametrs_C(C_list):
    """Pick the ``C`` whose model gets the best cross-validated ROC AUC.

    Each candidate is used to build ``LogRegression(C=C)``, which is scored
    with ``cross_val_score`` on the notebook-level ``X`` / ``Y`` data.

    Parameters
    ----------
    C_list : iterable
        Candidate values for the ``C`` argument of ``LogRegression``.

    Returns
    -------
    The candidate with the highest mean ROC AUC (the first one on ties), or
    ``None`` when ``C_list`` is empty or no score compares greater than the
    initial bound.
    """
    best_C = None
    best_roc = float('-inf')

    for C in C_list:
        model = LogRegression(C=C)
        _, _, roc_auc = cross_val_score(model, X.values, Y.values)
        if roc_auc > best_roc:
            # Remember the score as well as the parameter; without this update
            # every candidate beats the initial bound and the last C always wins.
            best_roc = roc_auc
            best_C = C
    return best_C

# Logistic Regression implementation

# Without regularization

In [5]:
# Baseline model: the project's LogRegression built with its default constructor arguments.
model = LogRegression()

In [6]:
# Cross-validate the baseline model on the full dataset (passed as NumPy arrays)
# and print the averaged accuracy / precision / ROC AUC.
accuracy, precision, roc_auc = cross_val_score(model, X.values, Y.values)
print_error_validation(accuracy, precision, roc_auc)

Ошибки на кросс валидации:
Accuracy  = 0.4045112781954887
Precision = 0.4045112781954887
Roc auc   = 0.6272036634337573


In [7]:
# Refit the baseline model on the training split and print train vs. test metrics.
retrain(model, x_train.values, x_test.values, y_train.values, y_test.values)

Ошибки на выборках
            Train                   Test
Accuracy  = 0.5  |  0.4992412746585736
Precision = 0.5  |  0.4992412746585736
Roc auc   = 0.6720754157429131  |  0.672596230890465


# With regularization

### Searching for parameters

In [8]:
# Candidate grid: 0.00, 0.05, ..., 0.95 (np.arange excludes the stop value 1).
# Floating-point stepping means grid values may not be exact multiples of 0.05.
C = search_parametrs_C(C_list=np.arange(0,1,0.05))
print(f'Best parameter C = {C}')

Best parameter C = 0.9500000000000001


In [9]:
# Rebuild the model with the C value returned by the parameter search.
model = LogRegression(C=C)

In [10]:
# Cross-validate the model built with the selected C on the full dataset
# and print the averaged accuracy / precision / ROC AUC.
accuracy,precision,roc_auc = cross_val_score(model,X.values,Y.values)
print_error_validation(accuracy,precision,roc_auc)

Ошибки на кросс валидации:
Accuracy  = 0.3568922305764411
Precision = 0.3568922305764411
Roc auc   = 0.579102713717152


In [11]:
# Refit the model built with the selected C on the training split and print train vs. test metrics.
retrain(model, x_train.values, x_test.values, y_train.values, y_test.values)

Ошибки на выборках
            Train                   Test
Accuracy  = 0.47380239520958084  |  0.4628224582701062
Precision = 0.47380239520958084  |  0.4628224582701062
Roc auc   = 0.6019687936145997  |  0.6233187374443899


# Logistic Regression with Sklearn

In [12]:
from sklearn.linear_model import LogisticRegression
# NOTE: this import rebinds the name `cross_val_score`, replacing the
# notebook's own helper of the same name defined in an earlier cell. Cells
# that rely on the custom helper must run before this cell, or the helper's
# defining cell must be re-run afterwards.
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegressionCV
# Scorer names and the corresponding `test_*` keys. Presumably meant for
# sklearn's `cross_validate`; neither list is used in the cells visible here.
scoring = ['precision_micro', 'roc_auc','accuracy']
scoring_test = ['test_accuracy','test_precision_micro','test_roc_auc']

In [13]:
# scikit-learn baseline. C is the inverse regularisation strength, so a very
# large value makes the penalty negligible. `fit` returns the estimator itself.
logreg = LogisticRegression(C=1e5).fit(x_train, y_train)

# Accuracy is measured on the held-out test split.
y_pred = logreg.predict(x_test)
print(f'Ошибка на выборках:\nAccuracy = {accuracy_score(y_test, y_pred)}')

Ошибка на выборках:
Accuracy = 0.622154779969651


In [14]:
# LogisticRegressionCV selects C by 5-fold cross-validation on the training
# split, using a one-vs-rest scheme.
logreg = LogisticRegressionCV(cv=5, multi_class='ovr', random_state=0)
logreg.fit(x_train, y_train)
y_pred = logreg.predict(x_test)

# NOTE(review): the printed label says "cross-validation", but the value is
# the accuracy on the held-out test split (x_test / y_test).
print(f'Ошибки на кросс валидации:\nAccuracy = {accuracy_score(y_test, y_pred)} ')

Ошибки на кросс валидации:
Accuracy = 0.6889226100151745


In [15]:
# Re-fit the LogisticRegressionCV estimator on the NumPy views of the training
# split (retrain calls `fit`) and print train vs. test metrics.
retrain(logreg, x_train.values, x_test.values, y_train.values, y_test.values)

Ошибки на выборках
            Train                   Test
Accuracy  = 0.7155688622754491  |  0.6889226100151745
Precision = 0.7155688622754491  |  0.6889226100151745
Roc auc   = 0.8097067621777602  |  0.7940238612896533
