In [2]:
import sys
sys.path.append('./models/')
import warnings
warnings.simplefilter("ignore")

import numpy as np
import pandas as pd
from decisionTree import DecisionTree
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, accuracy_score, roc_auc_score

In [3]:
def cross_val_scoring(model, X, y):
    """Score ``model`` with 5-fold cross-validation.

    For every fold the model is refit on the training part and evaluated on
    the held-out part with accuracy, micro-averaged precision and ROC AUC
    computed on one-hot encoded hard predictions.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X : array-like
        Feature matrix; must support indexing with an integer index array
        (e.g. a NumPy array).
    y : array-like
        Target labels, indexable the same way as ``X``.

    Returns
    -------
    tuple
        ``(mean_accuracy, mean_precision, mean_roc_auc)`` over the folds.
    """
    acc_scores = []
    prec_scores = []
    roc_scores = []

    kf = KFold(n_splits=5)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        y_pred = np.asarray(model.predict(X_test))
        y_true = np.asarray(y_test)

        acc_scores.append(accuracy_score(y_test, y_pred))
        prec_scores.append(
            precision_score(y_test, y_pred, pos_label='positive', average='micro')
        )

        # Encode labels and predictions against ONE shared, sorted label set.
        # Encoding each vector on its own yields matrices with different (or
        # silently misaligned) columns whenever a class is missing from the
        # fold's predictions.
        classes = np.unique(np.concatenate([y_true, y_pred]))
        y_true_onehot = (y_true[:, None] == classes).astype(int)
        y_pred_onehot = (y_pred[:, None] == classes).astype(int)
        roc_scores.append(
            roc_auc_score(y_true_onehot, y_pred_onehot, multi_class='ovr')
        )

    return np.mean(acc_scores), np.mean(prec_scores), np.mean(roc_scores)

In [4]:
def print_error_validation(accuracy, precission, roc_auc):
    """Print the cross-validation metrics in a fixed three-line layout.

    Parameters
    ----------
    accuracy : float
        Mean accuracy.
    precission : float
        Mean precision. The misspelled parameter name is kept so that
        existing keyword callers keep working.
    roc_auc : float
        Mean ROC AUC.
    """
    print('Ошибки на кросс валидации:')
    print(f'Accuracy  = {accuracy}')
    # Read the argument, not a same-named notebook global.
    print(f'Precision = {precission}')
    print(f'Roc auc   = {roc_auc}')
    
def retrain(model, x_train, x_test, y_train, y_test):
    """Fit ``model`` on the train split and print train/test metrics.

    Prints accuracy, micro-averaged precision and ROC AUC (on one-hot encoded
    hard predictions) side by side for the train and the test split.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is refit in
        place.
    x_train, x_test : array-like
        Feature matrices of the two splits.
    y_train, y_test : array-like
        True labels of the two splits.

    Returns
    -------
    None
        The metrics are only printed.
    """
    def _one_hot_pair(y_true, y_pred):
        # Encode both vectors against one shared, sorted label set so the
        # indicator columns always line up, even if a class is never predicted.
        y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
        classes = np.unique(np.concatenate([y_true, y_pred]))
        return (
            (y_true[:, None] == classes).astype(int),
            (y_pred[:, None] == classes).astype(int),
        )

    model.fit(x_train, y_train)

    y_pred_test = model.predict(x_test)
    y_pred_train = model.predict(x_train)

    # scikit-learn metrics take (y_true, y_pred) in that order.
    acc = accuracy_score(y_train, y_pred_train), accuracy_score(y_test, y_pred_test)

    prec = (
        precision_score(y_train, y_pred_train, pos_label='positive', average='micro'),
        precision_score(y_test, y_pred_test, pos_label='positive', average='micro'),
    )

    roc = (
        roc_auc_score(*_one_hot_pair(y_train, y_pred_train), multi_class='ovr'),
        roc_auc_score(*_one_hot_pair(y_test, y_pred_test), multi_class='ovr'),
    )

    print('Ошибки на выборках')
    print('            Train                  Test')
    print(f'Accuracy  = {acc[0]}  |  {acc[1]}')
    print(f'Precision = {prec[0]}  |  {prec[1]}')
    print(f'Roc auc   = {roc[0]}  |  {roc[1]}')
    

def search_parametrs(x, y, max_depth_list, min_size_list):
    """Grid-search ``max_depth`` / ``min_size`` for ``DecisionTree``.

    Every combination is scored with ``cross_val_scoring`` and the one with
    the highest mean ROC AUC wins; on ties the first combination seen is kept.

    Parameters
    ----------
    x, y : objects exposing ``.values``
        Features and target (e.g. a pandas DataFrame / Series).
    max_depth_list, min_size_list : iterable
        Candidate values for the two hyper-parameters.

    Returns
    -------
    tuple
        ``(best_max_depth, best_min_size)``; both are ``None`` when no
        combination produced a comparable score (e.g. empty candidate lists).
    """
    best_max_depth = None
    best_min_size = None
    best_roc = float('-inf')

    for max_depth in max_depth_list:
        for min_size in min_size_list:
            model = DecisionTree(max_depth=max_depth, min_size=min_size)
            _, _, roc_auc = cross_val_scoring(model, x.values, y.values)
            if roc_auc > best_roc:
                # Remember the score too; otherwise every later combination
                # would overwrite the "best" and the last one would always win.
                best_roc = roc_auc
                best_max_depth, best_min_size = max_depth, min_size
    return best_max_depth, best_min_size

# Data splitting

In [5]:
# Load the cleaned dataset and separate the target column from the features.
mobile_data = pd.read_csv('models/clearDataset.csv')
Y = mobile_data['price_range']
X = mobile_data.drop(columns=['price_range'])

# Hold out 33% of the rows for the final evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=42,
)

### Searching for parameters

In [27]:
# Tune on the training split only, so the held-out test rows never influence
# the chosen hyper-parameters (searching on the full X, Y leaks test data into
# model selection).
max_depth, min_size = search_parametrs(x_train, y_train, max_depth_list=np.arange(5,10,1), min_size_list=np.arange(5,10,1))
print(f'Best: depth = {max_depth}, size = {min_size}')

Best: depth = 9, size = 9


# Decision tree classification implementation

In [5]:
# Baseline: the custom tree with its default constructor arguments,
# scored by 5-fold cross-validation on the full dataset.
model = DecisionTree()
# The three mean scores are kept as separate notebook-level names and passed on to the printer.
accuracy, precision, roc_auc = cross_val_scoring(model, X.values, Y.values)
print_error_validation(accuracy, precision, roc_auc)

Ошибки на кросс валидации:
Accuracy  = 0.8441102756892229
Precision = 0.8441102756892229
Roc auc   = 0.8965187796097869


In [6]:
# Refit the same custom tree on the train split and print train vs. test metrics.
retrain(model, x_train.values, x_test.values, y_train.values, y_test.values)

Ошибки на выборках
            Train                  Test
Accuracy  = 0.9461077844311377  |  0.8194233687405159
Precision = 0.9461077844311377  |  0.8194233687405159
Roc auc   = 0.9641322657183546   |  0.8795584678359948


# Decision tree with Sklearn

In [19]:
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
# NOTE(review): `scoring` and `scoring_test` are not referenced in the cells visible here —
# confirm whether they are still needed.
scoring = ['precision_macro', 'roc_auc','accuracy']
scoring_test = ['test_accuracy','test_precision_macro','test_roc_auc']

# Reference model: scikit-learn's decision tree with default hyper-parameters.
dt = tree.DecisionTreeClassifier()

# Fit on the train split and predict the held-out test split.
dt.fit(x_train, y_train)
y_pred = dt.predict(x_test)

# Report test-set accuracy only.
print(f'Ошибка на выборках:\nAccuracy = {accuracy_score(y_test, y_pred)}')

Ошибка на выборках:
Accuracy = 0.8270106221547799


### Searching for parameters

In [7]:
# Candidate tree depths and minimum split sizes (5..9 each) for scikit-learn's grid search.
clf = tree.DecisionTreeClassifier()
parameters = {
    'max_depth': np.arange(5, 10),
    'min_samples_split': np.arange(5, 10),
}
clf_cv = GridSearchCV(estimator=clf, param_grid=parameters)

In [8]:
# Run the grid search on the training split only; the test split stays untouched.
clf_cv.fit(x_train,y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': array([5, 6, 7, 8, 9]),
             

In [11]:
# Display the estimator selected by the grid search.
clf_cv.best_estimator_

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=6, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=9,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [30]:
# NOTE(review): max_depth / min_size come from the custom-tree search (search_parametrs),
# not from clf_cv's best parameters — confirm this is the intended comparison.
clf = tree.DecisionTreeClassifier(max_depth = max_depth, min_samples_split=min_size)

In [31]:
# Fit the scikit-learn tree on the train split and print train vs. test metrics.
retrain(clf, x_train.values, x_test.values, y_train.values, y_test.values)

Ошибки на выборках
            Train                  Test
Accuracy  = 0.9558383233532934  |  0.8270106221547799
Precision = 0.9558383233532934  |  0.8270106221547799
Roc auc   = 0.9705206250064274  |  0.8851529675356118
