In [None]:
import random

import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import roc_auc_score, mean_squared_error

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

from dt_class import DecisionTree

%matplotlib inline

In [None]:
# Seed shared by the synthetic data generators, the scikit-learn trees and the
# null-injection helper so that seeded steps give the same results on every run.
RANDOM_STATE = 14

In [None]:
def insert_null_values(X, features, null_ratio, random_state=RANDOM_STATE):
    """Return a copy of ``X`` with NaNs injected into the selected columns.

    For every column index in ``features`` a fresh random subset of
    ``round(n_rows * null_ratio)`` rows is drawn (without replacement) and
    set to ``np.nan``. The input array itself is never modified.

    Parameters
    ----------
    X : numpy.ndarray
        Two-dimensional array indexed as ``[row, column]``.
    features : iterable of int
        Column indexes that should receive missing values.
    null_ratio : float
        Fraction of rows to blank out in each listed column.
    random_state : int, optional
        Seed for the row sampling; the same seed yields the same null pattern.

    Returns
    -------
    numpy.ndarray
        The corrupted copy. Integer/boolean inputs are promoted to float,
        because those dtypes cannot represent NaN.
    """
    X_copied = X.copy()
    if X_copied.dtype.kind in 'iub':
        # NaN is not representable in integer/boolean arrays; promote instead
        # of failing on the assignment below.
        X_copied = X_copied.astype(float)

    # A private generator draws the same samples as seeding the module-level
    # one, but leaves the global `random` state untouched for other code.
    rng = random.Random(random_state)

    n_rows = X.shape[0]
    n_nulls = int(round(n_rows * null_ratio))
    for feat in features:
        null_indexes = rng.sample(range(n_rows), n_nulls)
        # Single-step indexing writes into the copy directly instead of
        # relying on `X_copied[:, feat]` being a view.
        X_copied[null_indexes, feat] = np.nan
    return X_copied

In [None]:
def plot_gs_cv(grid, estimators, x_label, y_label, negative=False, filename=None):
    """Plot mean +/- std train and test scores of fitted searches side by side.

    Parameters
    ----------
    grid : sequence
        X-axis values, one per evaluated parameter setting.
    estimators : dict
        Maps a legend label to a fitted search object exposing
        ``cv_results_`` with ``mean_/std_train_score`` and
        ``mean_/std_test_score`` entries (for ``GridSearchCV`` the train
        entries require ``return_train_score=True``).
    x_label, y_label : str
        Axis labels applied to both panels.
    negative : bool, optional
        If true, mean scores are multiplied by -1 before plotting (e.g. to
        show a ``neg_*`` scorer as a positive error).
    filename : str, optional
        When given, the figure is saved there as a 150-dpi PNG.
    """
    f, axes = plt.subplots(1, 2, sharey=True, figsize=(16, 6))

    # Both panels are drawn identically; only the title and the
    # cv_results_ key infix differ.
    panels = (('Train score', 'train'), ('Test score', 'test'))

    for ax, (title, split) in zip(axes, panels):
        ax.set_title(title)
        for lbl, est in estimators.items():
            mean = est.cv_results_['mean_%s_score' % split]
            if negative:
                mean = mean * -1.0
            std = est.cv_results_['std_%s_score' % split]
            ax.plot(grid, mean, label=lbl)
            ax.fill_between(grid, mean - std, mean + std, alpha=0.2)
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        ax.grid()
        ax.legend()

    if filename is not None:
        # Save through the figure handle so the right figure is written even
        # if another one has become "current" in the meantime.
        f.savefig(filename, format='png', dpi=150)

In [None]:
def plot_cv_scores(grid, scores, x_label, y_label, negative=False, filename=None):
    """Draw one mean +/- std band per entry of ``scores`` on a single figure.

    ``scores`` maps a legend label to an indexable whose item 0 holds the
    per-grid-point means and item 1 the matching standard deviations. With
    ``negative=True`` the means are multiplied by -1 before plotting. If
    ``filename`` is given, the figure is saved there as a 150-dpi PNG.
    """
    plt.figure(figsize=(16, 6))

    for label, series in scores.items():
        centre = np.array(series[0])
        if negative:
            centre = -1.0 * centre
        plt.plot(grid, centre, label=label)

        spread = np.array(series[1])
        lower, upper = centre - spread, centre + spread
        plt.fill_between(grid, lower, upper, alpha=0.2)

    plt.title('Cross-validated score')
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.grid()
    plt.legend()

    if filename is None:
        return
    plt.savefig(filename, format='png', dpi=150)

In [None]:
def plot_nulls_scores(grid, simple_score, surrogate_score, x_label, y_label, filename=None):
    """Plot the simple-split and surrogate-split score curves over ``grid``.

    Both curves share one figure titled 'Test score'. If ``filename`` is
    given, the figure is saved there as a 150-dpi PNG.
    """
    plt.figure(figsize=(16, 6))

    curves = (
        (simple_score, 'simple splits'),
        (surrogate_score, 'surrogate splits'),
    )
    for values, legend_name in curves:
        plt.plot(grid, values, label=legend_name)

    plt.title('Test score')
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.grid()
    plt.legend()

    if filename is None:
        return
    plt.savefig(filename, format='png', dpi=150)

### classification test

In [None]:
# Synthetic classification problem: 1000 samples, 10 features of which
# 5 are informative and 2 redundant.
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    n_redundant=2,
    n_clusters_per_class=5,
    class_sep=1.0,
    random_state=RANDOM_STATE,
)

In [None]:
# Depths 2..15 are searched for every classifier.
gs_param_grid = dict(max_depth=range(2, 16))

# Shared GridSearchCV settings: 5-fold CV scored by ROC AUC, keeping the
# train scores so they can be plotted next to the test scores.
gscv_params = dict(
    param_grid=gs_param_grid,
    scoring='roc_auc',
    cv=5,
    return_train_score=True,
    n_jobs=-1,
)

#### gini

In [None]:
# Reference implementation: scikit-learn's gini tree, searched over max_depth.
sklearn_tree = DecisionTreeClassifier(random_state=RANDOM_STATE, criterion='gini')
sklearn_gscv = GridSearchCV(sklearn_tree, **gscv_params)

# Implementation under test, searched with the identical settings.
custom_tree = DecisionTree(criterion='gini')
custom_gscv = GridSearchCV(custom_tree, **gscv_params)

In [None]:
%%time
# Run the max_depth grid search for the scikit-learn gini tree (timed by the cell magic).
sklearn_gscv.fit(X, y)

In [None]:
%%time
# Run the same grid search for the custom gini tree (timed by the cell magic).
custom_gscv.fit(X, y)

In [None]:
# Train/test ROC AUC against tree depth for both gini implementations.
plot_gs_cv(
    gs_param_grid['max_depth'],
    {'sklearn (gini)': sklearn_gscv, 'custom (gini)': custom_gscv},
    'max_depth',
    'roc_auc',
    filename='./img/comparsion_gini.png',
)

#### entropy

In [None]:
# Reference implementation: scikit-learn's entropy tree, searched over max_depth.
sklearn_tree = DecisionTreeClassifier(random_state=RANDOM_STATE, criterion='entropy')
sklearn_gscv = GridSearchCV(sklearn_tree, **gscv_params)

# Implementation under test, searched with the identical settings.
custom_tree = DecisionTree(criterion='entropy')
custom_gscv = GridSearchCV(custom_tree, **gscv_params)

In [None]:
%%time
# Run the max_depth grid search for the scikit-learn entropy tree (timed by the cell magic).
sklearn_gscv.fit(X, y)

In [None]:
%%time
# Run the same grid search for the custom entropy tree (timed by the cell magic).
custom_gscv.fit(X, y)

In [None]:
# Train/test ROC AUC against tree depth for both entropy implementations.
plot_gs_cv(
    gs_param_grid['max_depth'],
    {'sklearn (entropy)': sklearn_gscv, 'custom (entropy)': custom_gscv},
    'max_depth',
    'roc_auc',
    filename='./img/comparsion_entropy.png',
)

#### missing values

In [None]:
# Two otherwise identical depth-7 gini trees; only the surrogate-split switch differs.
custom_tree_simple, custom_tree_surrogate = (
    DecisionTree(criterion='gini', max_depth=7, use_surrogate_splits=flag)
    for flag in (False, True)
)

In [None]:
%%time

# Fractions of rows nulled per feature: 0.0 to 0.5 in 11 evenly spaced steps.
# linspace fixes the number of points explicitly, which np.arange with a
# float step does not guarantee.
null_ratios = np.linspace(0.0, 0.5, 11)

means_simple, means_surrogate = [], []
stds_simple, stds_surrogate = [], []

# (estimator, list collecting fold means, list collecting fold stds)
runs = (
    (custom_tree_simple, means_simple, stds_simple),
    (custom_tree_surrogate, means_surrogate, stds_surrogate),
)

for null_ratio in null_ratios:
    # Corrupt every column of X; the column count comes from the data
    # rather than a hard-coded 10.
    X_nulls = insert_null_values(X, features=range(X.shape[1]), null_ratio=null_ratio)

    # Both trees are cross-validated on the same corrupted matrix.
    for tree, means, stds in runs:
        fold_scores = cross_val_score(tree, X_nulls, y, scoring='roc_auc', cv=5, n_jobs=-1)
        means.append(np.mean(fold_scores))
        stds.append(np.std(fold_scores))

In [None]:
# Cross-validated ROC AUC as the share of missing values grows.
plot_cv_scores(
    null_ratios,
    {
        'simple splits': (means_simple, stds_simple),
        'surrogate splits': (means_surrogate, stds_surrogate),
    },
    'Null values ratio',
    'roc_auc',
    filename='./img/missing_classification.png',
)

#### missing only test values

In [None]:
# Hold out 20% of the rows for testing. The split is seeded like every other
# random step in this notebook so the curves below are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

In [None]:
%%time
# Fit on the training split only: X_test is scored further down, so fitting
# on the full X would leak the evaluation rows into the model.
custom_tree_simple.fit(X_train, y_train)

In [None]:
%%time
# Fit on the training split only: X_test is scored further down, so fitting
# on the full X would leak the evaluation rows into the model.
custom_tree_surrogate.fit(X_train, y_train)

In [None]:
test_scores_simple, test_scores_surrogate = [], []

for null_ratio in null_ratios:
    # Only the held-out rows are corrupted; the fitted trees are left as they are.
    X_test_nulls = insert_null_values(X_test, features=range(10), null_ratio=null_ratio)

    # Column 1 of predict_proba is scored (presumably the positive class -- confirm in dt_class).
    proba_simple = custom_tree_simple.predict_proba(X_test_nulls)[:, 1]
    test_scores_simple.append(roc_auc_score(y_test, proba_simple))

    proba_surrogate = custom_tree_surrogate.predict_proba(X_test_nulls)[:, 1]
    test_scores_surrogate.append(roc_auc_score(y_test, proba_surrogate))

In [None]:
# Test ROC AUC of both trees as more test values go missing.
plot_nulls_scores(
    grid=null_ratios,
    simple_score=test_scores_simple,
    surrogate_score=test_scores_surrogate,
    x_label='Null values ratio',
    y_label='roc_auc',
    filename='./img/test_missing_classification.png',
)

### regression test

In [None]:
# Synthetic regression problem: 1000 samples, 10 features (7 informative).
# NOTE: this rebinds X and y, replacing the classification data above.
X, y = make_regression(
    n_samples=1000,
    n_features=10,
    n_informative=7,
    noise=0.05,
    random_state=RANDOM_STATE,
)

In [None]:
# Depths 2..15 are searched for every regressor.
gs_param_grid = dict(max_depth=range(2, 16))

# Shared GridSearchCV settings: 5-fold CV scored by negated MSE, keeping the
# train scores so they can be plotted next to the test scores.
gscv_params = dict(
    param_grid=gs_param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    return_train_score=True,
    n_jobs=-1,
)

#### variance

In [None]:
# The criterion is left at its default, which is mean squared error. The
# explicit 'mse' spelling was renamed to 'squared_error' and later removed
# from scikit-learn, whereas the default works under either naming.
sklearn_tree = DecisionTreeRegressor(random_state=RANDOM_STATE)
sklearn_gscv = GridSearchCV(estimator=sklearn_tree, **gscv_params)

# Custom tree with the variance criterion.
custom_v_tree = DecisionTree(criterion='variance')
custom_v_gscv = GridSearchCV(estimator=custom_v_tree, **gscv_params)

# Custom tree with the mad_median criterion.
custom_mm_tree = DecisionTree(criterion='mad_median')
custom_mm_gscv = GridSearchCV(estimator=custom_mm_tree, **gscv_params)

In [None]:
%%time
# Run the max_depth grid search for the scikit-learn regressor (timed by the cell magic).
sklearn_gscv.fit(X, y)

In [None]:
%%time
# Run the same grid search for the custom variance-criterion tree (timed by the cell magic).
custom_v_gscv.fit(X, y)

In [None]:
%%time
# Run the same grid search for the custom mad_median-criterion tree (timed by the cell magic).
custom_mm_gscv.fit(X, y)

In [None]:
# Train/test MSE against tree depth; negative=True flips the sign of the
# neg_mean_squared_error scores so the curves read as plain MSE.
plot_gs_cv(
    gs_param_grid['max_depth'],
    {
        'sklearn (MSE)': sklearn_gscv,
        'custom (variance)': custom_v_gscv,
        'custom (mad_median)': custom_mm_gscv,
    },
    'max_depth',
    'MSE',
    negative=True,
    filename='./img/comparsion_regression.png',
)

#### missing values

In [None]:
# Two otherwise identical depth-8 variance trees; only the surrogate-split switch differs.
custom_tree_simple, custom_tree_surrogate = (
    DecisionTree(criterion='variance', max_depth=8, use_surrogate_splits=flag)
    for flag in (False, True)
)

In [None]:
%%time

# Fractions of rows nulled per feature: 0.0 to 0.5 in 11 evenly spaced steps.
# linspace fixes the number of points explicitly, which np.arange with a
# float step does not guarantee.
null_ratios = np.linspace(0.0, 0.5, 11)

means_simple, means_surrogate = [], []
stds_simple, stds_surrogate = [], []

# (estimator, list collecting fold means, list collecting fold stds)
runs = (
    (custom_tree_simple, means_simple, stds_simple),
    (custom_tree_surrogate, means_surrogate, stds_surrogate),
)

for null_ratio in null_ratios:
    # Corrupt every column of X; the column count comes from the data
    # rather than a hard-coded 10.
    X_nulls = insert_null_values(X, features=range(X.shape[1]), null_ratio=null_ratio)

    # Both trees are cross-validated on the same corrupted matrix.
    for tree, means, stds in runs:
        fold_scores = cross_val_score(tree, X_nulls, y,
                                      scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
        means.append(np.mean(fold_scores))
        stds.append(np.std(fold_scores))

In [None]:
# Cross-validated MSE as the share of missing values grows; negative=True
# flips the sign of the neg_mean_squared_error means.
plot_cv_scores(
    null_ratios,
    {
        'simple splits': (means_simple, stds_simple),
        'surrogate splits': (means_surrogate, stds_surrogate),
    },
    'Null values ratio',
    'MSE',
    negative=True,
    filename='./img/missing_regression.png',
)

#### missing only test values

In [None]:
# Hold out 20% of the rows for testing. The split is seeded like every other
# random step in this notebook so the curves below are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

In [None]:
%%time
# Fit on the training split only: X_test is scored further down, so fitting
# on the full X would leak the evaluation rows into the model.
custom_tree_simple.fit(X_train, y_train)

In [None]:
%%time
# Fit on the training split only: X_test is scored further down, so fitting
# on the full X would leak the evaluation rows into the model.
custom_tree_surrogate.fit(X_train, y_train)

In [None]:
test_scores_simple, test_scores_surrogate = [], []

for null_ratio in null_ratios:
    # Only the held-out rows are corrupted; the fitted trees are left as they are.
    X_test_nulls = insert_null_values(X_test, features=range(10), null_ratio=null_ratio)

    pred_simple = custom_tree_simple.predict(X_test_nulls)
    test_scores_simple.append(mean_squared_error(y_test, pred_simple))

    pred_surrogate = custom_tree_surrogate.predict(X_test_nulls)
    test_scores_surrogate.append(mean_squared_error(y_test, pred_surrogate))

In [None]:
# Test MSE of both trees as more test values go missing.
plot_nulls_scores(
    grid=null_ratios,
    simple_score=test_scores_simple,
    surrogate_score=test_scores_surrogate,
    x_label='Null values ratio',
    y_label='MSE',
    filename='./img/test_missing_regression.png',
)