<a href="https://colab.research.google.com/github/antbartash/product_failure/blob/main/trees.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading libraries and data

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.metrics import roc_auc_score
from scipy.stats import uniform, randint

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
X_train = pd.read_csv('drive/MyDrive/product_failure/data/X_train_dummies_scaled_knn.csv', index_col=0)
X_test = pd.read_csv('drive/MyDrive/product_failure/data/X_test_dummies_scaled_knn.csv', index_col=0)
y_train = pd.read_csv('drive/MyDrive/product_failure/data/y_train.csv', index_col=0)
y_test = pd.read_csv('drive/MyDrive/product_failure/data/y_test.csv', index_col=0)

Check first 5 observations and data shapes to make sure that the data was read correctly

In [4]:
X_train.head()

Unnamed: 0,loading,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,...,"measurement_2_grouped_(-0.001, 2.0]","measurement_2_grouped_(2.0, 3.0]","measurement_2_grouped_(3.0, 4.0]","measurement_2_grouped_(4.0, 5.0]","measurement_2_grouped_(5.0, 6.0]","measurement_2_grouped_(6.0, 7.0]","measurement_2_grouped_(7.0, 8.0]","measurement_2_grouped_(8.0, 9.0]","measurement_2_grouped_(9.0, 11.0]","measurement_2_grouped_(11.0, 24.0]"
0,-0.513603,2.078416,-1.485328,-0.681222,-0.653551,-0.015593,-0.319033,-0.014153,-0.06752,0.327396,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0364,-1.070214,0.179974,-0.379065,-0.549359,-0.731627,1.703556,0.598172,-0.121901,-0.066527,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.019484,-0.828012,1.845276,-0.379065,-1.697273,0.577577,-1.657378,-0.286633,1.45214,0.304632,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.060479,0.867404,0.179974,-0.076908,-1.114317,0.677278,0.361192,-0.752161,-0.738225,0.907394,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,-0.54974,-0.585809,-0.533727,-0.076908,-0.828338,0.043825,-1.348916,0.015005,0.087568,-1.010756,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [5]:
print("X_train.shape: ", X_train.shape)
print("y_train.shape: ", y_train.shape)
print("X_test.shape: ", X_test.shape)
print("y_test.shape:", y_test.shape)

X_train.shape:  (19927, 64)
y_train.shape:  (19927, 1)
X_test.shape:  (6643, 64)
y_test.shape: (6643, 1)


In [6]:
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)

print("y_train.shape: ", y_train.shape)
print("y_test.shape:", y_test.shape)

y_train.shape:  (19927,)
y_test.shape: (6643,)


# Baseline tree model <br>
Build a baseline decision tree model and evaluate its performance on train and test sets. For performance evaluation AUC will be used

In [None]:
tree_baseline = DecisionTreeClassifier()
tree_baseline.fit(X_train, y_train)

print("Train set AUC: {}".format(roc_auc_score(y_train, tree_baseline.predict(X_train))))
print("Test set AUC: {}".format(roc_auc_score(y_test, tree_baseline.predict(X_test))))

Train set AUC: 1.0
Test set AUC: 0.5071397012684099


AUC values crearly indicate, that the model overfits the training set. We can try to tune its parameters to improve quality of the model

# Tuning trees parameters values <br>
To tune parameters values GridSearchCV will be used. To prevent data leakage, we'll apply data transformation on each training folder separately. In order to do that, we need to read the datasets created as a result of one-hot encoding of the categorical features, but without other preprocessing steps <br>
Since sklearn implementation of decision trees dosn't handle missing values, missing values are filled using KNN. The standarization step is not required by decision trees themselves, but is needed for KNN.  
Also features dtype will be changed from float64 to float32 to speed up computations

In [7]:
X_train_dummies = pd.read_csv('drive/MyDrive/product_failure/data/X_train_dummies.csv', index_col=0)
X_test_dummies = pd.read_csv('drive/MyDrive/product_failure/data/X_test_dummies.csv', index_col=0)

X_train_dummies = X_train_dummies.astype(np.float32)
X_test_dummies = X_test_dummies.astype(np.float32)

In [None]:
pipe = Pipeline([
    ('transformer', ColumnTransformer([
        ('scaler', StandardScaler(), [*range(0, 19)])],
         remainder='passthrough')),
    ('imputer', KNNImputer()),
    ('tree', DecisionTreeClassifier(random_state=0))
])

Round 1

In [None]:
param_grid = {
    'tree__max_depth': [None, 20, 10, 5, 3],
    'tree__min_samples_leaf': [100, 50, 25, 1],
    'tree__min_impurity_decrease': [0, 0.01, 0.1]
}

grid = GridSearchCV(pipe, param_grid, cv=5,
                    scoring='roc_auc',
                    verbose=2)

grid.fit(X_train_dummies, y_train)

print('Best score: ', grid.best_score_)
print('Best params: ', grid.best_params_)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
[CV] END tree__max_depth=None, tree__min_impurity_decrease=0, tree__min_samples_leaf=100; total time=  29.8s
[CV] END tree__max_depth=None, tree__min_impurity_decrease=0, tree__min_samples_leaf=100; total time=  29.8s
[CV] END tree__max_depth=None, tree__min_impurity_decrease=0, tree__min_samples_leaf=100; total time=  28.2s
[CV] END tree__max_depth=None, tree__min_impurity_decrease=0, tree__min_samples_leaf=100; total time=  26.9s
[CV] END tree__max_depth=None, tree__min_impurity_decrease=0, tree__min_samples_leaf=100; total time=  27.0s
[CV] END tree__max_depth=None, tree__min_impurity_decrease=0, tree__min_samples_leaf=50; total time=  30.5s
[CV] END tree__max_depth=None, tree__min_impurity_decrease=0, tree__min_samples_leaf=50; total time=  32.0s
[CV] END tree__max_depth=None, tree__min_impurity_decrease=0, tree__min_samples_leaf=50; total time=  29.0s
[CV] END tree__max_depth=None, tree__min_impurity_decrease=0, tree__m

In result of the first round of grid search, models with the most regularized models were selected. Therefore, in the next round the regularization will be increased. <br>
Also the first round has shown, that changing min_impurity_decrease value from the default one doesn't affect models performance. However, in the second round we'll also test 0.001 as a value of min_impurity_decrease

Round 2

In [None]:
param_grid = {
    'tree__max_depth': [5, 4, 3, 2, 1],
    'tree__min_samples_leaf': [250, 200, 150, 100],
    'tree__min_impurity_decrease': [0, 0.001]
}

grid = GridSearchCV(pipe, param_grid, cv=5,
                    scoring='roc_auc',
                    verbose=2)

grid.fit(X_train_dummies, y_train)

print('Best score: ', grid.best_score_)
print('Best params: ', grid.best_params_)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV] END tree__max_depth=5, tree__min_impurity_decrease=0, tree__min_samples_leaf=250; total time=  31.6s
[CV] END tree__max_depth=5, tree__min_impurity_decrease=0, tree__min_samples_leaf=250; total time=  28.3s
[CV] END tree__max_depth=5, tree__min_impurity_decrease=0, tree__min_samples_leaf=250; total time=  26.5s
[CV] END tree__max_depth=5, tree__min_impurity_decrease=0, tree__min_samples_leaf=250; total time=  28.0s
[CV] END tree__max_depth=5, tree__min_impurity_decrease=0, tree__min_samples_leaf=250; total time=  27.4s
[CV] END tree__max_depth=5, tree__min_impurity_decrease=0, tree__min_samples_leaf=200; total time=  30.3s
[CV] END tree__max_depth=5, tree__min_impurity_decrease=0, tree__min_samples_leaf=200; total time=  29.6s
[CV] END tree__max_depth=5, tree__min_impurity_decrease=0, tree__min_samples_leaf=200; total time=  27.6s
[CV] END tree__max_depth=5, tree__min_impurity_decrease=0, tree__min_samples_leaf=200; tot

The results of the second round show, that the optimal value for max_depth is 3, optimal min_impurity decrease equals 0. <br>
In the third round 400, 350, 300, 250 will be tested as a values of min_samples_leaf.

Round 3

In [None]:
param_grid = {
    'tree__max_depth': [4, 3, 2],
    'tree__min_samples_leaf': [400, 350, 300, 250]
}

grid = GridSearchCV(pipe, param_grid, cv=5,
                    scoring='roc_auc',
                    verbose=2)

grid.fit(X_train_dummies, y_train)

print('Best score: ', grid.best_score_)
print('Best params: ', grid.best_params_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END ......tree__max_depth=4, tree__min_samples_leaf=400; total time= 2.0min
[CV] END ......tree__max_depth=4, tree__min_samples_leaf=400; total time= 2.0min
[CV] END ......tree__max_depth=4, tree__min_samples_leaf=400; total time= 1.4min
[CV] END ......tree__max_depth=4, tree__min_samples_leaf=400; total time= 1.4min
[CV] END ......tree__max_depth=4, tree__min_samples_leaf=400; total time= 1.4min
[CV] END ......tree__max_depth=4, tree__min_samples_leaf=350; total time= 1.9min
[CV] END ......tree__max_depth=4, tree__min_samples_leaf=350; total time= 1.9min
[CV] END ......tree__max_depth=4, tree__min_samples_leaf=350; total time= 1.3min
[CV] END ......tree__max_depth=4, tree__min_samples_leaf=350; total time= 1.4min
[CV] END ......tree__max_depth=4, tree__min_samples_leaf=350; total time= 1.4min
[CV] END ......tree__max_depth=4, tree__min_samples_leaf=300; total time= 1.9min
[CV] END ......tree__max_depth=4, tree__min_samp

The optimal values of a decision tree's parameters are max_depth=3, min_samples_leaf=300.

# Tree final

In [None]:
tree_final = Pipeline([
    ('transformer', ColumnTransformer([
        ('scaler', StandardScaler(), [*range(0, 19)])],
         remainder='passthrough')),
    ('imputer', KNNImputer()),
    ('tree', DecisionTreeClassifier(max_depth=3, min_samples_leaf=300,
                                    random_state=0))
])
tree_final.fit(X_train_dummies, y_train)


print("Train set AUC: {}".format(roc_auc_score(y_train, tree_final.predict(X_train_dummies))))
print("Test set AUC: {}".format(roc_auc_score(y_test, tree_final.predict(X_test_dummies))))

Train set AUC: 0.5
Test set AUC: 0.5


 AUC for both datasets is 0.5, which means that in both cases the tree predicts the negative category for all observations. At the end, the final model will be built on the whole labeled data (which for now has been splitted into a training and testing sets). So on a larger dataset we may get a larger decision tree. Moreover, the parameters values used in the tree_final were selected as the best ones during cross-validation, which means that on average (on different training and testing datasets) the tree_final model has the highest AUC among other decision trees. <br>
 To get higher quality of predictions, more complicated models will be used.

# Baseline random forest model

In [None]:
rf_baseline = RandomForestClassifier(random_state=0)
rf_baseline.fit(X_train, y_train)

print("Train set AUC: {}".format(roc_auc_score(y_train, rf_baseline.predict(X_train))))
print("Test set AUC: {}".format(roc_auc_score(y_test,  rf_baseline.predict(X_test))))

Train set AUC: 1.0
Test set AUC: 0.5


AUC equals 1 on the training set and 0.5 on the test set. In order to achieve better results, GridSearchCV will be used. As before, the pipeline hasa scaling step (needed for KNN), KNN for imputing missing values and a classifier

# Tuning rf parameters values

In [None]:
pipe = Pipeline([
    ('transformer', ColumnTransformer([
        ('scaler', StandardScaler(), [*range(0, 19)])],
         remainder='passthrough')),
    ('imputer', KNNImputer()),
    ('rf', RandomForestClassifier(random_state=0, n_jobs=-1))
])

Round 1

In [None]:
param_grid = {
    'rf__n_estimators': [500, 250, 100],
    'rf__max_depth': [None, 10, 5],
    'rf__min_samples_split': [2, 10, 50]
}

grid = GridSearchCV(pipe, param_grid, cv=5,
                    scoring='roc_auc',
                    verbose=2)

grid.fit(X_train_dummies, y_train)

print('Best score: ', grid.best_score_)
print('Best params: ', grid.best_params_)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END rf__max_depth=None, rf__min_samples_split=2, rf__n_estimators=500; total time=  54.2s
[CV] END rf__max_depth=None, rf__min_samples_split=2, rf__n_estimators=500; total time=  54.7s
[CV] END rf__max_depth=None, rf__min_samples_split=2, rf__n_estimators=500; total time=  52.1s
[CV] END rf__max_depth=None, rf__min_samples_split=2, rf__n_estimators=500; total time=  55.3s
[CV] END rf__max_depth=None, rf__min_samples_split=2, rf__n_estimators=500; total time=  53.1s
[CV] END rf__max_depth=None, rf__min_samples_split=2, rf__n_estimators=250; total time=  41.9s
[CV] END rf__max_depth=None, rf__min_samples_split=2, rf__n_estimators=250; total time=  42.7s
[CV] END rf__max_depth=None, rf__min_samples_split=2, rf__n_estimators=250; total time=  40.6s
[CV] END rf__max_depth=None, rf__min_samples_split=2, rf__n_estimators=250; total time=  40.6s
[CV] END rf__max_depth=None, rf__min_samples_split=2, rf__n_estimators=250; total t

As a result of the first round of grid search, the minimum values for max_depth and min_samples_split were chosen. Also the highest number of decision trees were used in the best model.<br>
Therefore, at the second round the number of estimators was increased and other values for max_depth were tested.

Round 2

In [None]:
param_grid = {
    'rf__n_estimators': [1500, 1000, 500],
    'rf__max_depth': [8, 5, 3]
}

grid = GridSearchCV(pipe, param_grid, cv=5,
                    scoring='roc_auc',
                    verbose=2)

grid.fit(X_train_dummies, y_train)

print('Best score: ', grid.best_score_)
print('Best params: ', grid.best_params_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] END .............rf__max_depth=8, rf__n_estimators=1500; total time= 1.1min
[CV] END .............rf__max_depth=8, rf__n_estimators=1500; total time= 1.0min
[CV] END .............rf__max_depth=8, rf__n_estimators=1500; total time= 1.0min
[CV] END .............rf__max_depth=8, rf__n_estimators=1500; total time= 1.0min
[CV] END .............rf__max_depth=8, rf__n_estimators=1500; total time= 1.0min
[CV] END .............rf__max_depth=8, rf__n_estimators=1000; total time=  51.5s
[CV] END .............rf__max_depth=8, rf__n_estimators=1000; total time=  51.4s
[CV] END .............rf__max_depth=8, rf__n_estimators=1000; total time=  50.9s
[CV] END .............rf__max_depth=8, rf__n_estimators=1000; total time=  49.2s
[CV] END .............rf__max_depth=8, rf__n_estimators=1000; total time=  49.0s
[CV] END ..............rf__max_depth=8, rf__n_estimators=500; total time=  43.1s
[CV] END ..............rf__max_depth=8, rf__n_est

The optimal parameters values for the first and the second rounds are the same. In the third round [7, 6, 5, 4] will be tested as values of the max_depth parameter. We'll use random forests consisting of 750, 500 or 400 decision trees.

Round 3

In [None]:
param_grid = {
    'rf__n_estimators': [750, 500, 400],
    'rf__max_depth': [7, 6, 5, 4]
}

grid = GridSearchCV(pipe, param_grid, cv=5,
                    scoring='roc_auc',
                    verbose=2)

grid.fit(X_train_dummies, y_train)

print('Best score: ', grid.best_score_)
print('Best params: ', grid.best_params_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END ..............rf__max_depth=7, rf__n_estimators=750; total time= 1.4min
[CV] END ..............rf__max_depth=7, rf__n_estimators=750; total time= 1.4min
[CV] END ..............rf__max_depth=7, rf__n_estimators=750; total time=  54.9s
[CV] END ..............rf__max_depth=7, rf__n_estimators=750; total time=  42.3s
[CV] END ..............rf__max_depth=7, rf__n_estimators=750; total time=  40.7s
[CV] END ..............rf__max_depth=7, rf__n_estimators=500; total time=  40.0s
[CV] END ..............rf__max_depth=7, rf__n_estimators=500; total time=  41.9s
[CV] END ..............rf__max_depth=7, rf__n_estimators=500; total time=  37.9s
[CV] END ..............rf__max_depth=7, rf__n_estimators=500; total time=  36.9s
[CV] END ..............rf__max_depth=7, rf__n_estimators=500; total time=  38.5s
[CV] END ..............rf__max_depth=7, rf__n_estimators=400; total time=  37.1s
[CV] END ..............rf__max_depth=7, rf__n_es

Based on three round of grid search, we can conclude, that the optimal parameters of a random forest are max_depth=5, n_estimators=500. These values give us average CV AUC of 0.579388

# Baseline AdaBoost model

In [None]:
ada_baseline = AdaBoostClassifier(DecisionTreeClassifier(random_state=42),
                                 random_state=42)
ada_baseline.fit(X_train, y_train)

print("Train set AUC: {}".format(roc_auc_score(y_train, ada_baseline.predict(X_train))))
print("Test set AUC: {}".format(roc_auc_score(y_test,  ada_baseline.predict(X_test))))

Train set AUC: 1.0
Test set AUC: 0.5026381271066188


# Tuning AdaBoost parameters values

In [9]:
pipe = Pipeline([
    ('transformer', ColumnTransformer([
        ('scaler', StandardScaler(), [*range(0, 19)])],
         remainder='passthrough')),
    ('imputer', KNNImputer()),
    ('ada', AdaBoostClassifier(DecisionTreeClassifier(),
                               random_state=42))
])

Round 1

In [None]:
param_grid = {
    'ada__learning_rate': [0.1, 0.5, 0.75, 1],
    'ada__n_estimators': [250, 100, 50],
    'ada__base_estimator__max_depth': [None, 5, 1]
}

grid = GridSearchCV(pipe, param_grid, cv=5,
                    scoring='roc_auc',
                    verbose=3)

grid.fit(X_train_dummies, y_train)

print('Best score: ', grid.best_score_)
print('Best params: ', grid.best_params_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END ada__base_estimator__max_depth=None, ada__learning_rate=0.1, ada__n_estimators=250;, score=0.506 total time= 1.4min
[CV 2/5] END ada__base_estimator__max_depth=None, ada__learning_rate=0.1, ada__n_estimators=250;, score=0.512 total time= 1.5min
[CV 3/5] END ada__base_estimator__max_depth=None, ada__learning_rate=0.1, ada__n_estimators=250;, score=0.518 total time= 1.1min
[CV 4/5] END ada__base_estimator__max_depth=None, ada__learning_rate=0.1, ada__n_estimators=250;, score=0.496 total time= 1.0min
[CV 5/5] END ada__base_estimator__max_depth=None, ada__learning_rate=0.1, ada__n_estimators=250;, score=0.502 total time=  56.6s
[CV 1/5] END ada__base_estimator__max_depth=None, ada__learning_rate=0.1, ada__n_estimators=100;, score=0.506 total time=  50.7s
[CV 2/5] END ada__base_estimator__max_depth=None, ada__learning_rate=0.1, ada__n_estimators=100;, score=0.512 total time=  53.7s
[CV 3/5] END ada__base_estimator__m

In the first round of grid search models with the lowest learning rate and the lowest number of trees were selected. So in the second round values of these parameters will be decreased

Round 2

In [None]:
param_grid = {
    'ada__learning_rate': [0.1, 0.01, 0.001, 0.0001, 0.00001],
    'ada__n_estimators': [100, 50, 25],
    'ada__base_estimator__max_depth': [5, 3, 1]
}

grid = GridSearchCV(pipe, param_grid, cv=5,
                    scoring='roc_auc',
                    verbose=3)

grid.fit(X_train_dummies, y_train)

print('Best score: ', grid.best_score_)
print('Best params: ', grid.best_params_)

Fitting 5 folds for each of 45 candidates, totalling 225 fits
[CV 1/5] END ada__base_estimator__max_depth=5, ada__learning_rate=0.1, ada__n_estimators=100;, score=0.527 total time=  59.9s
[CV 2/5] END ada__base_estimator__max_depth=5, ada__learning_rate=0.1, ada__n_estimators=100;, score=0.550 total time=  52.7s
[CV 3/5] END ada__base_estimator__max_depth=5, ada__learning_rate=0.1, ada__n_estimators=100;, score=0.553 total time=  45.6s
[CV 4/5] END ada__base_estimator__max_depth=5, ada__learning_rate=0.1, ada__n_estimators=100;, score=0.558 total time=  48.9s
[CV 5/5] END ada__base_estimator__max_depth=5, ada__learning_rate=0.1, ada__n_estimators=100;, score=0.549 total time=  53.7s
[CV 1/5] END ada__base_estimator__max_depth=5, ada__learning_rate=0.1, ada__n_estimators=50;, score=0.545 total time=  52.1s
[CV 2/5] END ada__base_estimator__max_depth=5, ada__learning_rate=0.1, ada__n_estimators=50;, score=0.560 total time=  46.4s
[CV 3/5] END ada__base_estimator__max_depth=5, ada__learni

In the second round of grid search, models with max_depth=1 and low numbers of estimators were selected. So in the third round max_depth values from {3, 2, 1} will be tested with relatively low number of estimators and diffenent values of learning_rate

Round 3

In [None]:
param_grid = {
    'ada__learning_rate': [0.4, 0.3, 0.2, 0.1, 0.05],
    'ada__n_estimators': [40, 30, 20, 10],
    'ada__base_estimator__max_depth': [3, 2, 1]
}

grid = GridSearchCV(pipe, param_grid, cv=5,
                    scoring='roc_auc',
                    verbose=3)

grid.fit(X_train_dummies, y_train)

print('Best score: ', grid.best_score_)
print('Best params: ', grid.best_params_)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
[CV 1/5] END ada__base_estimator__max_depth=3, ada__learning_rate=0.4, ada__n_estimators=40;, score=0.559 total time=  36.9s
[CV 2/5] END ada__base_estimator__max_depth=3, ada__learning_rate=0.4, ada__n_estimators=40;, score=0.573 total time=  35.4s
[CV 3/5] END ada__base_estimator__max_depth=3, ada__learning_rate=0.4, ada__n_estimators=40;, score=0.565 total time=  33.2s
[CV 4/5] END ada__base_estimator__max_depth=3, ada__learning_rate=0.4, ada__n_estimators=40;, score=0.566 total time=  32.2s
[CV 5/5] END ada__base_estimator__max_depth=3, ada__learning_rate=0.4, ada__n_estimators=40;, score=0.557 total time=  32.0s
[CV 1/5] END ada__base_estimator__max_depth=3, ada__learning_rate=0.4, ada__n_estimators=30;, score=0.564 total time=  34.0s
[CV 2/5] END ada__base_estimator__max_depth=3, ada__learning_rate=0.4, ada__n_estimators=30;, score=0.579 total time=  33.4s
[CV 3/5] END ada__base_estimator__max_depth=3, ada__learning_ra

Based on the results of the previous rounds we can conclude, that the optimal values for max_depth is 1 or 2, learning_rate should belong to range from 0.01 to 0.1, and the optimal value for n_estimators is below 50. Taking that into account, we'll perform the 4th round of grid search

Round 4

In [10]:
param_grid = {
    'ada__learning_rate': [0.1, 0.075, 0.05, 0.025, 0.01],
    'ada__n_estimators': [50, 40, 30, 20, 10],
    'ada__base_estimator__max_depth': [3, 2, 1]
}

grid = GridSearchCV(pipe, param_grid, cv=5,
                    scoring='roc_auc',
                    verbose=3)

grid.fit(X_train_dummies, y_train)

print('Best score: ', grid.best_score_)
print('Best params: ', grid.best_params_)

Fitting 5 folds for each of 75 candidates, totalling 375 fits
[CV 1/5] END ada__base_estimator__max_depth=3, ada__learning_rate=0.1, ada__n_estimators=50;, score=0.586 total time=  46.0s
[CV 2/5] END ada__base_estimator__max_depth=3, ada__learning_rate=0.1, ada__n_estimators=50;, score=0.571 total time=  35.5s
[CV 3/5] END ada__base_estimator__max_depth=3, ada__learning_rate=0.1, ada__n_estimators=50;, score=0.582 total time=  37.2s
[CV 4/5] END ada__base_estimator__max_depth=3, ada__learning_rate=0.1, ada__n_estimators=50;, score=0.576 total time=  35.8s
[CV 5/5] END ada__base_estimator__max_depth=3, ada__learning_rate=0.1, ada__n_estimators=50;, score=0.575 total time=  35.4s
[CV 1/5] END ada__base_estimator__max_depth=3, ada__learning_rate=0.1, ada__n_estimators=40;, score=0.587 total time=  37.3s
[CV 2/5] END ada__base_estimator__max_depth=3, ada__learning_rate=0.1, ada__n_estimators=40;, score=0.577 total time=  35.0s
[CV 3/5] END ada__base_estimator__max_depth=3, ada__learning_ra

# Baseline ExtraTrees model

In [13]:
et_baseline = ExtraTreesClassifier(random_state=0)
et_baseline.fit(X_train, y_train)

print("Train set AUC: {}".format(roc_auc_score(y_train, et_baseline.predict(X_train))))
print("Test set AUC: {}".format(roc_auc_score(y_test,  et_baseline.predict(X_test))))

Train set AUC: 1.0
Test set AUC: 0.499917259942021


# Tuning ExtraTrees parameters values

In [None]:
### DELETE THIS CELL ###
param_grid = {'n_estimators': [100, 250],
              'max_features': [3, 4, 5, 6, 20],
              'max_depth': [None, 20, 10, 5],
              'min_samples_leaf': [1, 10, 25, 50],
              'min_impurity_decrease': [0, 0.05]}

In [14]:
pipe = Pipeline([
    ('transformer', ColumnTransformer([
        ('scaler', StandardScaler(), [*range(0, 19)])],
         remainder='passthrough')),
    ('imputer', KNNImputer()),
    ('et', ExtraTreesClassifier(random_state=42))
])

In [16]:
distr = {
    'et__n_estimators': randint(10, 1000),
    'et__max_features': randint(1, 50),
    'et__max_depth': randint(1, 20),
    'et__min_samples_leaf': randint(1, 50),
    'et__min_impurity_decrease': uniform(0, 0.1)
}

clf = RandomizedSearchCV(pipe, distr, n_iter=10, cv=5, scoring='roc_auc',
                         random_state=42, verbose=3)

clf.fit(X_train_dummies, y_train)

print('Best score: ', grid.best_score_)
print('Best params: ', grid.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END et__max_depth=8.49080237694725, et__max_features=48.53571532049581, et__min_impurity_decrease=0.0731993941811405, et__min_samples_leaf=30.93292420985183, et__n_estimators=164.45845403801215;, score=nan total time=  24.5s
[CV 2/5] END et__max_depth=8.49080237694725, et__max_features=48.53571532049581, et__min_impurity_decrease=0.0731993941811405, et__min_samples_leaf=30.93292420985183, et__n_estimators=164.45845403801215;, score=nan total time=  23.2s


KeyboardInterrupt: ignored