<a href="https://colab.research.google.com/github/antbartash/product_failure/blob/main/trees.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading libraries and data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.metrics import roc_auc_score

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
X_train = pd.read_csv('drive/MyDrive/product_failure/data/X_train_dummies_scaled_knn.csv', index_col=0)
X_test = pd.read_csv('drive/MyDrive/product_failure/data/X_test_dummies_scaled_knn.csv', index_col=0)
y_train = pd.read_csv('drive/MyDrive/product_failure/data/y_train.csv', index_col=0)
y_test = pd.read_csv('drive/MyDrive/product_failure/data/y_test.csv', index_col=0)

Check first 5 observations and data shapes to make sure that the data was read correctly

In [4]:
X_train.head()

Unnamed: 0,loading,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,...,"measurement_2_grouped_(-0.001, 2.0]","measurement_2_grouped_(2.0, 3.0]","measurement_2_grouped_(3.0, 4.0]","measurement_2_grouped_(4.0, 5.0]","measurement_2_grouped_(5.0, 6.0]","measurement_2_grouped_(6.0, 7.0]","measurement_2_grouped_(7.0, 8.0]","measurement_2_grouped_(8.0, 9.0]","measurement_2_grouped_(9.0, 11.0]","measurement_2_grouped_(11.0, 24.0]"
0,-0.513603,2.078416,-1.485328,-0.681222,-0.653551,-0.015593,-0.319033,-0.014153,-0.06752,0.327396,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0364,-1.070214,0.179974,-0.379065,-0.549359,-0.731627,1.703556,0.598172,-0.121901,-0.066527,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.019484,-0.828012,1.845276,-0.379065,-1.697273,0.577577,-1.657378,-0.286633,1.45214,0.304632,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.060479,0.867404,0.179974,-0.076908,-1.114317,0.677278,0.361192,-0.752161,-0.738225,0.907394,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,-0.54974,-0.585809,-0.533727,-0.076908,-0.828338,0.043825,-1.348916,0.015005,0.087568,-1.010756,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [5]:
print("X_train.shape: ", X_train.shape)
print("y_train.shape: ", y_train.shape)
print("X_test.shape: ", X_test.shape)
print("y_test.shape:", y_test.shape)

X_train.shape:  (19927, 64)
y_train.shape:  (19927, 1)
X_test.shape:  (6643, 64)
y_test.shape: (6643, 1)


# Baseline tree model <br>
Build a baseline decision tree model and evaluate its performance on train and test sets. For performance evaluation AUC will be used

In [6]:
tree_baseline = DecisionTreeClassifier()
tree_baseline.fit(X_train, y_train)

print("Train set AUC: {}".format(roc_auc_score(y_train, tree_baseline.predict(X_train))))
print("Test set AUC: {}".format(roc_auc_score(y_test, tree_baseline.predict(X_test))))

Train set AUC: 1.0
Test set AUC: 0.4994318651547124


AUC values crearly indicate, that the model overfits the training set. We can try to tune its parameters to improve quality of the model

# Tuning trees parameters values <br>
To tune parameters values GridSearchCV will be used. To prevent data leakage, we'll apply data transformation on each training folder separately. In order to do that, we need to read the datasets created as a result of one-hot encoding of the categorical features, but without other preprocessing steps <br>
Also features dtype will be changed from float64 to float32 to speed up computations

In [7]:
X_train_dummies = pd.read_csv('drive/MyDrive/product_failure/data/X_train_dummies.csv', index_col=0)
X_test_dummies = pd.read_csv('drive/MyDrive/product_failure/data/X_test_dummies.csv', index_col=0)

X_train_dummies = X_train_dummies.astype(np.float32)
X_test_dummies = X_test_dummies.astype(np.float32)

In [8]:
pipe = Pipeline([
    ('transformer', ColumnTransformer([
        ('scaler', StandardScaler(), [*range(0, 19)])],
         remainder='passthrough')),
    ('imputer', KNNImputer()),
    ('tree', DecisionTreeClassifier(random_state=0))
])

Round 1

In [None]:
param_grid = {
    'tree__max_depth': [None, 20, 10, 5, 3],
    'tree__min_samples_leaf': [100, 50, 25, 1],
    'tree__min_impurity_decrease': [0, 0.01, 0.1]
}

grid = GridSearchCV(pipe, param_grid, cv=5,
                    scoring='roc_auc',
                    verbose=2)

grid.fit(X_train_dummies, y_train)

print('Best score: ', grid.best_score_)
print('Best params: ', grid.best_params_)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
[CV] END tree__max_depth=None, tree__min_impurity_decrease=0, tree__min_samples_leaf=100; total time=  28.2s
[CV] END tree__max_depth=None, tree__min_impurity_decrease=0, tree__min_samples_leaf=100; total time=  27.6s
[CV] END tree__max_depth=None, tree__min_impurity_decrease=0, tree__min_samples_leaf=100; total time=  25.8s
[CV] END tree__max_depth=None, tree__min_impurity_decrease=0, tree__min_samples_leaf=100; total time=  27.1s
[CV] END tree__max_depth=None, tree__min_impurity_decrease=0, tree__min_samples_leaf=100; total time=  30.4s
[CV] END tree__max_depth=None, tree__min_impurity_decrease=0, tree__min_samples_leaf=50; total time=  30.9s
[CV] END tree__max_depth=None, tree__min_impurity_decrease=0, tree__min_samples_leaf=50; total time=  33.4s
[CV] END tree__max_depth=None, tree__min_impurity_decrease=0, tree__min_samples_leaf=50; total time=  30.7s
[CV] END tree__max_depth=None, tree__min_impurity_decrease=0, tree__m

In result of the first round of cross-validation, models with the highest regularization were selected. Therefore, in the next round we're incresing the regularization. <br>
Also the first round has shown, that changing min_impurity_decrease value from the default doesn't affect models performance. However, in the second round we'll also 0.001 as a values of min_impurity_decrease

Round 2

In [None]:
param_grid = {
    'tree__max_depth': [5, 4, 3, 2, 1],
    'tree__min_samples_leaf': [250, 200, 150, 100],
    'tree__min_impurity_decrease': [0, 0.001]
}

grid = GridSearchCV(pipe, param_grid, cv=5,
                    scoring='roc_auc',
                    verbose=2)

grid.fit(X_train_dummies, y_train)

print('Best score: ', grid.best_score_)
print('Best params: ', grid.best_params_)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV] END tree__max_depth=5, tree__min_impurity_decrease=0, tree__min_samples_leaf=250; total time=  31.6s
[CV] END tree__max_depth=5, tree__min_impurity_decrease=0, tree__min_samples_leaf=250; total time=  28.3s
[CV] END tree__max_depth=5, tree__min_impurity_decrease=0, tree__min_samples_leaf=250; total time=  26.5s
[CV] END tree__max_depth=5, tree__min_impurity_decrease=0, tree__min_samples_leaf=250; total time=  28.0s
[CV] END tree__max_depth=5, tree__min_impurity_decrease=0, tree__min_samples_leaf=250; total time=  27.4s
[CV] END tree__max_depth=5, tree__min_impurity_decrease=0, tree__min_samples_leaf=200; total time=  30.3s
[CV] END tree__max_depth=5, tree__min_impurity_decrease=0, tree__min_samples_leaf=200; total time=  29.6s
[CV] END tree__max_depth=5, tree__min_impurity_decrease=0, tree__min_samples_leaf=200; total time=  27.6s
[CV] END tree__max_depth=5, tree__min_impurity_decrease=0, tree__min_samples_leaf=200; tot

The results of the second round show, that the optimal value for max_depth is 3, optimal min_impurity decrease equals 0. <br>
In the third round 400, 350, 300, 250 will be tested as a values of min_samples_leaf.

Round 3

In [12]:
param_grid = {
    'tree__max_depth': [4, 3, 2],
    'tree__min_samples_leaf': [400, 350, 300, 250]
}

grid = GridSearchCV(pipe, param_grid, cv=5,
                    scoring='roc_auc',
                    verbose=2)

grid.fit(X_train_dummies, y_train)

print('Best score: ', grid.best_score_)
print('Best params: ', grid.best_params_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END ......tree__max_depth=4, tree__min_samples_leaf=400; total time= 2.0min
[CV] END ......tree__max_depth=4, tree__min_samples_leaf=400; total time= 2.0min
[CV] END ......tree__max_depth=4, tree__min_samples_leaf=400; total time= 1.4min
[CV] END ......tree__max_depth=4, tree__min_samples_leaf=400; total time= 1.4min
[CV] END ......tree__max_depth=4, tree__min_samples_leaf=400; total time= 1.4min
[CV] END ......tree__max_depth=4, tree__min_samples_leaf=350; total time= 1.9min
[CV] END ......tree__max_depth=4, tree__min_samples_leaf=350; total time= 1.9min
[CV] END ......tree__max_depth=4, tree__min_samples_leaf=350; total time= 1.3min
[CV] END ......tree__max_depth=4, tree__min_samples_leaf=350; total time= 1.4min
[CV] END ......tree__max_depth=4, tree__min_samples_leaf=350; total time= 1.4min
[CV] END ......tree__max_depth=4, tree__min_samples_leaf=300; total time= 1.9min
[CV] END ......tree__max_depth=4, tree__min_samp

The optimal values of a decision tree's parameters are max_depth=3, min_samples=300. Using these values a disicion tree on the whole dataset will be built.

# Tree final

In [13]:
tree_final = Pipeline([
    ('transformer', ColumnTransformer([
        ('scaler', StandardScaler(), [*range(0, 19)])],
         remainder='passthrough')),
    ('imputer', KNNImputer()),
    ('tree', DecisionTreeClassifier(max_depth=3, min_samples_leaf=300,
                                    random_state=0))
])
tree_final.fit(X_train_dummies, y_train)


print("Train set AUC: {}".format(roc_auc_score(y_train, tree_final.predict(X_train_dummies))))
print("Test set AUC: {}".format(roc_auc_score(y_test, tree_final.predict(X_test_dummies))))

Train set AUC: 0.5
Test set AUC: 0.5
