<a href="https://colab.research.google.com/github/arthursl12/POC1/blob/main/POC1_Turbofan_FD001_Model_Tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import glob

In [2]:
# Enable HalvingSearch
# from sklearn.experimental import enable_halving_search_cv
# from sklearn.model_selection import HalvingRandomSearchCV, HalvingGridSearchCV
# from sklearn.model_selection import GridSearchCV
from sklearn.exceptions import ConvergenceWarning

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor

from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import PolynomialFeatures

In [4]:
# from sklearn.metrics import mean_squared_error, r2_score
# from sklearn.model_selection import cross_val_score

# from sklearn.linear_model import LinearRegression
# from sklearn.linear_model import SGDRegressor

# from sklearn.pipeline import Pipeline
# from sklearn.compose import TransformedTargetRegressor
# from sklearn.exceptions import ConvergenceWarning

# from sklearn.preprocessing import FunctionTransformer
# from sklearn.preprocessing import PolynomialFeatures
# from sklearn.preprocessing import StandardScaler

In [5]:
# from warnings import simplefilter

In [6]:
sns.set_palette('colorblind')

# Data Preparation

In [7]:
# Dataset Download 
os.system('git clone https://github.com/arthursl12/dataset_2')
os.system('mv /content/dataset_2/CMaps /content/CMaps')
os.system('mv /content/dataset_2/data_processing /content/data_processing')
os.system('rm -rf dataset_2')

0

In [8]:
from data_processing.processing import DatasetProcessing
from data_processing.training import HyperparameterSearch, reclipper_scorer
from data_processing.eval import Evaluation

In [9]:
proc = DatasetProcessing()

## Data Integration

The data are provided as a zip-compressed text file with 26 columns of numbers, separated by spaces. Each row is a snapshot of data taken during a single operational cycle, each column is a different variable. The columns correspond to:  

1) unit number   
2) time, in cycles  
3) operational setting 1  
4) operational setting 2  
5) operational setting 3    
6) sensor measurement 1    
7) sensor measurement 2  
...  
26) sensor measurement 20


There are 6 conditions (or combinations) which the 3 operational settings can take.  
Condition 1: Altitude = 0, Mach Number = 0, TRA = 100  
Condition 2: Altitude = 10, Mach Number = 0.25, TRA = 100  
Condition 3: Altitude = 20, Mach Number = 0.7 TRA = 100  
Condition 4: Altitude = 25, Mach Number = 0.62, TRA = 60  
Condition 5: Altitude = 35 Mach Number = 0.84, TRA = 100  
Condition 6: Altitude = 42, Mach Number = 0.84, TRA = 100  
  
There is slight variation in all these conditions so you may get numbers like 24.453 instead of 25 exactly.

FD001: Condition 1 only  
FD002: Mix of all the conditions  
FD003: Condition 1 only  
FD004: Mix of all conditions  


In [10]:
index_cols, settings_cols, sensors_cols, cols = proc.column_names()
train, test, y_test = proc.read_dataset(1)
train

Unnamed: 0,unit_number,time,op_1,op_2,op_3,s_0,s_1,s_2,s_3,s_4,...,s_11,s_12,s_13,s_14,s_15,s_16,s_17,s_18,s_19,s_20
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.70,1400.60,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.4190
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.00,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.20,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0000,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.80,8.4294,0.03,393,2388,100.0,38.90,23.4044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,100,196,-0.0004,-0.0003,100.0,518.67,643.49,1597.98,1428.63,14.62,...,519.49,2388.26,8137.60,8.4956,0.03,397,2388,100.0,38.49,22.9735
20627,100,197,-0.0016,-0.0005,100.0,518.67,643.54,1604.50,1433.58,14.62,...,519.68,2388.22,8136.50,8.5139,0.03,395,2388,100.0,38.30,23.1594
20628,100,198,0.0004,0.0000,100.0,518.67,643.42,1602.46,1428.18,14.62,...,520.01,2388.24,8141.05,8.5646,0.03,398,2388,100.0,38.44,22.9333
20629,100,199,-0.0011,0.0003,100.0,518.67,643.23,1605.26,1426.53,14.62,...,519.67,2388.23,8139.29,8.5389,0.03,395,2388,100.0,38.29,23.0640


## Preprocessing

### Test Set Transformation 
Test set has samples for all cycles, but has annotations only for last one

In [11]:
test.shape, y_test.shape

((13096, 26), (100, 1))

In [12]:
test_last = proc.transform_test(test)
test_last.head()

Unnamed: 0,s_0,s_1,s_2,s_3,s_4,s_5,s_6,s_7,s_8,s_9,...,s_11,s_12,s_13,s_14,s_15,s_16,s_17,s_18,s_19,s_20
0,518.67,642.58,1581.22,1398.91,14.62,21.61,554.42,2388.08,9056.4,1.3,...,521.79,2388.06,8130.11,8.4024,0.03,393,2388,100.0,38.81,23.3552
1,518.67,642.55,1586.59,1410.83,14.62,21.61,553.52,2388.1,9044.77,1.3,...,521.74,2388.09,8126.9,8.4505,0.03,391,2388,100.0,38.81,23.2618
2,518.67,642.88,1589.75,1418.89,14.62,21.61,552.59,2388.16,9049.26,1.3,...,520.83,2388.14,8131.46,8.4119,0.03,395,2388,100.0,38.93,23.274
3,518.67,642.78,1594.53,1406.88,14.62,21.61,552.64,2388.13,9051.3,1.3,...,521.88,2388.11,8133.64,8.4634,0.03,395,2388,100.0,38.58,23.2581
4,518.67,642.27,1589.94,1419.36,14.62,21.61,553.29,2388.1,9053.99,1.3,...,521.0,2388.15,8125.74,8.4362,0.03,394,2388,100.0,38.75,23.4117


In [13]:
X_test = test_last

### Remaining Useful Life (RUL)

In [14]:
train = proc.add_remaining_useful_life_linear(train)
train[index_cols+['RUL']].head()

Unnamed: 0,unit_number,time,RUL
0,1,1,191
1,1,2,190
2,1,3,189
3,1,4,188
4,1,5,187


## Attributes and target separation

In [15]:
X_train, y_train = proc.X_y_train_divide(train)

In [16]:
y_train.head()

Unnamed: 0,RUL
0,191
1,190
2,189
3,188
4,187


In [17]:
X_train.head()

Unnamed: 0,s_0,s_1,s_2,s_3,s_4,s_5,s_6,s_7,s_8,s_9,...,s_11,s_12,s_13,s_14,s_15,s_16,s_17,s_18,s_19,s_20
0,518.67,641.82,1589.7,1400.6,14.62,21.61,554.36,2388.06,9046.19,1.3,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,518.67,642.15,1591.82,1403.14,14.62,21.61,553.75,2388.04,9044.07,1.3,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,518.67,642.35,1587.99,1404.2,14.62,21.61,554.26,2388.08,9052.94,1.3,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,518.67,642.35,1582.79,1401.87,14.62,21.61,554.45,2388.11,9049.48,1.3,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,518.67,642.37,1582.85,1406.22,14.62,21.61,554.0,2388.06,9055.15,1.3,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


## Training and Evaluation functions

In [18]:
eval = Evaluation()

In [19]:
search = HyperparameterSearch()

# DecisionTreeRegressor

## Linear RUL

In [55]:
model = Pipeline([
    ('tree_reg'  ,   DecisionTreeRegressor(random_state=42))
])

In [56]:
GRID_SEARCH = True
if (GRID_SEARCH):
    param_distributions = {
        "tree_reg__criterion": ["squared_error","friedman_mse","absolute_error","poisson"],
        "tree_reg__splitter": ["best", "random"],
        "tree_reg__max_depth": [None,100,200,300,400,500],
        "tree_reg__min_samples_split": [2,5,10,15,20],
        "tree_reg__min_samples_leaf": [1,2,5,10,15,20,50,100],
        "tree_reg__max_features": [None, "sqrt", "log2"],
        "tree_reg__min_impurity_decrease": list(np.arange(0,147)/10),
        "tree_reg__ccp_alpha": list(np.round(np.linspace(0, 2, 81), decimals=3)) 
    }
    model = search.run_HR_GS(model, X_train, y_train, param_distributions, 
                            scorer='r2', ignore_warnings=True)
    print(model)

n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 500
max_resources_: 20631
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 41
n_resources: 500
Fitting 5 folds for each of 41 candidates, totalling 205 fits
----------
iter: 1
n_candidates: 14
n_resources: 1500
Fitting 5 folds for each of 14 candidates, totalling 70 fits
----------
iter: 2
n_candidates: 5
n_resources: 4500
Fitting 5 folds for each of 5 candidates, totalling 25 fits
----------
iter: 3
n_candidates: 2
n_resources: 13500
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best params:  {'tree_reg__splitter': 'best', 'tree_reg__min_samples_split': 2, 'tree_reg__min_samples_leaf': 20, 'tree_reg__min_impurity_decrease': 6.8, 'tree_reg__max_features': 'sqrt', 'tree_reg__max_depth': 200, 'tree_reg__criterion': 'squared_error', 'tree_reg__ccp_alpha': 1.925}
Pipeline(steps=[('tree_reg',
                 DecisionTreeRegressor(ccp_alpha=1.925, max_depth=200,
          

Best Model:
```
DecisionTreeRegressor(ccp_alpha=1.925, max_depth=200,
                                       max_features='sqrt',
                                       min_impurity_decrease=6.8,
                                       min_samples_leaf=20, random_state=42))
```

In [57]:
model = Pipeline([
    ('tree_reg'  ,   DecisionTreeRegressor(ccp_alpha=1.925, max_depth=200,
                                       max_features='sqrt',
                                       min_impurity_decrease=6.8,
                                       min_samples_leaf=20, random_state=42))
])
model

Pipeline(steps=[('tree_reg',
                 DecisionTreeRegressor(ccp_alpha=1.925, max_depth=200,
                                       max_features='sqrt',
                                       min_impurity_decrease=6.8,
                                       min_samples_leaf=20, random_state=42))])

In [58]:
model.fit(X_train, y_train)
eval.show_result(y_train, model.predict(X_train))
eval.show_result_cv(y_train, X_train, model)

R2=0.572,RMSE=-45.087
(CV) R2=0.528,RMSE=-46.682


In [59]:
eval.show_result(y_test, model.predict(X_test))

R2=0.314,RMSE=-34.411


Worse than simple linear regression

## Non-linear RUL

In [62]:
model = Pipeline([
    ('trf_reg' ,TransformedTargetRegressor(
        check_inverse=False,
        regressor   = DecisionTreeRegressor(random_state=42),
        transformer = FunctionTransformer(np.clip, 
                                          kw_args={'a_min':0,'a_max':50})))
    ])

In [64]:
GRID_SEARCH = True
if (GRID_SEARCH):
    param_distributions = {
        "trf_reg__transformer__kw_args": search.generate_clip_dicts(80,150,1),
        "trf_reg__regressor__criterion": ["squared_error","friedman_mse","absolute_error","poisson"],
        "trf_reg__regressor__splitter": ["best", "random"],
        "trf_reg__regressor__max_depth": [None,100,200,300,400,500,600,700],
        "trf_reg__regressor__min_samples_split": [2,5,10,15,20],
        "trf_reg__regressor__min_samples_leaf": [1,2,5,10,15,20,50,100],
        "trf_reg__regressor__max_features": [None, "sqrt", "log2"],
        "trf_reg__regressor__min_impurity_decrease": list(np.arange(0,147)/10),
        "trf_reg__regressor__ccp_alpha": list(np.round(np.linspace(0, 2, 81), decimals=3)) 
    
    }
    model = search.run_HR_GS(model, X_train, y_train, param_distributions)
    print(model)

n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 500
max_resources_: 20631
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 41
n_resources: 500
Fitting 5 folds for each of 41 candidates, totalling 205 fits
----------
iter: 1
n_candidates: 14
n_resources: 1500
Fitting 5 folds for each of 14 candidates, totalling 70 fits
----------
iter: 2
n_candidates: 5
n_resources: 4500
Fitting 5 folds for each of 5 candidates, totalling 25 fits
----------
iter: 3
n_candidates: 2
n_resources: 13500
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best params:  {'trf_reg__transformer__kw_args': {'a_min': 0, 'a_max': 110}, 'trf_reg__regressor__splitter': 'best', 'trf_reg__regressor__min_samples_split': 2, 'trf_reg__regressor__min_samples_leaf': 50, 'trf_reg__regressor__min_impurity_decrease': 11.3, 'trf_reg__regressor__max_features': None, 'trf_reg__regressor__max_depth': 100, 'trf_reg__regressor__criterion': 'friedman_mse', 'trf_reg__r

Best Model:

```
{'trf_reg__transformer__kw_args': {'a_min': 0, 'a_max': 110}}

DecisionTreeRegressor(ccp_alpha=0.225,
    criterion='friedman_mse',
    max_depth=100,
    min_impurity_decrease=11.3,
    min_samples_leaf=50,
    random_state=42),
```



In [65]:
model = Pipeline([
    ('trf_reg' ,TransformedTargetRegressor(
        check_inverse=False,
        regressor   = DecisionTreeRegressor(ccp_alpha=0.225,
                                        criterion='friedman_mse',
                                        max_depth=100,
                                        min_impurity_decrease=11.3,
                                        min_samples_leaf=50,
                                        random_state=42),
        transformer = FunctionTransformer(np.clip, 
                                          kw_args={'a_min':0,'a_max':110})))
    ])
model

Pipeline(steps=[('trf_reg',
                 TransformedTargetRegressor(check_inverse=False,
                                            regressor=DecisionTreeRegressor(ccp_alpha=0.225,
                                                                            criterion='friedman_mse',
                                                                            max_depth=100,
                                                                            min_impurity_decrease=11.3,
                                                                            min_samples_leaf=50,
                                                                            random_state=42),
                                            transformer=FunctionTransformer(func=<function clip at 0x7f321b839cb0>,
                                                                            kw_args={'a_max': 110,
                                                                                     'a_min': 0})))])

In [66]:
model.fit(X_train, y_train)
reclipped_y =  model['trf_reg'].transformer.transform(y_train)
eval.show_result(reclipped_y, model.predict(X_train))
eval.show_result_cv(reclipped_y, X_train, model)

R2=0.828,RMSE=-14.969
(CV) R2=0.789,RMSE=-16.544


In [67]:
eval.show_result(y_test, model.predict(X_test))

R2=0.771,RMSE=-19.889


The best result we had so far in training and in testing

## PolyFeatures + Linear RUL

In [70]:
model = Pipeline([
    ('poly_ft'  ,   PolynomialFeatures()),
    ('tree_reg'  ,   DecisionTreeRegressor(random_state=42))
])
model

Pipeline(steps=[('poly_ft', PolynomialFeatures()),
                ('tree_reg', DecisionTreeRegressor(random_state=42))])

In [72]:
GRID_SEARCH = True
if (GRID_SEARCH):
    param_distributions = {
        "poly_ft__degree": [1,2,3],
        "poly_ft__interaction_only": [False, True],
        "poly_ft__include_bias": [True, False],
        "tree_reg__criterion": ["squared_error","friedman_mse","absolute_error","poisson"],
        "tree_reg__splitter": ["best", "random"],
        "tree_reg__max_depth": [None,100,200,300,400,500],
        "tree_reg__min_samples_split": [2,5,10,15,20],
        "tree_reg__min_samples_leaf": [1,2,5,10,15,20,50,100],
        "tree_reg__max_features": [None, "sqrt", "log2"],
        "tree_reg__min_impurity_decrease": list(np.arange(0,147)/10),
        "tree_reg__ccp_alpha": list(np.round(np.linspace(0, 2, 81), decimals=3)) 
    }
    model = search.run_HR_GS(model, X_train, y_train, 
                            param_distributions, scorer='r2')
    print(model)

n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 500
max_resources_: 20631
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 41
n_resources: 500
Fitting 5 folds for each of 41 candidates, totalling 205 fits
----------
iter: 1
n_candidates: 14
n_resources: 1500
Fitting 5 folds for each of 14 candidates, totalling 70 fits
----------
iter: 2
n_candidates: 5
n_resources: 4500
Fitting 5 folds for each of 5 candidates, totalling 25 fits
----------
iter: 3
n_candidates: 2
n_resources: 13500
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best params:  {'tree_reg__splitter': 'best', 'tree_reg__min_samples_split': 2, 'tree_reg__min_samples_leaf': 100, 'tree_reg__min_impurity_decrease': 12.6, 'tree_reg__max_features': 'sqrt', 'tree_reg__max_depth': 300, 'tree_reg__criterion': 'friedman_mse', 'tree_reg__ccp_alpha': 1.55, 'poly_ft__interaction_only': True, 'poly_ft__include_bias': True, 'poly_ft__degree': 3}
Pipeline(steps=[('poly

Best Model:
```
PolynomialFeatures(degree=3, interaction_only=True))
DecisionTreeRegressor(ccp_alpha=1.55, criterion='friedman_mse',
        max_depth=300, max_features='sqrt',
        min_impurity_decrease=12.6,
        min_samples_leaf=100,
        random_state=42))
```


In [74]:
model = Pipeline([
    ('poly_ft'  ,   PolynomialFeatures(degree=3, interaction_only=True)),
    ('tree_reg'  ,   DecisionTreeRegressor(ccp_alpha=1.55, 
                                    criterion='friedman_mse',
                                    max_depth=300, max_features='sqrt',
                                    min_impurity_decrease=12.6,
                                    min_samples_leaf=100,
                                    random_state=42))
])
model

Pipeline(steps=[('poly_ft',
                 PolynomialFeatures(degree=3, interaction_only=True)),
                ('tree_reg',
                 DecisionTreeRegressor(ccp_alpha=1.55, criterion='friedman_mse',
                                       max_depth=300, max_features='sqrt',
                                       min_impurity_decrease=12.6,
                                       min_samples_leaf=100,
                                       random_state=42))])

In [75]:
model.fit(X_train, y_train)
eval.show_result(y_train, model.predict(X_train))
eval.show_result_cv(y_train, X_train, model)

R2=0.636,RMSE=-41.540
(CV) R2=0.569,RMSE=-44.636


In [76]:
eval.show_result(y_test, model.predict(X_test))

R2=0.278,RMSE=-35.308


No clear benefit from polynomial features in linear RUL, as with the other models. Actually, it worsened the results.

## PolyFeatures + Non-Linear RUL

In [77]:
model = Pipeline([
    ('poly_ft'  ,   PolynomialFeatures()),
    ('trf_reg' ,TransformedTargetRegressor(
        check_inverse=False,
        regressor   = DecisionTreeRegressor(random_state=42),
        transformer = FunctionTransformer(np.clip, 
                                          kw_args={'a_min':0,'a_max':96})))
])

In [79]:
# ~3min
GRID_SEARCH = True
if (GRID_SEARCH):
    param_distributions = {
        "poly_ft__degree": [1,2,3],
        "poly_ft__interaction_only": [False, True],
        "poly_ft__include_bias": [True, False],
        "trf_reg__transformer__kw_args": search.generate_clip_dicts(70,150,1),
        "trf_reg__regressor__criterion": ["squared_error","friedman_mse","absolute_error","poisson"],
        "trf_reg__regressor__splitter": ["best", "random"],
        "trf_reg__regressor__max_depth": [None,100,200,300,400,500,600,700],
        "trf_reg__regressor__min_samples_split": [2,5,10,15,20],
        "trf_reg__regressor__min_samples_leaf": [1,2,5,10,15,20,50,100],
        "trf_reg__regressor__max_features": [None, "sqrt", "log2"],
        "trf_reg__regressor__min_impurity_decrease": list(np.arange(0,147)/10),
        "trf_reg__regressor__ccp_alpha": list(np.round(np.linspace(0, 2, 81), decimals=3)) 
    }
    model = search.run_HR_GS(model, X_train, y_train, param_distributions)
    print(model)

n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 500
max_resources_: 20631
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 41
n_resources: 500
Fitting 5 folds for each of 41 candidates, totalling 205 fits
----------
iter: 1
n_candidates: 14
n_resources: 1500
Fitting 5 folds for each of 14 candidates, totalling 70 fits
----------
iter: 2
n_candidates: 5
n_resources: 4500
Fitting 5 folds for each of 5 candidates, totalling 25 fits
----------
iter: 3
n_candidates: 2
n_resources: 13500
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best params:  {'trf_reg__transformer__kw_args': {'a_min': 0, 'a_max': 92}, 'trf_reg__regressor__splitter': 'best', 'trf_reg__regressor__min_samples_split': 10, 'trf_reg__regressor__min_samples_leaf': 1, 'trf_reg__regressor__min_impurity_decrease': 0.4, 'trf_reg__regressor__max_features': None, 'trf_reg__regressor__max_depth': 400, 'trf_reg__regressor__criterion': 'squared_error', 'trf_reg__re

Best Model:
```
'trf_reg__transformer__kw_args': {'a_min': 0, 'a_max': 92}
PolynomialFeatures(degree=3, interaction_only=True)
DecisionTreeRegressor(ccp_alpha=1.175,
                    max_depth=400,
                    min_impurity_decrease=0.4,
                    min_samples_split=10,
                    random_state=42),
```


In [80]:
model = Pipeline([
    ('poly_ft', PolynomialFeatures(degree=3, interaction_only=True)),
    ('trf_reg' ,TransformedTargetRegressor(
        check_inverse=False,
        regressor   = DecisionTreeRegressor(ccp_alpha=1.175,
                                            max_depth=400,
                                            min_impurity_decrease=0.4,
                                            min_samples_split=10,
                                            random_state=42),
        transformer = FunctionTransformer(np.clip, 
                                          kw_args={'a_min':0,'a_max':92})))
])
model

Pipeline(steps=[('poly_ft',
                 PolynomialFeatures(degree=3, interaction_only=True)),
                ('trf_reg',
                 TransformedTargetRegressor(check_inverse=False,
                                            regressor=DecisionTreeRegressor(ccp_alpha=1.175,
                                                                            max_depth=400,
                                                                            min_impurity_decrease=0.4,
                                                                            min_samples_split=10,
                                                                            random_state=42),
                                            transformer=FunctionTransformer(func=<function clip at 0x7f321b839cb0>,
                                                                            kw_args={'a_max': 92,
                                                                                     'a_min': 0})))])

In [81]:
# ~3min
model.fit(X_train, y_train)
reclipped_y =  model['trf_reg'].transformer.transform(y_train)
eval.show_result(reclipped_y, model.predict(X_train))
eval.show_result_cv(reclipped_y, X_train, model)

R2=0.834,RMSE=-11.864
(CV) R2=0.816,RMSE=-12.450


In [82]:
eval.show_result(y_test, model.predict(X_test))

R2=0.684,RMSE=-23.371


Using the polynominal features actually worsened the perfomance a little bit, when comparing with not using them and the non-linear RUL.

# RandomForestRegressor

## Linear RUL

In [87]:
model = Pipeline([
    ('tree_reg'  ,   RandomForestRegressor(random_state=42, n_jobs=-1))
])

In [88]:
# We need specific shape 1D arrays for this model
y_train = np.array(y_train).ravel()

In [89]:
# ~7min
GRID_SEARCH = False
if (GRID_SEARCH):
    param_distributions = {
        "tree_reg__n_estimators": [50,100,200,500],
        "tree_reg__criterion": ['squared_error','absolute_error','poisson'],
        "tree_reg__max_depth": [None, 50, 100, 150],
        "tree_reg__max_depth": [None,100,200,300,400,500],
        "tree_reg__min_samples_split": [2,5,10,15,20],
        "tree_reg__min_samples_leaf": [1,2,5,10,15,20,50,100],
        "tree_reg__max_features": [None, "sqrt", "log2"],
        "tree_reg__min_impurity_decrease": list(np.arange(0,147)/10),
        "tree_reg__ccp_alpha": list(np.round(np.linspace(0, 2, 81), decimals=3)),
        "tree_reg__oob_score": [True, False]
    }
    model = search.run_HR_GS(model, X_train, y_train, param_distributions, 
                            scorer='r2', ignore_warnings=True)
    print(model)

n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 500
max_resources_: 20631
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 41
n_resources: 500
Fitting 5 folds for each of 41 candidates, totalling 205 fits
----------
iter: 1
n_candidates: 14
n_resources: 1500
Fitting 5 folds for each of 14 candidates, totalling 70 fits
----------
iter: 2
n_candidates: 5
n_resources: 4500
Fitting 5 folds for each of 5 candidates, totalling 25 fits
----------
iter: 3
n_candidates: 2
n_resources: 13500
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best params:  {'tree_reg__oob_score': False, 'tree_reg__n_estimators': 50, 'tree_reg__min_samples_split': 5, 'tree_reg__min_samples_leaf': 10, 'tree_reg__min_impurity_decrease': 1.5, 'tree_reg__max_features': 'log2', 'tree_reg__max_depth': 500, 'tree_reg__criterion': 'squared_error', 'tree_reg__ccp_alpha': 0.025}
Pipeline(steps=[('tree_reg',
                 RandomForestRegressor(ccp_alpha=0.

Best Model:
```
RandomForestRegressor(ccp_alpha=0.025, max_depth=500,
                                       max_features='log2',
                                       min_impurity_decrease=1.5,
                                       min_samples_leaf=10, min_samples_split=5,
                                       n_estimators=50, n_jobs=-1,
                                       random_state=42))
```

In [91]:
model = Pipeline([
    ('tree_reg'  ,   RandomForestRegressor(ccp_alpha=0.025, max_depth=500,
                                       max_features='log2',
                                       min_impurity_decrease=1.5,
                                       min_samples_leaf=10, min_samples_split=5,
                                       n_estimators=50, n_jobs=-1,
                                       random_state=42))
])
model

Pipeline(steps=[('tree_reg',
                 RandomForestRegressor(ccp_alpha=0.025, max_depth=500,
                                       max_features='log2',
                                       min_impurity_decrease=1.5,
                                       min_samples_leaf=10, min_samples_split=5,
                                       n_estimators=50, n_jobs=-1,
                                       random_state=42))])

In [92]:
model.fit(X_train, y_train)
eval.show_result(y_train, model.predict(X_train))
eval.show_result_cv(y_train, X_train, model)

R2=0.664,RMSE=-39.952
(CV) R2=0.611,RMSE=-42.434


In [93]:
eval.show_result(y_test, model.predict(X_test))

R2=0.479,RMSE=-30.003


As bad as linear regression

## Non-linear RUL

In [94]:
model = Pipeline([
    ('trf_reg' ,TransformedTargetRegressor(
        check_inverse=False,
        regressor   = RandomForestRegressor(random_state=42, n_jobs=-1),
        transformer = FunctionTransformer(np.clip, 
                                          kw_args={'a_min':0,'a_max':50})))
    ])

In [95]:
# ~8min
GRID_SEARCH = False
if (GRID_SEARCH):
    param_distributions = {
        "trf_reg__transformer__kw_args": search.generate_clip_dicts(80,150,1),
        "trf_reg__regressor__n_estimators": [50,100,200,500],
        "trf_reg__regressor__criterion": ['squared_error','absolute_error','poisson'],
        "trf_reg__regressor__max_depth": [None, 50, 100, 150],
        "trf_reg__regressor__max_depth": [None,100,200,300,400,500],
        "trf_reg__regressor__min_samples_split": [2,5,10,15,20],
        "trf_reg__regressor__min_samples_leaf": [1,2,5,10,15,20,50,100],
        "trf_reg__regressor__max_features": [None, "sqrt", "log2"],
        "trf_reg__regressor__min_impurity_decrease": list(np.arange(0,147)/10),
        "trf_reg__regressor__ccp_alpha": list(np.round(np.linspace(0, 2, 81), decimals=3)),
        "trf_reg__regressor__oob_score": [True, False]
    
    }
    model = search.run_HR_GS(model, X_train, y_train, param_distributions)
    print(model)

n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 500
max_resources_: 20631
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 41
n_resources: 500
Fitting 5 folds for each of 41 candidates, totalling 205 fits
----------
iter: 1
n_candidates: 14
n_resources: 1500
Fitting 5 folds for each of 14 candidates, totalling 70 fits
----------
iter: 2
n_candidates: 5
n_resources: 4500
Fitting 5 folds for each of 5 candidates, totalling 25 fits
----------
iter: 3
n_candidates: 2
n_resources: 13500
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best params:  {'trf_reg__transformer__kw_args': {'a_min': 0, 'a_max': 101}, 'trf_reg__regressor__oob_score': False, 'trf_reg__regressor__n_estimators': 500, 'trf_reg__regressor__min_samples_split': 10, 'trf_reg__regressor__min_samples_leaf': 10, 'trf_reg__regressor__min_impurity_decrease': 2.5, 'trf_reg__regressor__max_features': 'log2', 'trf_reg__regressor__max_depth': 200, 'trf_reg__regress

Best Model:

```
{'trf_reg__transformer__kw_args': {'a_min': 0, 'a_max': 101}}

RandomForestRegressor(ccp_alpha=1.425,
                    max_depth=200,
                    max_features='log2',
                    min_impurity_decrease=2.5,
                    min_samples_leaf=10,
                    min_samples_split=10,
                    n_estimators=500,
                    n_jobs=-1,
                    random_state=42),
```



In [96]:
model = Pipeline([
    ('trf_reg' ,TransformedTargetRegressor(
        check_inverse=False,
        regressor   = RandomForestRegressor(ccp_alpha=1.425,
                                        max_depth=200,
                                        max_features='log2',
                                        min_impurity_decrease=2.5,
                                        min_samples_leaf=10,
                                        min_samples_split=10,
                                        n_estimators=500,
                                        n_jobs=-1,
                                        random_state=42),
        transformer = FunctionTransformer(np.clip, 
                                          kw_args={'a_min':0,'a_max':110})))
    ])
model

Pipeline(steps=[('trf_reg',
                 TransformedTargetRegressor(check_inverse=False,
                                            regressor=RandomForestRegressor(ccp_alpha=1.425,
                                                                            max_depth=200,
                                                                            max_features='log2',
                                                                            min_impurity_decrease=2.5,
                                                                            min_samples_leaf=10,
                                                                            min_samples_split=10,
                                                                            n_estimators=500,
                                                                            n_jobs=-1,
                                                                            random_state=42),
                                            transform

In [97]:
model.fit(X_train, y_train)
reclipped_y =  model['trf_reg'].transformer.transform(y_train)
eval.show_result(reclipped_y, model.predict(X_train))
eval.show_result_cv(reclipped_y, X_train, model)

R2=0.817,RMSE=-15.457
(CV) R2=0.806,RMSE=-15.840


In [98]:
eval.show_result(y_test, model.predict(X_test))

R2=0.778,RMSE=-19.599


Following the results from the decision trees, we have a good result and good generalization.

## PolyFeatures + Linear RUL

In [99]:
model = Pipeline([
    ('poly_ft'  ,   PolynomialFeatures()),
    ('tree_reg'  ,   RandomForestRegressor(random_state=42, n_jobs=-1)),
])
model

Pipeline(steps=[('poly_ft', PolynomialFeatures()),
                ('tree_reg',
                 RandomForestRegressor(n_jobs=-1, random_state=42))])

In [100]:
# ~25min
GRID_SEARCH = False
if (GRID_SEARCH):
    param_distributions = {
        "poly_ft__degree": [1,2,3],
        "poly_ft__interaction_only": [False, True],
        "poly_ft__include_bias": [True, False],
        "tree_reg__n_estimators": [50,100,200,500],
        "tree_reg__criterion": ['squared_error','absolute_error','poisson'],
        "tree_reg__max_depth": [None, 50, 100, 150],
        "tree_reg__max_depth": [None,100,200,300,400,500],
        "tree_reg__min_samples_split": [2,5,10,15,20],
        "tree_reg__min_samples_leaf": [1,2,5,10,15,20,50,100],
        "tree_reg__max_features": [None, "sqrt", "log2"],
        "tree_reg__min_impurity_decrease": list(np.arange(0,147)/10),
        "tree_reg__ccp_alpha": list(np.round(np.linspace(0, 2, 81), decimals=3)),
        "tree_reg__oob_score": [True, False]
    }
    model = search.run_HR_GS(model, X_train, y_train, 
                            param_distributions, scorer='r2')
    print(model)

n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 500
max_resources_: 20631
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 41
n_resources: 500
Fitting 5 folds for each of 41 candidates, totalling 205 fits
----------
iter: 1
n_candidates: 14
n_resources: 1500
Fitting 5 folds for each of 14 candidates, totalling 70 fits
----------
iter: 2
n_candidates: 5
n_resources: 4500
Fitting 5 folds for each of 5 candidates, totalling 25 fits
----------
iter: 3
n_candidates: 2
n_resources: 13500
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best params:  {'tree_reg__oob_score': True, 'tree_reg__n_estimators': 500, 'tree_reg__min_samples_split': 10, 'tree_reg__min_samples_leaf': 2, 'tree_reg__min_impurity_decrease': 2.0, 'tree_reg__max_features': 'log2', 'tree_reg__max_depth': 100, 'tree_reg__criterion': 'squared_error', 'tree_reg__ccp_alpha': 0.7, 'poly_ft__interaction_only': False, 'poly_ft__include_bias': True, 'poly_ft__degre

Best Model:
```
('poly_ft', PolynomialFeatures(degree=3))
RandomForestRegressor(ccp_alpha=0.7, max_depth=100,
                                       max_features='log2',
                                       min_impurity_decrease=2.0,
                                       min_samples_leaf=2, min_samples_split=10,
                                       n_estimators=500, n_jobs=-1,
                                       oob_score=True, random_state=42))
```


In [101]:
model = Pipeline([
    ('poly_ft'  ,   PolynomialFeatures(degree=3)),
    ('tree_reg'  ,   RandomForestRegressor(ccp_alpha=0.7, max_depth=100,
                                       max_features='log2',
                                       min_impurity_decrease=2.0,
                                       min_samples_leaf=2, min_samples_split=10,
                                       n_estimators=500, n_jobs=-1,
                                       oob_score=True, random_state=42))
])
model

Pipeline(steps=[('poly_ft', PolynomialFeatures(degree=3)),
                ('tree_reg',
                 RandomForestRegressor(ccp_alpha=0.7, max_depth=100,
                                       max_features='log2',
                                       min_impurity_decrease=2.0,
                                       min_samples_leaf=2, min_samples_split=10,
                                       n_estimators=500, n_jobs=-1,
                                       oob_score=True, random_state=42))])

In [102]:
model.fit(X_train, y_train)
eval.show_result(y_train, model.predict(X_train))
eval.show_result_cv(y_train, X_train, model)

R2=0.684,RMSE=-38.691
(CV) R2=0.614,RMSE=-42.249


In [103]:
eval.show_result(y_test, model.predict(X_test))

R2=0.439,RMSE=-31.132


No clear benefit from polynomial features in linear RUL, as with the other models. Actually, it worsened the results.

## PolyFeatures + Non-Linear RUL

In [104]:
model = Pipeline([
    ('poly_ft'  ,   PolynomialFeatures()),
    ('trf_reg' ,TransformedTargetRegressor(
        check_inverse=False,
        regressor   = RandomForestRegressor(random_state=42, n_jobs=-1),
        transformer = FunctionTransformer(np.clip, 
                                          kw_args={'a_min':0,'a_max':96})))
])

In [105]:
# ~1h BE AWARE
GRID_SEARCH = False
if (GRID_SEARCH):
    param_distributions = {
        "poly_ft__degree": [1,2,3],
        "poly_ft__interaction_only": [False, True],
        "poly_ft__include_bias": [True, False],
        "trf_reg__transformer__kw_args": search.generate_clip_dicts(70,150,1),
        "trf_reg__regressor__n_estimators": [50,100,200,500],
        "trf_reg__regressor__criterion": ['squared_error','absolute_error','poisson'],
        "trf_reg__regressor__max_depth": [None, 50, 100, 150],
        "trf_reg__regressor__max_depth": [None,100,200,300,400,500],
        "trf_reg__regressor__min_samples_split": [2,5,10,15,20],
        "trf_reg__regressor__min_samples_leaf": [1,2,5,10,15,20,50,100],
        "trf_reg__regressor__max_features": [None, "sqrt", "log2"],
        "trf_reg__regressor__min_impurity_decrease": list(np.arange(0,147)/10),
        "trf_reg__regressor__ccp_alpha": list(np.round(np.linspace(0, 2, 81), decimals=3)),
        "trf_reg__regressor__oob_score": [True, False]
    }
    model = search.run_HR_GS(model, X_train, y_train, param_distributions)
    print(model)

n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 500
max_resources_: 20631
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 41
n_resources: 500
Fitting 5 folds for each of 41 candidates, totalling 205 fits
----------
iter: 1
n_candidates: 14
n_resources: 1500
Fitting 5 folds for each of 14 candidates, totalling 70 fits
----------
iter: 2
n_candidates: 5
n_resources: 4500
Fitting 5 folds for each of 5 candidates, totalling 25 fits
----------
iter: 3
n_candidates: 2
n_resources: 13500
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best params:  {'trf_reg__transformer__kw_args': {'a_min': 0, 'a_max': 75}, 'trf_reg__regressor__oob_score': True, 'trf_reg__regressor__n_estimators': 50, 'trf_reg__regressor__min_samples_split': 10, 'trf_reg__regressor__min_samples_leaf': 20, 'trf_reg__regressor__min_impurity_decrease': 4.1, 'trf_reg__regressor__max_features': 'sqrt', 'trf_reg__regressor__max_depth': 300, 'trf_reg__regressor_

Best Model:
```
'trf_reg__transformer__kw_args': {'a_min': 0, 'a_max': 75}
PolynomialFeatures(degree=3, include_bias=False,
                                    interaction_only=True)
RandomForestRegressor(ccp_alpha=0.65,
                max_depth=300,
                max_features='sqrt',
                min_impurity_decrease=4.1,
                min_samples_leaf=20,
                min_samples_split=10,
                n_estimators=50,
                n_jobs=-1,
                oob_score=True,
                random_state=42),
```


In [106]:
model = Pipeline([
    ('poly_ft', PolynomialFeatures(degree=3, include_bias=False,
                                    interaction_only=True)),
    ('trf_reg' ,TransformedTargetRegressor(
        check_inverse=False,
        regressor   = RandomForestRegressor(ccp_alpha=0.65,
                max_depth=300,
                max_features='sqrt',
                min_impurity_decrease=4.1,
                min_samples_leaf=20,
                min_samples_split=10,
                n_estimators=50,
                n_jobs=-1,
                oob_score=True,
                random_state=42),
        transformer = FunctionTransformer(np.clip, 
                                          kw_args={'a_min':0,'a_max':75})))
])
model

Pipeline(steps=[('poly_ft',
                 PolynomialFeatures(degree=3, include_bias=False,
                                    interaction_only=True)),
                ('trf_reg',
                 TransformedTargetRegressor(check_inverse=False,
                                            regressor=RandomForestRegressor(ccp_alpha=0.65,
                                                                            max_depth=300,
                                                                            max_features='sqrt',
                                                                            min_impurity_decrease=4.1,
                                                                            min_samples_leaf=20,
                                                                            min_samples_split=10,
                                                                            n_estimators=50,
                                                                            n_job

In [107]:
# ~3min
model.fit(X_train, y_train)
reclipped_y =  model['trf_reg'].transformer.transform(y_train)
eval.show_result(reclipped_y, model.predict(X_train))
eval.show_result_cv(reclipped_y, X_train, model)

R2=0.854,RMSE=-8.587
(CV) R2=0.847,RMSE=-8.747


In [109]:
eval.show_result(y_test, model.predict(X_test))

R2=0.422,RMSE=-31.588


Using the polynominal features actually worsened the perfomance, when comparing with not using them and the non-linear RUL.