<a href="https://colab.research.google.com/github/arthursl12/POC1/blob/main/POC1_Turbofan_FD004_Model_Tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import glob

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor

from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import PolynomialFeatures

In [3]:
# Use seaborn's colour-blind-friendly palette for any plots drawn in this notebook.
sns.set_palette('colorblind')

# Data Preparation

In [4]:
# Dataset download (idempotent, fails loudly).
# Clones the helper repository and moves the raw data folder (`CMaps`) and the
# `data_processing` package next to the notebook so the imports below resolve.
import shutil
import subprocess
from pathlib import Path

REPO_URL = 'https://github.com/arthursl12/dataset_2'
# On Colab the working directory is /content, i.e. the location the original
# shell commands hard-coded; using cwd keeps that behaviour and also works elsewhere.
# NOTE(review): assumes DatasetProcessing reads `CMaps` relative to cwd -- confirm.
base_dir = Path.cwd()
clone_dir = base_dir / 'dataset_2'
needed = ['CMaps', 'data_processing']

# Skip everything when a previous run already put both folders in place;
# re-running the original `mv` would nest them (e.g. CMaps/CMaps).
if not all((base_dir / name).exists() for name in needed):
    # Remove leftovers of an interrupted run, otherwise `git clone` refuses to start.
    shutil.rmtree(clone_dir, ignore_errors=True)
    # check=True raises CalledProcessError instead of silently returning non-zero.
    subprocess.run(['git', 'clone', REPO_URL, str(clone_dir)], check=True)
    for name in needed:
        if not (base_dir / name).exists():
            shutil.move(str(clone_dir / name), str(base_dir / name))
    shutil.rmtree(clone_dir, ignore_errors=True)

0

In [5]:
from data_processing.processing import DatasetProcessing
from data_processing.training import HyperparameterSearch, reclipper_scorer
from data_processing.eval import Evaluation

In [6]:
# Helper object from the downloaded `data_processing` package; used below for
# loading the data, adding the RUL column and splitting features from target.
proc = DatasetProcessing()

## Data Integration

The data are provided as a zip-compressed text file with 26 columns of numbers, separated by spaces. Each row is a snapshot of data taken during a single operational cycle, and each column is a different variable. The columns correspond to:  

1) unit number   
2) time, in cycles  
3) operational setting 1  
4) operational setting 2  
5) operational setting 3    
6) sensor measurement 1    
7) sensor measurement 2  
...  
26) sensor measurement 21


There are 6 conditions (or combinations) which the 3 operational settings can take.  
Condition 1: Altitude = 0, Mach Number = 0, TRA = 100  
Condition 2: Altitude = 10, Mach Number = 0.25, TRA = 100  
Condition 3: Altitude = 20, Mach Number = 0.7, TRA = 100  
Condition 4: Altitude = 25, Mach Number = 0.62, TRA = 60  
Condition 5: Altitude = 35, Mach Number = 0.84, TRA = 100  
Condition 6: Altitude = 42, Mach Number = 0.84, TRA = 100  
  
There is slight variation in all these conditions so you may get numbers like 24.453 instead of 25 exactly.

FD001: Condition 1 only  
FD002: Mix of all the conditions  
FD003: Condition 1 only  
FD004: Mix of all conditions  


In [7]:
# Column-name groups returned by the helper (index, settings, sensors, all columns).
index_cols, settings_cols, sensors_cols, cols = proc.column_names()
# Load subset number 4 (presumably FD004, per the notebook title): training
# frame, test frame and the test-set labels.
train, test, y_test = proc.read_dataset(4)
train

Unnamed: 0,unit_number,time,op_1,op_2,op_3,s_0,s_1,s_2,s_3,s_4,...,s_11,s_12,s_13,s_14,s_15,s_16,s_17,s_18,s_19,s_20
0,1,1,42.0049,0.8400,100.0,445.00,549.68,1343.43,1112.93,3.91,...,129.78,2387.99,8074.83,9.3335,0.02,330,2212,100.00,10.62,6.3670
1,1,2,20.0020,0.7002,100.0,491.19,606.07,1477.61,1237.50,9.35,...,312.59,2387.73,8046.13,9.1913,0.02,361,2324,100.00,24.37,14.6552
2,1,3,42.0038,0.8409,100.0,445.00,548.95,1343.12,1117.05,3.91,...,129.62,2387.97,8066.62,9.4007,0.02,329,2212,100.00,10.48,6.4213
3,1,4,42.0000,0.8400,100.0,445.00,548.70,1341.24,1118.03,3.91,...,129.80,2388.02,8076.05,9.3369,0.02,328,2212,100.00,10.54,6.4176
4,1,5,25.0063,0.6207,60.0,462.54,536.10,1255.23,1033.59,7.05,...,164.11,2028.08,7865.80,10.8366,0.02,305,1915,84.93,14.03,8.6754
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61244,249,251,9.9998,0.2500,100.0,489.05,605.33,1516.36,1315.28,10.52,...,380.16,2388.73,8185.69,8.4541,0.03,372,2319,100.00,29.11,17.5234
61245,249,252,0.0028,0.0015,100.0,518.67,643.42,1598.92,1426.77,14.62,...,535.02,2388.46,8185.47,8.2221,0.03,396,2388,100.00,39.38,23.7151
61246,249,253,0.0029,0.0000,100.0,518.67,643.68,1607.72,1430.56,14.62,...,535.41,2388.48,8193.94,8.2525,0.03,395,2388,100.00,39.78,23.8270
61247,249,254,35.0046,0.8400,100.0,449.44,555.77,1381.29,1148.18,5.48,...,187.92,2388.83,8125.64,9.0515,0.02,337,2223,100.00,15.26,9.0774


## Preprocessing

### Test Set Transformation 
The test set contains samples for every recorded cycle of each unit, but a RUL annotation is provided only for the last cycle of each unit.

In [8]:
# Compare the number of test rows with the number of available test labels.
test.shape, y_test.shape

((41214, 26), (248, 1))

In [9]:
# Reduce the test frame so it lines up with the labels.
# NOTE(review): presumably keeps only the last cycle of each unit and drops the
# index columns while retaining the operational settings -- confirm in DatasetProcessing.
test_last = proc.transform_test_keep_setting(test)
test_last.head()

Unnamed: 0,op_1,op_2,op_3,s_0,s_1,s_2,s_3,s_4,s_5,s_6,...,s_11,s_12,s_13,s_14,s_15,s_16,s_17,s_18,s_19,s_20
0,25.007,0.6214,60.0,462.54,537.66,1264.31,1046.41,7.05,8.99,176.56,...,166.19,2028.53,7890.31,10.7615,0.02,308,1915,84.93,14.41,8.6329
1,41.9989,0.84,100.0,445.0,549.96,1354.05,1133.55,3.91,5.72,139.03,...,130.17,2387.72,8073.44,9.3925,0.02,331,2212,100.0,10.58,6.4325
2,42.0005,0.8401,100.0,445.0,549.47,1341.06,1118.9,3.91,5.69,139.26,...,130.73,2388.18,8095.58,9.2974,0.02,330,2212,100.0,10.61,6.3488
3,25.0018,0.6207,60.0,462.54,536.06,1253.49,1038.53,7.05,9.0,175.63,...,164.91,2028.3,7878.63,10.8396,0.02,306,1915,84.93,14.41,8.5696
4,25.0039,0.62,60.0,462.54,537.36,1263.6,1052.52,7.05,9.03,175.53,...,164.95,2028.24,7873.75,10.9094,0.02,307,1915,84.93,14.19,8.6248


In [10]:
# Alias (not a copy) of the reduced test frame, named to mirror X_train.
X_test = test_last

### Remaining Useful Life (RUL)

In [11]:
# Attach the linear RUL target column to the training frame.
# NOTE(review): `train` is rebound in place, so re-running this cell feeds the
# already-augmented frame back into the helper -- confirm that is harmless.
train = proc.add_remaining_useful_life_linear(train)
train[index_cols+['RUL']].head()

Unnamed: 0,unit_number,time,RUL
0,1,1,320
1,1,2,319
2,1,3,318
3,1,4,317
4,1,5,316


## Attributes and target separation

In [12]:
# Split the training frame into features (settings + sensors, see preview below)
# and the RUL target.
X_train, y_train = proc.X_y_train_divide_with_settings(train)

In [13]:
# Preview the target.
y_train.head()

Unnamed: 0,RUL
0,320
1,319
2,318
3,317
4,316


In [14]:
# Preview the feature matrix.
X_train.head()

Unnamed: 0,op_1,op_2,op_3,s_0,s_1,s_2,s_3,s_4,s_5,s_6,...,s_11,s_12,s_13,s_14,s_15,s_16,s_17,s_18,s_19,s_20
0,42.0049,0.84,100.0,445.0,549.68,1343.43,1112.93,3.91,5.7,137.36,...,129.78,2387.99,8074.83,9.3335,0.02,330,2212,100.0,10.62,6.367
1,20.002,0.7002,100.0,491.19,606.07,1477.61,1237.5,9.35,13.61,332.1,...,312.59,2387.73,8046.13,9.1913,0.02,361,2324,100.0,24.37,14.6552
2,42.0038,0.8409,100.0,445.0,548.95,1343.12,1117.05,3.91,5.69,138.18,...,129.62,2387.97,8066.62,9.4007,0.02,329,2212,100.0,10.48,6.4213
3,42.0,0.84,100.0,445.0,548.7,1341.24,1118.03,3.91,5.7,137.98,...,129.8,2388.02,8076.05,9.3369,0.02,328,2212,100.0,10.54,6.4176
4,25.0063,0.6207,60.0,462.54,536.1,1255.23,1033.59,7.05,9.0,174.82,...,164.11,2028.08,7865.8,10.8366,0.02,305,1915,84.93,14.03,8.6754


## Training and Evaluation functions

In [15]:
# Metric-reporting helper used by every evaluation cell below.
# NOTE(review): this rebinds the name `eval`, shadowing Python's built-in eval()
# for the rest of the notebook.
eval = Evaluation()

In [16]:
# Hyperparameter-search helper; provides run_HR_GS and generate_clip_dicts used below.
search = HyperparameterSearch()

# DecisionTreeRegressor

## Linear RUL

In [17]:
# Baseline for the linear-RUL target: a single untuned decision tree, wrapped in
# a Pipeline so the search can address its arguments as `tree_reg__*`.
model = Pipeline(steps=[
    ("tree_reg", DecisionTreeRegressor(random_state=42)),
])

In [18]:
# Switch for the hyperparameter search; when False the pipeline defined above
# is left as it is.
GRID_SEARCH = True
if GRID_SEARCH:
    # Candidate values for each tuned DecisionTreeRegressor argument.
    param_distributions = {
        "tree_reg__criterion": [
            "squared_error", "friedman_mse", "absolute_error", "poisson",
        ],
        "tree_reg__splitter": ["best", "random"],
        "tree_reg__max_depth": [None, 100, 200, 300],
        "tree_reg__min_samples_split": [2, 5, 10, 15, 20],
        "tree_reg__min_samples_leaf": [1, 2, 5, 10, 15, 20, 50, 100],
        "tree_reg__max_features": ["sqrt", "log2"],
        # 0.0, 0.1, ..., 14.9
        "tree_reg__min_impurity_decrease": list(np.arange(150) / 10),
        # 81 evenly spaced values from 0 to 2 (step 0.025)
        "tree_reg__ccp_alpha": list(np.linspace(0, 2, 81).round(3)),
    }
    # Linear RUL is scored with plain R2.
    model = search.run_HR_GS(
        model, X_train, y_train, param_distributions,
        scorer="r2", ignore_warnings=True,
    )
    print(model)

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 500
max_resources_: 61249
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 122
n_resources: 500
Fitting 5 folds for each of 122 candidates, totalling 610 fits
----------
iter: 1
n_candidates: 41
n_resources: 1500
Fitting 5 folds for each of 41 candidates, totalling 205 fits
----------
iter: 2
n_candidates: 14
n_resources: 4500
Fitting 5 folds for each of 14 candidates, totalling 70 fits
----------
iter: 3
n_candidates: 5
n_resources: 13500
Fitting 5 folds for each of 5 candidates, totalling 25 fits
----------
iter: 4
n_candidates: 2
n_resources: 40500
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best params:  {'tree_reg__splitter': 'best', 'tree_reg__min_samples_split': 15, 'tree_reg__min_samples_leaf': 20, 'tree_reg__min_impurity_decrease': 6.0, 'tree_reg__max_features': 'log2', 'tree_reg__max_depth': None, 'tree_reg__criterion': 'friedman_mse', 'tree_reg__ccp_alpha

Best Model:
```
DecisionTreeRegressor(ccp_alpha=1.2, criterion='friedman_mse',
                                       max_features='log2',
                                       min_impurity_decrease=6.0,
                                       min_samples_leaf=20,
                                       min_samples_split=15,
                                       random_state=42))
```

In [19]:
# Best configuration reported by the search above, hard-coded so the notebook
# can be re-run without repeating the search.
model = Pipeline(steps=[
    ("tree_reg", DecisionTreeRegressor(
        criterion="friedman_mse",
        max_features="log2",
        min_samples_split=15,
        min_samples_leaf=20,
        min_impurity_decrease=6.0,
        ccp_alpha=1.2,
        random_state=42,
    )),
])
model

Pipeline(steps=[('tree_reg',
                 DecisionTreeRegressor(ccp_alpha=1.2, criterion='friedman_mse',
                                       max_features='log2',
                                       min_impurity_decrease=6.0,
                                       min_samples_leaf=20,
                                       min_samples_split=15,
                                       random_state=42))])

In [20]:
# Fit on the full training set, then report metrics on the training
# predictions and the cross-validated metrics (the "(CV)" line in the output).
# NOTE(review): the printed RMSE is negative; presumably a negated-RMSE score is
# printed without flipping its sign -- confirm in Evaluation.
model.fit(X_train, y_train)
eval.show_result(y_train, model.predict(X_train))
eval.show_result_cv(y_train, X_train, model)

R2=0.577,RMSE=-58.383
(CV) R2=0.520,RMSE=-61.976


In [21]:
# Held-out evaluation against the provided test labels.
eval.show_result(y_test, model.predict(X_test))

R2=0.346,RMSE=-44.095


Poor results. The score is lowest on the test set, as it is the hardest case.

## Non-linear RUL

In [22]:
# Non-linear RUL: the regressor is trained on a target clipped to [a_min, a_max].
# check_inverse=False because clipping has no inverse; the upper bound of 50 is
# only a placeholder that the search below overrides.
model = Pipeline(steps=[
    ("trf_reg", TransformedTargetRegressor(
        regressor=DecisionTreeRegressor(random_state=42),
        transformer=FunctionTransformer(
            func=np.clip,
            kw_args={"a_min": 0, "a_max": 50},
        ),
        check_inverse=False,
    )),
])

In [23]:
# Switch for the hyperparameter search; when False the pipeline defined above
# is left as it is.
GRID_SEARCH = True
if GRID_SEARCH:
    param_distributions = {
        # Candidate clipping bounds for the target.
        # NOTE(review): presumably one {'a_min': 0, 'a_max': k} dict per k in
        # 80..150 -- confirm in HyperparameterSearch.generate_clip_dicts.
        "trf_reg__transformer__kw_args": search.generate_clip_dicts(80, 150, 1),
        "trf_reg__regressor__criterion": [
            "squared_error", "friedman_mse", "absolute_error", "poisson",
        ],
        "trf_reg__regressor__splitter": ["best", "random"],
        "trf_reg__regressor__max_depth": [None, 100, 200, 300],
        "trf_reg__regressor__min_samples_split": [2, 5, 10, 15, 20],
        "trf_reg__regressor__min_samples_leaf": [1, 2, 5, 10, 15, 20, 50, 100],
        "trf_reg__regressor__max_features": ["sqrt", "log2"],
        # 0.0, 0.1, ..., 14.9
        "trf_reg__regressor__min_impurity_decrease": list(np.arange(150) / 10),
        # 81 evenly spaced values from 0 to 2 (step 0.025)
        "trf_reg__regressor__ccp_alpha": list(np.linspace(0, 2, 81).round(3)),
    }
    # No explicit scorer: the helper's default is used here.
    model = search.run_HR_GS(model, X_train, y_train, param_distributions)
    print(model)

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 500
max_resources_: 61249
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 122
n_resources: 500
Fitting 5 folds for each of 122 candidates, totalling 610 fits
----------
iter: 1
n_candidates: 41
n_resources: 1500
Fitting 5 folds for each of 41 candidates, totalling 205 fits
----------
iter: 2
n_candidates: 14
n_resources: 4500
Fitting 5 folds for each of 14 candidates, totalling 70 fits
----------
iter: 3
n_candidates: 5
n_resources: 13500
Fitting 5 folds for each of 5 candidates, totalling 25 fits
----------
iter: 4
n_candidates: 2
n_resources: 40500
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best params:  {'trf_reg__transformer__kw_args': {'a_min': 0, 'a_max': 98}, 'trf_reg__regressor__splitter': 'best', 'trf_reg__regressor__min_samples_split': 2, 'trf_reg__regressor__min_samples_leaf': 10, 'trf_reg__regressor__min_impurity_decrease': 5.5, 'trf_reg__regressor__ma

Best Model:

```
{'trf_reg__transformer__kw_args': {'a_min': 0, 'a_max': 98}}

DecisionTreeRegressor(ccp_alpha=0.375,
                    criterion='friedman_mse',
                    max_depth=200,
                    max_features='sqrt',
                    min_impurity_decrease=5.5,
                    min_samples_leaf=10,
                    random_state=42),
```



In [24]:
# Best configuration reported by the search above (target clipped at 98),
# hard-coded so the notebook can be re-run without repeating the search.
model = Pipeline(steps=[
    ("trf_reg", TransformedTargetRegressor(
        regressor=DecisionTreeRegressor(
            criterion="friedman_mse",
            max_depth=200,
            max_features="sqrt",
            min_samples_leaf=10,
            min_impurity_decrease=5.5,
            ccp_alpha=0.375,
            random_state=42,
        ),
        transformer=FunctionTransformer(
            func=np.clip,
            kw_args={"a_min": 0, "a_max": 98},
        ),
        check_inverse=False,
    )),
])
model

Pipeline(steps=[('trf_reg',
                 TransformedTargetRegressor(check_inverse=False,
                                            regressor=DecisionTreeRegressor(ccp_alpha=0.375,
                                                                            criterion='friedman_mse',
                                                                            max_depth=200,
                                                                            max_features='sqrt',
                                                                            min_impurity_decrease=5.5,
                                                                            min_samples_leaf=10,
                                                                            random_state=42),
                                            transformer=FunctionTransformer(func=<function clip at 0x7f070c523c20>,
                                                                            kw_args={'a_max': 98,
                

In [25]:
# Fit, then score against the *clipped* target: the reference values go through
# the same clipping transformer the model was configured with, so predictions
# and references live on the same scale.
model.fit(X_train, y_train)
# `.transformer` is the configured (stateless) np.clip FunctionTransformer.
reclipped_y =  model['trf_reg'].transformer.transform(y_train)
eval.show_result(reclipped_y, model.predict(X_train))
eval.show_result_cv(reclipped_y, X_train, model)

R2=0.776,RMSE=-14.243
(CV) R2=0.751,RMSE=-15.003


In [26]:
# Test evaluation on the clipped scale: the test labels are clipped with the
# model's own transformer before being compared with the predictions.
reclipped_y =  model['trf_reg'].transformer.transform(y_test)
eval.show_result(reclipped_y, model.predict(X_test))

R2=0.742,RMSE=-17.293


Good results, as with the other datasets. The best model is the very same one found for FD001, FD003 and FD002!

## PolyFeatures + Linear RUL

In [27]:
# Polynomial feature expansion followed by an untuned decision tree
# (linear-RUL target); both steps are tuned by the search below.
model = Pipeline(steps=[
    ("poly_ft", PolynomialFeatures()),
    ("tree_reg", DecisionTreeRegressor(random_state=42)),
])
model

Pipeline(steps=[('poly_ft', PolynomialFeatures()),
                ('tree_reg', DecisionTreeRegressor(random_state=42))])

In [28]:
# Runtime: ~2min
# Switch for the hyperparameter search; when False the pipeline defined above
# is left as it is.
GRID_SEARCH = True
if GRID_SEARCH:
    param_distributions = {
        # Feature-expansion options
        "poly_ft__degree": [1, 2, 3],
        "poly_ft__interaction_only": [False, True],
        "poly_ft__include_bias": [True, False],
        # Decision-tree options
        "tree_reg__criterion": [
            "squared_error", "friedman_mse", "absolute_error", "poisson",
        ],
        "tree_reg__splitter": ["best", "random"],
        "tree_reg__max_depth": [None, 100, 200, 300],
        "tree_reg__min_samples_split": [2, 5, 10, 15, 20],
        "tree_reg__min_samples_leaf": [1, 2, 5, 10, 15, 20, 50, 100],
        "tree_reg__max_features": [None, "sqrt", "log2"],
        # 0.0, 0.1, ..., 14.9
        "tree_reg__min_impurity_decrease": list(np.arange(150) / 10),
        # 81 evenly spaced values from 0 to 2 (step 0.025)
        "tree_reg__ccp_alpha": list(np.linspace(0, 2, 81).round(3)),
    }
    # Linear RUL is scored with plain R2.
    model = search.run_HR_GS(
        model, X_train, y_train, param_distributions, scorer="r2",
    )
    print(model)

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 500
max_resources_: 61249
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 122
n_resources: 500
Fitting 5 folds for each of 122 candidates, totalling 610 fits
----------
iter: 1
n_candidates: 41
n_resources: 1500
Fitting 5 folds for each of 41 candidates, totalling 205 fits
----------
iter: 2
n_candidates: 14
n_resources: 4500
Fitting 5 folds for each of 14 candidates, totalling 70 fits
----------
iter: 3
n_candidates: 5
n_resources: 13500
Fitting 5 folds for each of 5 candidates, totalling 25 fits
----------
iter: 4
n_candidates: 2
n_resources: 40500
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best params:  {'tree_reg__splitter': 'random', 'tree_reg__min_samples_split': 15, 'tree_reg__min_samples_leaf': 15, 'tree_reg__min_impurity_decrease': 2.7, 'tree_reg__max_features': None, 'tree_reg__max_depth': None, 'tree_reg__criterion': 'squared_error', 'tree_reg__ccp_alph

Best Model:
```
PolynomialFeatures(include_bias=False, interaction_only=True)),
DecisionTreeRegressor(ccp_alpha=0.175,
                        min_impurity_decrease=2.7,
                        min_samples_leaf=15,
                        min_samples_split=15, random_state=42,
                        splitter='random'))
```


In [29]:
# Best configuration reported by the search above, hard-coded so the notebook
# can be re-run without repeating the search.
model = Pipeline(steps=[
    ("poly_ft", PolynomialFeatures(interaction_only=True, include_bias=False)),
    ("tree_reg", DecisionTreeRegressor(
        splitter="random",
        min_samples_split=15,
        min_samples_leaf=15,
        min_impurity_decrease=2.7,
        ccp_alpha=0.175,
        random_state=42,
    )),
])
model

Pipeline(steps=[('poly_ft',
                 PolynomialFeatures(include_bias=False, interaction_only=True)),
                ('tree_reg',
                 DecisionTreeRegressor(ccp_alpha=0.175,
                                       min_impurity_decrease=2.7,
                                       min_samples_leaf=15,
                                       min_samples_split=15, random_state=42,
                                       splitter='random'))])

In [30]:
# Fit on the full training set, then report metrics on the training
# predictions and the cross-validated metrics (the "(CV)" line in the output).
model.fit(X_train, y_train)
eval.show_result(y_train, model.predict(X_train))
eval.show_result_cv(y_train, X_train, model)

R2=0.553,RMSE=-60.017
(CV) R2=0.523,RMSE=-61.789


In [31]:
# Held-out evaluation against the provided test labels.
eval.show_result(y_test, model.predict(X_test))

R2=0.170,RMSE=-49.677


No clear benefit from polynomial features with the linear RUL target, as with the other models. In fact, they worsened the results, especially on the test set.

## PolyFeatures + Non-Linear RUL

In [32]:
# Polynomial features feeding a decision tree trained on a clipped RUL target.
# check_inverse=False because clipping has no inverse; the upper bound of 96 is
# the starting value that the (optional) search below can override.
model = Pipeline(steps=[
    ("poly_ft", PolynomialFeatures()),
    ("trf_reg", TransformedTargetRegressor(
        regressor=DecisionTreeRegressor(random_state=42),
        transformer=FunctionTransformer(
            func=np.clip,
            kw_args={"a_min": 0, "a_max": 96},
        ),
        check_inverse=False,
    )),
])

In [33]:
# Runtime: ~3min
# Search currently disabled: with the flag set to False this cell leaves the
# pipeline defined above untouched.
GRID_SEARCH = False
if GRID_SEARCH:
    param_distributions = {
        # Feature-expansion options
        "poly_ft__degree": [1, 2, 3],
        "poly_ft__interaction_only": [False, True],
        "poly_ft__include_bias": [True, False],
        # Candidate clipping bounds for the target.
        # NOTE(review): presumably one {'a_min': 0, 'a_max': k} dict per k in
        # 70..150 -- confirm in HyperparameterSearch.generate_clip_dicts.
        "trf_reg__transformer__kw_args": search.generate_clip_dicts(70, 150, 1),
        # Decision-tree options
        "trf_reg__regressor__criterion": [
            "squared_error", "friedman_mse", "absolute_error", "poisson",
        ],
        "trf_reg__regressor__splitter": ["best", "random"],
        "trf_reg__regressor__max_depth": [None, 100, 200, 300],
        "trf_reg__regressor__min_samples_split": [2, 5, 10, 15, 20],
        "trf_reg__regressor__min_samples_leaf": [1, 2, 5, 10, 15, 20, 50, 100],
        "trf_reg__regressor__max_features": [None, "sqrt", "log2"],
        # 0.0, 0.1, ..., 14.9
        "trf_reg__regressor__min_impurity_decrease": list(np.arange(150) / 10),
        # 81 evenly spaced values from 0 to 2 (step 0.025)
        "trf_reg__regressor__ccp_alpha": list(np.linspace(0, 2, 81).round(3)),
    }
    # No explicit scorer: the helper's default is used here.
    model = search.run_HR_GS(model, X_train, y_train, param_distributions)
    print(model)

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 500
max_resources_: 61249
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 122
n_resources: 500
Fitting 5 folds for each of 122 candidates, totalling 610 fits
----------
iter: 1
n_candidates: 41
n_resources: 1500
Fitting 5 folds for each of 41 candidates, totalling 205 fits
----------
iter: 2
n_candidates: 14
n_resources: 4500
Fitting 5 folds for each of 14 candidates, totalling 70 fits
----------
iter: 3
n_candidates: 5
n_resources: 13500
Fitting 5 folds for each of 5 candidates, totalling 25 fits
----------
iter: 4
n_candidates: 2
n_resources: 40500
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best params:  {'trf_reg__transformer__kw_args': {'a_min': 0, 'a_max': 95}, 'trf_reg__regressor__splitter': 'random', 'trf_reg__regressor__min_samples_split': 15, 'trf_reg__regressor__min_samples_leaf': 15, 'trf_reg__regressor__min_impurity_decrease': 0.3, 'trf_reg__regressor_

Best Model:
```
'trf_reg__transformer__kw_args': {'a_min': 0, 'a_max': 95}
PolynomialFeatures(include_bias=False, interaction_only=True)),
DecisionTreeRegressor(ccp_alpha=0.6,
                        criterion='friedman_mse',
                        max_depth=100,
                        min_impurity_decrease=0.3,
                        min_samples_leaf=15,
                        min_samples_split=15,
                        random_state=42,
                        splitter='random'),
```


In [34]:
# Best configuration reported by an earlier search run (target clipped at 95),
# hard-coded so the notebook can be re-run without repeating the search.
model = Pipeline(steps=[
    ("poly_ft", PolynomialFeatures(interaction_only=True, include_bias=False)),
    ("trf_reg", TransformedTargetRegressor(
        regressor=DecisionTreeRegressor(
            criterion="friedman_mse",
            splitter="random",
            max_depth=100,
            min_samples_split=15,
            min_samples_leaf=15,
            min_impurity_decrease=0.3,
            ccp_alpha=0.6,
            random_state=42,
        ),
        transformer=FunctionTransformer(
            func=np.clip,
            kw_args={"a_min": 0, "a_max": 95},
        ),
        check_inverse=False,
    )),
])
model

Pipeline(steps=[('poly_ft',
                 PolynomialFeatures(include_bias=False, interaction_only=True)),
                ('trf_reg',
                 TransformedTargetRegressor(check_inverse=False,
                                            regressor=DecisionTreeRegressor(ccp_alpha=0.6,
                                                                            criterion='friedman_mse',
                                                                            max_depth=100,
                                                                            min_impurity_decrease=0.3,
                                                                            min_samples_leaf=15,
                                                                            min_samples_split=15,
                                                                            random_state=42,
                                                                            splitter='random'),
                           

In [35]:
# Fit, then score against the *clipped* target: the reference values go through
# the same clipping transformer the model was configured with.
model.fit(X_train, y_train)
reclipped_y =  model['trf_reg'].transformer.transform(y_train)
eval.show_result(reclipped_y, model.predict(X_train))
eval.show_result_cv(reclipped_y, X_train, model)

R2=0.772,RMSE=-13.813
(CV) R2=0.765,RMSE=-14.000


In [36]:
# Test evaluation on the clipped scale (labels clipped with the model's own transformer).
reclipped_y =  model['trf_reg'].transformer.transform(y_test)
eval.show_result(reclipped_y, model.predict(X_test))

R2=0.768,RMSE=-15.868


Polynomial features improved performance slightly compared to not using them.

# RandomForestRegressor

## Linear RUL

In [37]:
# Baseline random forest for the linear-RUL target; n_jobs=-1 uses all cores.
# The step keeps the name `tree_reg`, so search keys use the `tree_reg__*` prefix.
model = Pipeline(steps=[
    ("tree_reg", RandomForestRegressor(n_jobs=-1, random_state=42)),
])

In [38]:
# The random forest needs a 1-D target, so flatten the RUL values into a plain
# NumPy vector. Safe to re-run: an already flat array stays flat.
y_train = np.array(y_train).reshape(-1)

In [39]:
# Runtime: ~10min
# Search currently disabled: with the flag set to False this cell leaves the
# pipeline defined above untouched.
GRID_SEARCH = False
if GRID_SEARCH:
    # Candidate values for each tuned RandomForestRegressor argument.
    param_distributions = {
        "tree_reg__n_estimators": [50, 100, 200],
        "tree_reg__criterion": ["squared_error", "absolute_error", "poisson"],
        "tree_reg__max_depth": [None, 50, 100, 150],
        "tree_reg__min_samples_split": [2, 5, 10, 15, 20],
        "tree_reg__min_samples_leaf": [1, 2, 5, 10, 15, 20, 50, 100],
        "tree_reg__max_features": ["sqrt", "log2"],
        # 0.0, 0.1, ..., 14.9
        "tree_reg__min_impurity_decrease": list(np.arange(150) / 10),
        # 81 evenly spaced values from 0 to 2 (step 0.025)
        "tree_reg__ccp_alpha": list(np.linspace(0, 2, 81).round(3)),
        "tree_reg__oob_score": [True, False],
    }
    # Linear RUL is scored with plain R2.
    model = search.run_HR_GS(
        model, X_train, y_train, param_distributions,
        scorer="r2", ignore_warnings=True,
    )
    print(model)

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 500
max_resources_: 61249
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 122
n_resources: 500
Fitting 5 folds for each of 122 candidates, totalling 610 fits
----------
iter: 1
n_candidates: 41
n_resources: 1500
Fitting 5 folds for each of 41 candidates, totalling 205 fits
----------
iter: 2
n_candidates: 14
n_resources: 4500
Fitting 5 folds for each of 14 candidates, totalling 70 fits
----------
iter: 3
n_candidates: 5
n_resources: 13500
Fitting 5 folds for each of 5 candidates, totalling 25 fits
----------
iter: 4
n_candidates: 2
n_resources: 40500
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best params:  {'tree_reg__oob_score': True, 'tree_reg__n_estimators': 100, 'tree_reg__min_samples_split': 10, 'tree_reg__min_samples_leaf': 10, 'tree_reg__min_impurity_decrease': 3.8, 'tree_reg__max_features': 'sqrt', 'tree_reg__max_depth': 150, 'tree_reg__criterion': 'square

Best Model:
```
RandomForestRegressor(ccp_alpha=0.775, max_depth=150,
                        max_features='sqrt',
                        min_impurity_decrease=3.8,
                        min_samples_leaf=10,
                        min_samples_split=10, n_jobs=-1,
                        oob_score=True, random_state=42))
```

In [40]:
# Best configuration reported by an earlier search run, hard-coded so the
# notebook can be re-run without repeating the search.
model = Pipeline(steps=[
    ("tree_reg", RandomForestRegressor(
        max_depth=150,
        max_features="sqrt",
        min_samples_split=10,
        min_samples_leaf=10,
        min_impurity_decrease=3.8,
        ccp_alpha=0.775,
        oob_score=True,
        n_jobs=-1,
        random_state=42,
    )),
])
model

Pipeline(steps=[('tree_reg',
                 RandomForestRegressor(ccp_alpha=0.775, max_depth=150,
                                       max_features='sqrt',
                                       min_impurity_decrease=3.8,
                                       min_samples_leaf=10,
                                       min_samples_split=10, n_jobs=-1,
                                       oob_score=True, random_state=42))])

In [41]:
# Fit on the full training set, then report metrics on the training
# predictions and the cross-validated metrics (the "(CV)" line in the output).
model.fit(X_train, y_train)
eval.show_result(y_train, model.predict(X_train))
eval.show_result_cv(y_train, X_train, model)

R2=0.590,RMSE=-57.457
(CV) R2=0.566,RMSE=-58.974


In [42]:
# Held-out evaluation against the provided test labels.
eval.show_result(y_test, model.predict(X_test))

R2=0.424,RMSE=-41.397


A little better than linear regression.

## Non-linear RUL

In [43]:
# Random forest trained on a clipped RUL target. check_inverse=False because
# clipping has no inverse; the upper bound of 50 is only a placeholder that the
# (optional) search below can override.
model = Pipeline(steps=[
    ("trf_reg", TransformedTargetRegressor(
        regressor=RandomForestRegressor(n_jobs=-1, random_state=42),
        transformer=FunctionTransformer(
            func=np.clip,
            kw_args={"a_min": 0, "a_max": 50},
        ),
        check_inverse=False,
    )),
])

In [44]:
# Runtime: ~10min
# Search currently disabled: with the flag set to False this cell leaves the
# pipeline defined above untouched.
GRID_SEARCH = False
if GRID_SEARCH:
    param_distributions = {
        # Candidate clipping bounds for the target.
        # NOTE(review): presumably one {'a_min': 0, 'a_max': k} dict per k in
        # 80..150 -- confirm in HyperparameterSearch.generate_clip_dicts.
        "trf_reg__transformer__kw_args": search.generate_clip_dicts(80, 150, 1),
        # Random-forest options
        "trf_reg__regressor__n_estimators": [50, 100, 200],
        "trf_reg__regressor__criterion": [
            "squared_error", "absolute_error", "poisson",
        ],
        "trf_reg__regressor__max_depth": [None, 50, 100, 150],
        "trf_reg__regressor__min_samples_split": [2, 5, 10, 15, 20],
        "trf_reg__regressor__min_samples_leaf": [1, 2, 5, 10, 15, 20, 50, 100],
        "trf_reg__regressor__max_features": ["sqrt", "log2"],
        # 0.0, 0.1, ..., 14.9
        "trf_reg__regressor__min_impurity_decrease": list(np.arange(150) / 10),
        # 81 evenly spaced values from 0 to 2 (step 0.025)
        "trf_reg__regressor__ccp_alpha": list(np.linspace(0, 2, 81).round(3)),
        "trf_reg__regressor__oob_score": [True, False],
    }
    # No explicit scorer: the helper's default is used here.
    model = search.run_HR_GS(model, X_train, y_train, param_distributions)
    print(model)

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 500
max_resources_: 61249
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 122
n_resources: 500
Fitting 5 folds for each of 122 candidates, totalling 610 fits
----------
iter: 1
n_candidates: 41
n_resources: 1500
Fitting 5 folds for each of 41 candidates, totalling 205 fits
----------
iter: 2
n_candidates: 14
n_resources: 4500
Fitting 5 folds for each of 14 candidates, totalling 70 fits
----------
iter: 3
n_candidates: 5
n_resources: 13500
Fitting 5 folds for each of 5 candidates, totalling 25 fits
----------
iter: 4
n_candidates: 2
n_resources: 40500
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best params:  {'trf_reg__transformer__kw_args': {'a_min': 0, 'a_max': 90}, 'trf_reg__regressor__oob_score': True, 'trf_reg__regressor__n_estimators': 200, 'trf_reg__regressor__min_samples_split': 10, 'trf_reg__regressor__min_samples_leaf': 15, 'trf_reg__regressor__min_impurit

Best Model:

```
{'trf_reg__transformer__kw_args': {'a_min': 0, 'a_max': 90}}

RandomForestRegressor(ccp_alpha=0.975,
                        max_depth=150,
                        max_features='log2',
                        min_impurity_decrease=1.2,
                        min_samples_leaf=15,
                        min_samples_split=10,
                        n_estimators=200,
                        n_jobs=-1,
                        oob_score=True,
                        random_state=42),
```



In [45]:
# Best configuration reported by an earlier search run (target clipped at 90),
# hard-coded so the notebook can be re-run without repeating the search.
model = Pipeline(steps=[
    ("trf_reg", TransformedTargetRegressor(
        regressor=RandomForestRegressor(
            n_estimators=200,
            max_depth=150,
            max_features="log2",
            min_samples_split=10,
            min_samples_leaf=15,
            min_impurity_decrease=1.2,
            ccp_alpha=0.975,
            oob_score=True,
            n_jobs=-1,
            random_state=42,
        ),
        transformer=FunctionTransformer(
            func=np.clip,
            kw_args={"a_min": 0, "a_max": 90},
        ),
        check_inverse=False,
    )),
])
model

Pipeline(steps=[('trf_reg',
                 TransformedTargetRegressor(check_inverse=False,
                                            regressor=RandomForestRegressor(ccp_alpha=0.975,
                                                                            max_depth=150,
                                                                            max_features='log2',
                                                                            min_impurity_decrease=1.2,
                                                                            min_samples_leaf=15,
                                                                            min_samples_split=10,
                                                                            n_estimators=200,
                                                                            n_jobs=-1,
                                                                            oob_score=True,
                                                       

In [46]:
# Fit, then score against the *clipped* target: the reference values go through
# the same clipping transformer the model was configured with.
model.fit(X_train, y_train)
reclipped_y =  model['trf_reg'].transformer.transform(y_train)
eval.show_result(reclipped_y, model.predict(X_train))
eval.show_result_cv(reclipped_y, X_train, model)

R2=0.807,RMSE=-11.849
(CV) R2=0.797,RMSE=-12.133


In [47]:
# Test evaluation on the clipped scale (labels clipped with the model's own transformer).
reclipped_y =  model['trf_reg'].transformer.transform(y_test)
eval.show_result(reclipped_y, model.predict(X_test))

R2=0.803,RMSE=-13.767


Following the results from the decision trees, we have a good result and good generalization. Same model as FD002.

## PolyFeatures + Linear RUL

In [48]:
# Untuned starting point for the hyperparameter search in the next cell:
# polynomial feature expansion feeding a random forest, both at their defaults
# (apart from the fixed seed and parallelism).
model = Pipeline(steps=[
    ('poly_ft', PolynomialFeatures()),
    ('tree_reg', RandomForestRegressor(n_jobs=-1, random_state=42)),
])
model

Pipeline(steps=[('poly_ft', PolynomialFeatures()),
                ('tree_reg',
                 RandomForestRegressor(n_jobs=-1, random_state=42))])

In [49]:
# ~1h
# Set GRID_SEARCH to True to re-run the search; with False this cell only sets
# the flag. (The log under this cell presumably comes from an earlier run with
# the flag enabled.)
GRID_SEARCH = False
if GRID_SEARCH:
    param_distributions = {
        # --- polynomial feature expansion ---
        "poly_ft__degree": [1, 2, 3],
        "poly_ft__interaction_only": [False, True],
        "poly_ft__include_bias": [True, False],
        # --- forest size and split criterion ---
        "tree_reg__n_estimators": [50, 100, 200],
        "tree_reg__criterion": ['squared_error', 'absolute_error', 'poisson'],
        # --- tree growth limits ---
        "tree_reg__max_depth": [None, 50, 100, 150],
        "tree_reg__min_samples_split": [2, 5, 10, 15, 20],
        "tree_reg__min_samples_leaf": [1, 2, 5, 10, 15, 20, 50, 100],
        "tree_reg__max_features": ["sqrt", "log2"],
        # --- regularisation: 0.0..14.9 in steps of 0.1, and 0..2 in steps of 0.025 ---
        "tree_reg__min_impurity_decrease": list(np.arange(0, 150) / 10),
        "tree_reg__ccp_alpha": list(np.round(np.linspace(0, 2, 81), decimals=3)),
        "tree_reg__oob_score": [True, False],
    }
    # Linear RUL is scored with plain R2; `model` is rebound to whatever the
    # search helper returns.
    model = search.run_HR_GS(
        model, X_train, y_train, param_distributions, scorer='r2'
    )
    print(model)

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 500
max_resources_: 61249
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 122
n_resources: 500
Fitting 5 folds for each of 122 candidates, totalling 610 fits
----------
iter: 1
n_candidates: 41
n_resources: 1500
Fitting 5 folds for each of 41 candidates, totalling 205 fits
----------
iter: 2
n_candidates: 14
n_resources: 4500
Fitting 5 folds for each of 14 candidates, totalling 70 fits
----------
iter: 3
n_candidates: 5
n_resources: 13500
Fitting 5 folds for each of 5 candidates, totalling 25 fits
----------
iter: 4
n_candidates: 2
n_resources: 40500
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best params:  {'tree_reg__oob_score': True, 'tree_reg__n_estimators': 200, 'tree_reg__min_samples_split': 15, 'tree_reg__min_samples_leaf': 10, 'tree_reg__min_impurity_decrease': 1.3, 'tree_reg__max_features': 'sqrt', 'tree_reg__max_depth': None, 'tree_reg__criterion': 'squar

Best Model:
```
PolynomialFeatures(include_bias=False, interaction_only=True),
RandomForestRegressor(ccp_alpha=0.425, max_features='sqrt',
                      min_impurity_decrease=1.3,
                      min_samples_leaf=10,
                      min_samples_split=15, n_estimators=200,
                      n_jobs=-1, oob_score=True,
                      random_state=42)
```


In [50]:
# Rebuild the pipeline with the hyperparameters listed under "Best Model" above,
# so the notebook does not need to repeat the search to reproduce the result.
model = Pipeline(steps=[
    ('poly_ft', PolynomialFeatures(interaction_only=True, include_bias=False)),
    ('tree_reg', RandomForestRegressor(
        n_estimators=200,
        max_features='sqrt',
        min_samples_split=15,
        min_samples_leaf=10,
        min_impurity_decrease=1.3,
        ccp_alpha=0.425,
        oob_score=True,
        n_jobs=-1,
        random_state=42,
    )),
])
model

Pipeline(steps=[('poly_ft',
                 PolynomialFeatures(include_bias=False, interaction_only=True)),
                ('tree_reg',
                 RandomForestRegressor(ccp_alpha=0.425, max_features='sqrt',
                                       min_impurity_decrease=1.3,
                                       min_samples_leaf=10,
                                       min_samples_split=15, n_estimators=200,
                                       n_jobs=-1, oob_score=True,
                                       random_state=42))])

In [51]:
# ~20min
# Linear RUL: this pipeline has no target transformer, so y_train is used
# as-is both for fitting and for scoring (no re-clipping step).
model.fit(X_train, y_train)
# Scores on the training data, followed by the "(CV)" scores.
eval.show_result(y_train, model.predict(X_train))
eval.show_result_cv(y_train, X_train, model)

R2=0.667,RMSE=-51.794
(CV) R2=0.589,RMSE=-57.333


In [52]:
# Score the fitted linear-RUL pipeline on the test data (X_test / y_test).
eval.show_result(y_test, model.predict(X_test))

R2=0.446,RMSE=-40.577


As with the other models, there is no clear benefit from polynomial features with linear RUL.

## PolyFeatures + Non-Linear RUL

In [53]:
# Starting pipeline for the non-linear RUL search: polynomial features, then a
# default random forest trained on a target clipped with np.clip.
# check_inverse=False skips the transform/inverse round-trip check (presumably
# because clipping cannot be inverted). The a_max=96 here is only an initial
# value; the search in the next cell overrides `kw_args`.
model = Pipeline(steps=[
    ('poly_ft', PolynomialFeatures()),
    ('trf_reg', TransformedTargetRegressor(
        regressor=RandomForestRegressor(n_jobs=-1, random_state=42),
        transformer=FunctionTransformer(
            func=np.clip,
            kw_args={'a_min': 0, 'a_max': 96},
        ),
        check_inverse=False,
    )),
])

In [54]:
# ~35min
# Set GRID_SEARCH to True to re-run the search; with False this cell only sets
# the flag. (The log under this cell presumably comes from an earlier run with
# the flag enabled.)
GRID_SEARCH = False
if GRID_SEARCH:
    param_distributions = {
        # --- polynomial feature expansion ---
        "poly_ft__degree": [1, 2, 3],
        "poly_ft__interaction_only": [False, True],
        "poly_ft__include_bias": [True, False],
        # --- target clipping: candidate np.clip kwargs built by the search helper ---
        "trf_reg__transformer__kw_args": search.generate_clip_dicts(70, 150, 1),
        # --- forest size and split criterion ---
        "trf_reg__regressor__n_estimators": [50, 100, 200],
        "trf_reg__regressor__criterion": ['squared_error', 'absolute_error', 'poisson'],
        # --- tree growth limits ---
        "trf_reg__regressor__max_depth": [None, 50, 100, 150],
        "trf_reg__regressor__min_samples_split": [2, 5, 10, 15, 20],
        "trf_reg__regressor__min_samples_leaf": [1, 2, 5, 10, 15, 20, 50, 100],
        "trf_reg__regressor__max_features": ["sqrt", "log2"],
        # --- regularisation: 0.0..14.9 in steps of 0.1, and 0..2 in steps of 0.025 ---
        "trf_reg__regressor__min_impurity_decrease": list(np.arange(0, 150) / 10),
        "trf_reg__regressor__ccp_alpha": list(np.round(np.linspace(0, 2, 81), decimals=3)),
        "trf_reg__regressor__oob_score": [True, False],
    }
    # No scorer is passed here, so run_HR_GS falls back to its own default
    # (defined in the helper module, not visible in this notebook).
    model = search.run_HR_GS(model, X_train, y_train, param_distributions)
    print(model)

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 500
max_resources_: 61249
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 122
n_resources: 500
Fitting 5 folds for each of 122 candidates, totalling 610 fits
----------
iter: 1
n_candidates: 41
n_resources: 1500
Fitting 5 folds for each of 41 candidates, totalling 205 fits
----------
iter: 2
n_candidates: 14
n_resources: 4500
Fitting 5 folds for each of 14 candidates, totalling 70 fits
----------
iter: 3
n_candidates: 5
n_resources: 13500
Fitting 5 folds for each of 5 candidates, totalling 25 fits
----------
iter: 4
n_candidates: 2
n_resources: 40500
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best params:  {'trf_reg__transformer__kw_args': {'a_min': 0, 'a_max': 77}, 'trf_reg__regressor__oob_score': False, 'trf_reg__regressor__n_estimators': 50, 'trf_reg__regressor__min_samples_split': 10, 'trf_reg__regressor__min_samples_leaf': 15, 'trf_reg__regressor__min_impurit

Best Model:
```
'trf_reg__transformer__kw_args': {'a_min': 0, 'a_max': 77},
PolynomialFeatures(include_bias=False, interaction_only=True),
RandomForestRegressor(ccp_alpha=1.475,
                      max_depth=100,
                      max_features='sqrt',
                      min_impurity_decrease=0.8,
                      min_samples_leaf=15,
                      min_samples_split=10,
                      n_estimators=50,
                      n_jobs=-1,
                      random_state=42)
```


In [55]:
# Rebuild the pipeline with the hyperparameters listed under "Best Model" above:
# interaction-only polynomial features, then a forest trained on RUL clipped
# to [0, 77].
model = Pipeline(steps=[
    ('poly_ft', PolynomialFeatures(interaction_only=True, include_bias=False)),
    ('trf_reg', TransformedTargetRegressor(
        regressor=RandomForestRegressor(
            n_estimators=50,
            max_depth=100,
            max_features='sqrt',
            min_samples_split=10,
            min_samples_leaf=15,
            min_impurity_decrease=0.8,
            ccp_alpha=1.475,
            n_jobs=-1,
            random_state=42,
        ),
        transformer=FunctionTransformer(
            func=np.clip,
            kw_args={'a_min': 0, 'a_max': 77},
        ),
        check_inverse=False,
    )),
])
model

Pipeline(steps=[('poly_ft',
                 PolynomialFeatures(include_bias=False, interaction_only=True)),
                ('trf_reg',
                 TransformedTargetRegressor(check_inverse=False,
                                            regressor=RandomForestRegressor(ccp_alpha=1.475,
                                                                            max_depth=100,
                                                                            max_features='sqrt',
                                                                            min_impurity_decrease=0.8,
                                                                            min_samples_leaf=15,
                                                                            min_samples_split=10,
                                                                            n_estimators=50,
                                                                            n_jobs=-1,
                                       

In [56]:
# ~3min
# Train the tuned pipeline, then score it against a target clipped by the same
# transformer that wraps the regressor, so truth and predictions are compared
# on the same (clipped) RUL scale.
model.fit(X_train, y_train)
reclipped_y = model.named_steps['trf_reg'].transformer.transform(y_train)
eval.show_result(reclipped_y, model.predict(X_train))  # scores on the training data
eval.show_result_cv(reclipped_y, X_train, model)       # "(CV)" scores

R2=0.822,RMSE=-9.266
(CV) R2=0.811,RMSE=-9.508


In [57]:
# Test-set check: clip the test target with the pipeline's own transformer
# before comparing it with the predictions.
reclipped_y = model.named_steps['trf_reg'].transformer.transform(y_test)
eval.show_result(reclipped_y, model.predict(X_test))

R2=0.820,RMSE=-10.943


Same model as FD002.

Much better results. Even though the data is more complex, we still achieved good performance, comparable to that of the easier groups.