<a href="https://colab.research.google.com/github/arthursl12/POC1/blob/main/POC1_Turbofan_FD003_Model_Distance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import glob

In [2]:
from sklearn.dummy import DummyRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor

from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeRegressor

from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import PolynomialFeatures

In [3]:
sns.set_palette('colorblind')

# Data Preparation

In [4]:
# Dataset Download 
os.system('git clone https://github.com/arthursl12/dataset_2')
os.system('mv /content/dataset_2/CMaps /content/CMaps')
os.system('mv /content/dataset_2/data_processing /content/data_processing')
os.system('rm -rf dataset_2')

0

In [5]:
from data_processing.processing import DatasetProcessing
from data_processing.training import HyperparameterSearch, reclipper_scorer
from data_processing.eval import Evaluation

In [6]:
proc = DatasetProcessing()

## Data Integration

The data are provided as a zip-compressed text file with 26 columns of numbers, separated by spaces. Each row is a snapshot of data taken during a single operational cycle, each column is a different variable. The columns correspond to:  

1) unit number   
2) time, in cycles  
3) operational setting 1  
4) operational setting 2  
5) operational setting 3    
6) sensor measurement 1    
7) sensor measurement 2  
...  
26) sensor measurement 20


There are 6 conditions (or combinations) which the 3 operational settings can take.  
Condition 1: Altitude = 0, Mach Number = 0, TRA = 100  
Condition 2: Altitude = 10, Mach Number = 0.25, TRA = 100  
Condition 3: Altitude = 20, Mach Number = 0.7 TRA = 100  
Condition 4: Altitude = 25, Mach Number = 0.62, TRA = 60  
Condition 5: Altitude = 35 Mach Number = 0.84, TRA = 100  
Condition 6: Altitude = 42, Mach Number = 0.84, TRA = 100  
  
There is slight variation in all these conditions so you may get numbers like 24.453 instead of 25 exactly.

FD001: Condition 1 only  
FD002: Mix of all the conditions  
FD003: Condition 1 only  
FD004: Mix of all conditions  


In [7]:
index_cols, settings_cols, sensors_cols, cols = proc.column_names()
train, test, y_test = proc.read_dataset(3)
train

Unnamed: 0,unit_number,time,op_1,op_2,op_3,s_0,s_1,s_2,s_3,s_4,...,s_11,s_12,s_13,s_14,s_15,s_16,s_17,s_18,s_19,s_20
0,1,1,-0.0005,0.0004,100.0,518.67,642.36,1583.23,1396.84,14.62,...,522.31,2388.01,8145.32,8.4246,0.03,391,2388,100.0,39.11,23.3537
1,1,2,0.0008,-0.0003,100.0,518.67,642.50,1584.69,1396.89,14.62,...,522.42,2388.03,8152.85,8.4403,0.03,392,2388,100.0,38.99,23.4491
2,1,3,-0.0014,-0.0002,100.0,518.67,642.18,1582.35,1405.61,14.62,...,522.03,2388.00,8150.17,8.3901,0.03,391,2388,100.0,38.85,23.3669
3,1,4,-0.0020,0.0001,100.0,518.67,642.92,1585.61,1392.27,14.62,...,522.49,2388.08,8146.56,8.3878,0.03,392,2388,100.0,38.96,23.2951
4,1,5,0.0016,0.0000,100.0,518.67,641.68,1588.63,1397.65,14.62,...,522.58,2388.03,8147.80,8.3869,0.03,392,2388,100.0,39.14,23.4583
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24715,100,148,-0.0016,-0.0003,100.0,518.67,643.78,1596.01,1424.11,14.62,...,519.66,2388.30,8138.08,8.5036,0.03,394,2388,100.0,38.44,22.9631
24716,100,149,0.0034,-0.0003,100.0,518.67,643.29,1596.38,1429.14,14.62,...,519.91,2388.28,8144.36,8.5174,0.03,395,2388,100.0,38.50,22.9746
24717,100,150,-0.0016,0.0004,100.0,518.67,643.84,1604.53,1431.41,14.62,...,519.44,2388.24,8135.95,8.5223,0.03,396,2388,100.0,38.39,23.0682
24718,100,151,-0.0023,0.0004,100.0,518.67,643.94,1597.56,1426.57,14.62,...,520.01,2388.26,8141.24,8.5148,0.03,395,2388,100.0,38.31,23.0753


## Preprocessing

### Test Set Transformation 
Test set has samples for all cycles, but has annotations only for last one

In [8]:
test.shape, y_test.shape

((16596, 26), (100, 1))

In [9]:
test_last = proc.transform_test(test)
test_last.head()

Unnamed: 0,s_0,s_1,s_2,s_3,s_4,s_5,s_6,s_7,s_8,s_9,...,s_11,s_12,s_13,s_14,s_15,s_16,s_17,s_18,s_19,s_20
0,518.67,642.59,1592.4,1409.87,14.62,21.58,560.53,2388.22,9085.5,1.31,...,528.05,2388.23,8158.77,8.2966,0.03,393,2388,100.0,39.43,23.5679
1,518.67,642.56,1587.42,1409.69,14.62,21.61,553.33,2388.18,9050.97,1.3,...,520.9,2388.17,8128.04,8.4514,0.03,392,2388,100.0,38.83,23.2821
2,518.67,642.75,1591.93,1417.66,14.62,21.6,563.61,2388.31,9091.69,1.31,...,531.36,2388.33,8173.56,8.3057,0.03,395,2388,100.0,39.27,23.644
3,518.67,642.28,1584.68,1406.56,14.62,21.61,552.75,2388.07,9048.23,1.3,...,521.27,2388.09,8133.78,8.4337,0.03,392,2388,100.0,38.7,23.3804
4,518.67,642.15,1580.59,1397.26,14.62,21.58,553.82,2387.96,9050.89,1.3,...,521.74,2387.96,8132.51,8.39,0.03,390,2388,100.0,38.89,23.4463


In [10]:
X_test = test_last

### Remaining Useful Life (RUL)

In [11]:
train = proc.add_remaining_useful_life_linear(train)
train[index_cols+['RUL']].head()

Unnamed: 0,unit_number,time,RUL
0,1,1,258
1,1,2,257
2,1,3,256
3,1,4,255
4,1,5,254


## Attributes and target separation

In [12]:
X_train, y_train = proc.X_y_train_divide(train)

In [13]:
y_train.head()

Unnamed: 0,RUL
0,258
1,257
2,256
3,255
4,254


In [14]:
X_train.head()

Unnamed: 0,s_0,s_1,s_2,s_3,s_4,s_5,s_6,s_7,s_8,s_9,...,s_11,s_12,s_13,s_14,s_15,s_16,s_17,s_18,s_19,s_20
0,518.67,642.36,1583.23,1396.84,14.62,21.61,553.97,2387.96,9062.17,1.3,...,522.31,2388.01,8145.32,8.4246,0.03,391,2388,100.0,39.11,23.3537
1,518.67,642.5,1584.69,1396.89,14.62,21.61,554.55,2388.0,9061.78,1.3,...,522.42,2388.03,8152.85,8.4403,0.03,392,2388,100.0,38.99,23.4491
2,518.67,642.18,1582.35,1405.61,14.62,21.61,554.43,2388.03,9070.23,1.3,...,522.03,2388.0,8150.17,8.3901,0.03,391,2388,100.0,38.85,23.3669
3,518.67,642.92,1585.61,1392.27,14.62,21.61,555.21,2388.0,9064.57,1.3,...,522.49,2388.08,8146.56,8.3878,0.03,392,2388,100.0,38.96,23.2951
4,518.67,641.68,1588.63,1397.65,14.62,21.61,554.74,2388.04,9076.14,1.3,...,522.58,2388.03,8147.8,8.3869,0.03,392,2388,100.0,39.14,23.4583


## Training and Evaluation functions

In [15]:
eval = Evaluation()

In [16]:
search = HyperparameterSearch()

# DummyRegressor

Just to have a baseline on models. The trained models must have a better performance than a dummy.

## Linear RUL

In [17]:
model = Pipeline([
    ('dummy_reg'  ,   DummyRegressor())
])

In [18]:
model.fit(X_train, y_train)
eval.show_result(y_train, model.predict(X_train))
eval.show_result_cv(y_train, X_train, model)

R2=0.000,RMSE=-98.845
(CV) R2=-0.045,RMSE=-98.991


In [19]:
eval.show_result(y_test, model.predict(X_test))

R2=-2.298,RMSE=-75.180


## Non-linear RUL

In [20]:
model = Pipeline([
    ('trf_reg' ,TransformedTargetRegressor(
        check_inverse=False,
        regressor   = DummyRegressor(),
        transformer = FunctionTransformer(np.clip, 
                                          kw_args={'a_min':0,'a_max':90})))
    ])

In [21]:
model.fit(X_train, y_train)
reclipped_y =  model['trf_reg'].transformer.transform(y_train)
eval.show_result(reclipped_y, model.predict(X_train))
eval.show_result_cv(reclipped_y, X_train, model)

R2=0.000,RMSE=-26.927
(CV) R2=-0.006,RMSE=-26.942


In [22]:
reclipped_y =  model['trf_reg'].transformer.transform(y_test)
eval.show_result(reclipped_y, model.predict(X_test))

R2=-0.105,RMSE=-30.342


## PolyFeatures + Linear RUL

In [23]:
model = Pipeline([
    ('poly_ft'  ,   PolynomialFeatures()),
    ('tree_reg'  ,   DummyRegressor())
])
model

Pipeline(steps=[('poly_ft', PolynomialFeatures()),
                ('tree_reg', DummyRegressor())])

In [24]:
model.fit(X_train, y_train)
eval.show_result(y_train, model.predict(X_train))
eval.show_result_cv(y_train, X_train, model)

R2=0.000,RMSE=-98.845
(CV) R2=-0.045,RMSE=-98.991


In [25]:
eval.show_result(y_test, model.predict(X_test))

R2=-2.298,RMSE=-75.180


## PolyFeatures + Non-Linear RUL

In [26]:
model = Pipeline([
    ('poly_ft'  ,   PolynomialFeatures()),
    ('trf_reg' ,TransformedTargetRegressor(
        check_inverse=False,
        regressor   = DummyRegressor(),
        transformer = FunctionTransformer(np.clip, 
                                          kw_args={'a_min':0,'a_max':90})))
])

In [27]:
model.fit(X_train, y_train)
reclipped_y =  model['trf_reg'].transformer.transform(y_train)
eval.show_result(reclipped_y, model.predict(X_train))
eval.show_result_cv(reclipped_y, X_train, model)

R2=0.000,RMSE=-26.927
(CV) R2=-0.006,RMSE=-26.942


In [28]:
reclipped_y =  model['trf_reg'].transformer.transform(y_test)
eval.show_result(reclipped_y, model.predict(X_test))

R2=-0.105,RMSE=-30.342


# KNeighborsRegressor

## Linear RUL

In [None]:
model = Pipeline([
    ('selection', SelectFromModel(DecisionTreeRegressor())),
    ('neigh_reg'  ,   KNeighborsRegressor())
])

In [None]:
GRID_SEARCH = True
if (GRID_SEARCH):
    param_distributions = {
        "neigh_reg__n_neighbors": [3,5,15,25,45,55,65,75,85,95],
        "neigh_reg__weights": ["uniform", "distance"],
        "neigh_reg__algorithm": ["auto", "ball_tree", "kd_tree"],
        "neigh_reg__leaf_size": [20,30,50,100,150],
        "neigh_reg__p": [2,5,10],
        "neigh_reg__metric": ["minkowski", "euclidean", "manhattan", "chebyshev"],
    }
    model = search.run_HR_GS(model, X_train, y_train, param_distributions, 
                            scorer='r2', ignore_warnings=True)
    print(model)

n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 500
max_resources_: 24720
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 49
n_resources: 500
Fitting 5 folds for each of 49 candidates, totalling 245 fits
----------
iter: 1
n_candidates: 17
n_resources: 1500
Fitting 5 folds for each of 17 candidates, totalling 85 fits
----------
iter: 2
n_candidates: 6
n_resources: 4500
Fitting 5 folds for each of 6 candidates, totalling 30 fits
----------
iter: 3
n_candidates: 2
n_resources: 13500
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best params:  {'neigh_reg__weights': 'distance', 'neigh_reg__p': 10, 'neigh_reg__n_neighbors': 55, 'neigh_reg__metric': 'manhattan', 'neigh_reg__leaf_size': 150, 'neigh_reg__algorithm': 'kd_tree'}
Pipeline(steps=[('selection',
                 SelectFromModel(estimator=DecisionTreeRegressor())),
                ('neigh_reg',
                 KNeighborsRegressor(algorithm='kd_tree', leaf_size=

Best Model:
```
KNeighborsRegressor(algorithm='kd_tree', leaf_size=150,
                    metric='manhattan', n_neighbors=55, p=10,
                    weights='distance')
```

In [None]:
model = Pipeline([
    ('selection', SelectFromModel(DecisionTreeRegressor())),
    ('neigh_reg'  ,   KNeighborsRegressor(algorithm='kd_tree', leaf_size=150,
                    metric='manhattan', n_neighbors=55, p=10,
                    weights='distance'))
])
model

Pipeline(steps=[('selection',
                 SelectFromModel(estimator=DecisionTreeRegressor())),
                ('neigh_reg',
                 KNeighborsRegressor(algorithm='kd_tree', leaf_size=150,
                                     metric='manhattan', n_neighbors=55, p=10,
                                     weights='distance'))])

In [None]:
model.fit(X_train, y_train)
eval.show_result(y_train, model.predict(X_train))
eval.show_result_cv(y_train, X_train, model)

R2=1.000,RMSE=-0.117
(CV) R2=0.501,RMSE=-67.826


In [None]:
eval.show_result(y_test, model.predict(X_test))

R2=-0.770,RMSE=-55.080


It overfitted the training set. Same model from FD001.

## Non-linear RUL

In [None]:
model = Pipeline([
    ('selection', SelectFromModel(DecisionTreeRegressor())),
    ('trf_reg' ,TransformedTargetRegressor(
        check_inverse=False,
        regressor   = KNeighborsRegressor(),
        transformer = FunctionTransformer(np.clip, 
                                          kw_args={'a_min':0,'a_max':50})))
    ])

In [None]:
GRID_SEARCH = True
if (GRID_SEARCH):
    param_distributions = {
        "trf_reg__transformer__kw_args": search.generate_clip_dicts(80,150,1),
        "trf_reg__regressor__n_neighbors": [3,5,15,25,45,55,65,75,85,95],
        "trf_reg__regressor__weights": ["uniform", "distance"],
        "trf_reg__regressor__algorithm": ["auto", "ball_tree", "kd_tree"],
        "trf_reg__regressor__leaf_size": [20,30,50,100,150],
        "trf_reg__regressor__p": [2,5,10],
        "trf_reg__regressor__metric": ["minkowski", "euclidean", "manhattan", "chebyshev"],
    
    }
    model = search.run_HR_GS(model, X_train, y_train, param_distributions)
    print(model)

n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 500
max_resources_: 24720
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 49
n_resources: 500
Fitting 5 folds for each of 49 candidates, totalling 245 fits
----------
iter: 1
n_candidates: 17
n_resources: 1500
Fitting 5 folds for each of 17 candidates, totalling 85 fits
----------
iter: 2
n_candidates: 6
n_resources: 4500
Fitting 5 folds for each of 6 candidates, totalling 30 fits
----------
iter: 3
n_candidates: 2
n_resources: 13500
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best params:  {'trf_reg__transformer__kw_args': {'a_min': 0, 'a_max': 103}, 'trf_reg__regressor__weights': 'uniform', 'trf_reg__regressor__p': 5, 'trf_reg__regressor__n_neighbors': 15, 'trf_reg__regressor__metric': 'minkowski', 'trf_reg__regressor__leaf_size': 100, 'trf_reg__regressor__algorithm': 'ball_tree'}
Pipeline(steps=[('selection',
                 SelectFromModel(estimator=DecisionTr

Best Model:

```
{'trf_reg__transformer__kw_args': {'a_min': 0, 'a_max': 103}}

KNeighborsRegressor(algorithm='ball_tree',
                    leaf_size=100,
                    n_neighbors=15,
                    p=5),
```



In [29]:
model = Pipeline([
    ('selection', SelectFromModel(DecisionTreeRegressor())),
    ('trf_reg' ,TransformedTargetRegressor(
        check_inverse=False,
        regressor   = KNeighborsRegressor(algorithm='ball_tree',
                                            leaf_size=100,
                                            n_neighbors=15,
                                            p=5),
        transformer = FunctionTransformer(np.clip, 
                                          kw_args={'a_min':0,'a_max':103})))
    ])
model

Pipeline(steps=[('selection',
                 SelectFromModel(estimator=DecisionTreeRegressor())),
                ('trf_reg',
                 TransformedTargetRegressor(check_inverse=False,
                                            regressor=KNeighborsRegressor(algorithm='ball_tree',
                                                                          leaf_size=100,
                                                                          n_neighbors=15,
                                                                          p=5),
                                            transformer=FunctionTransformer(func=<function clip at 0x7fa9622cec20>,
                                                                            kw_args={'a_max': 103,
                                                                                     'a_min': 0})))])

In [30]:
model.fit(X_train, y_train)
reclipped_y =  model['trf_reg'].transformer.transform(y_train)
eval.show_result(reclipped_y, model.predict(X_train))
eval.show_result_cv(reclipped_y, X_train, model)

R2=0.841,RMSE=-12.762
(CV) R2=0.765,RMSE=-15.423


In [31]:
reclipped_y =  model['trf_reg'].transformer.transform(y_test)
eval.show_result(reclipped_y, model.predict(X_test))

R2=0.761,RMSE=-16.215


Didn't overfit at all. Generalized much better.

## PolyFeatures + Linear RUL

In [None]:
model = Pipeline([
    ('poly_ft'  ,   PolynomialFeatures()),
    ('selection', SelectFromModel(DecisionTreeRegressor())),
    ('neigh_reg'  ,   KNeighborsRegressor())
])
model

Pipeline(steps=[('poly_ft', PolynomialFeatures()),
                ('selection',
                 SelectFromModel(estimator=DecisionTreeRegressor())),
                ('neigh_reg', KNeighborsRegressor())])

In [None]:
# ~9min
GRID_SEARCH = False
if (GRID_SEARCH):
    param_distributions = {
        "poly_ft__degree": [1,2,3],
        "poly_ft__interaction_only": [False, True],
        "poly_ft__include_bias": [True, False],
        "neigh_reg__n_neighbors": [3,5,15,25,45,55,65,75,85,95],
        "neigh_reg__weights": ["uniform", "distance"],
        "neigh_reg__algorithm": ["auto", "ball_tree", "kd_tree"],
        "neigh_reg__leaf_size": [20,30,50,100,150],
        "neigh_reg__p": [2,5,10],
        "neigh_reg__metric": ["minkowski", "euclidean", "manhattan", "chebyshev"],
    }
    model = search.run_HR_GS(model, X_train, y_train, 
                            param_distributions, scorer='r2')
    print(model)

n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 500
max_resources_: 24720
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 49
n_resources: 500
Fitting 5 folds for each of 49 candidates, totalling 245 fits
----------
iter: 1
n_candidates: 17
n_resources: 1500
Fitting 5 folds for each of 17 candidates, totalling 85 fits
----------
iter: 2
n_candidates: 6
n_resources: 4500
Fitting 5 folds for each of 6 candidates, totalling 30 fits
----------
iter: 3
n_candidates: 2
n_resources: 13500
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best params:  {'poly_ft__interaction_only': True, 'poly_ft__include_bias': False, 'poly_ft__degree': 2, 'neigh_reg__weights': 'distance', 'neigh_reg__p': 5, 'neigh_reg__n_neighbors': 75, 'neigh_reg__metric': 'manhattan', 'neigh_reg__leaf_size': 100, 'neigh_reg__algorithm': 'auto'}
Pipeline(steps=[('poly_ft',
                 PolynomialFeatures(include_bias=False, interaction_only=True)),
    

Best Model:
```
PolynomialFeatures(include_bias=False, interaction_only=True))
KNeighborsRegressor(leaf_size=100, metric='manhattan',
                    n_neighbors=75, p=5,
                    weights='distance'))
```


In [None]:
model = Pipeline([
    ('poly_ft' , PolynomialFeatures(include_bias=False, interaction_only=True)),
    ('selection', SelectFromModel(DecisionTreeRegressor())),
    ('neigh_reg' ,   KNeighborsRegressor(leaf_size=100, metric='manhattan',
                                     n_neighbors=75, p=5,
                                     weights='distance'))
])
model

Pipeline(steps=[('poly_ft',
                 PolynomialFeatures(include_bias=False, interaction_only=True)),
                ('selection',
                 SelectFromModel(estimator=DecisionTreeRegressor())),
                ('neigh_reg',
                 KNeighborsRegressor(leaf_size=100, metric='manhattan',
                                     n_neighbors=75, p=5,
                                     weights='distance'))])

In [None]:
# ~5min
model.fit(X_train, y_train)
eval.show_result(y_train, model.predict(X_train))
eval.show_result_cv(y_train, X_train, model)

R2=1.000,RMSE=-0.000
(CV) R2=0.559,RMSE=-64.329


In [None]:
eval.show_result(y_test, model.predict(X_test))

R2=-0.672,RMSE=-53.529


It overfitted the data again.. It is the same model of FD001.

## PolyFeatures + Non-Linear RUL

In [None]:
model = Pipeline([
    ('poly_ft'  ,   PolynomialFeatures()),
    ('selection', SelectFromModel(DecisionTreeRegressor())),
    ('trf_reg' ,TransformedTargetRegressor(
        check_inverse=False,
        regressor   = KNeighborsRegressor(),
        transformer = FunctionTransformer(np.clip, 
                                          kw_args={'a_min':0,'a_max':96})))
])

In [None]:
# ~13min
GRID_SEARCH = False
if (GRID_SEARCH):
    param_distributions = {
        "poly_ft__degree": [1,2,3],
        "poly_ft__interaction_only": [False, True],
        "poly_ft__include_bias": [True, False],
        "trf_reg__transformer__kw_args": search.generate_clip_dicts(70,150,1),
        "trf_reg__regressor__n_neighbors": [3,5,15,25,45,55,65,75,85,95],
        "trf_reg__regressor__weights": ["uniform", "distance"],
        "trf_reg__regressor__algorithm": ["auto", "ball_tree", "kd_tree"],
        "trf_reg__regressor__leaf_size": [20,30,50,100,150],
        "trf_reg__regressor__p": [2,5,10],
        "trf_reg__regressor__metric": ["minkowski", "euclidean", "manhattan", "chebyshev"],
    }
    model = search.run_HR_GS(model, X_train, y_train, param_distributions)
    print(model)

n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 500
max_resources_: 24720
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 49
n_resources: 500
Fitting 5 folds for each of 49 candidates, totalling 245 fits
----------
iter: 1
n_candidates: 17
n_resources: 1500
Fitting 5 folds for each of 17 candidates, totalling 85 fits
----------
iter: 2
n_candidates: 6
n_resources: 4500
Fitting 5 folds for each of 6 candidates, totalling 30 fits
----------
iter: 3
n_candidates: 2
n_resources: 13500
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best params:  {'trf_reg__transformer__kw_args': {'a_min': 0, 'a_max': 77}, 'trf_reg__regressor__weights': 'distance', 'trf_reg__regressor__p': 5, 'trf_reg__regressor__n_neighbors': 15, 'trf_reg__regressor__metric': 'euclidean', 'trf_reg__regressor__leaf_size': 150, 'trf_reg__regressor__algorithm': 'ball_tree', 'poly_ft__interaction_only': False, 'poly_ft__include_bias': False, 'poly_ft__degre

Best Model:
```
'trf_reg__transformer__kw_args': {'a_min': 0, 'a_max': 77}
PolynomialFeatures(degree=3, include_bias=False)),
KNeighborsRegressor(algorithm='ball_tree',
                    leaf_size=150,
                    metric='euclidean',
                    n_neighbors=15,
                    p=5,
                    weights='distance'),
```


In [32]:
model = Pipeline([
    ('poly_ft', PolynomialFeatures(degree=3, include_bias=False)),
    ('selection', SelectFromModel(DecisionTreeRegressor())),
    ('trf_reg' ,TransformedTargetRegressor(
        check_inverse=False,
        regressor   = KNeighborsRegressor(algorithm='ball_tree',
                                            leaf_size=150,
                                            metric='euclidean',
                                            n_neighbors=15,
                                            p=5,
                                            weights='distance'),
        transformer = FunctionTransformer(np.clip, 
                                          kw_args={'a_min':0,'a_max':77})))
])
model

Pipeline(steps=[('poly_ft', PolynomialFeatures(degree=3, include_bias=False)),
                ('selection',
                 SelectFromModel(estimator=DecisionTreeRegressor())),
                ('trf_reg',
                 TransformedTargetRegressor(check_inverse=False,
                                            regressor=KNeighborsRegressor(algorithm='ball_tree',
                                                                          leaf_size=150,
                                                                          metric='euclidean',
                                                                          n_neighbors=15,
                                                                          p=5,
                                                                          weights='distance'),
                                            transformer=FunctionTransformer(func=<function clip at 0x7fa9622cec20>,
                                                                    

In [33]:
# ~15min
model.fit(X_train, y_train)
reclipped_y =  model['trf_reg'].transformer.transform(y_train)
eval.show_result(reclipped_y, model.predict(X_train))
eval.show_result_cv(reclipped_y, X_train, model)

R2=1.000,RMSE=-0.000
(CV) R2=0.836,RMSE=-8.792


In [34]:
reclipped_y =  model['trf_reg'].transformer.transform(y_test)
eval.show_result(reclipped_y, model.predict(X_test))

R2=0.780,RMSE=-11.266


Using the polynominal features actually improved the perfomance a little bit, when comparing with not using them and the non-linear RUL