In [1]:
import numpy as np
import optuna as opt
import pandas as pd
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
df_train = pd.read_csv("regression_data/regression_train.csv")
df_test = pd.read_csv("regression_data/regression_test.csv")

In [3]:
df_train

Unnamed: 0,target,X1,X2,X3,X4,X5,X6,X7
0,,-0.546168,,47.063305,1.611370,-65.882137,0,-16.672865
1,47.735389,3.074317,-2.836000,49.396649,0.261998,-66.570716,0,-6.664599
2,,-1.485531,-13.102537,42.367991,3.991558,-67.108014,8,-30.790962
3,908.202209,6.907396,-0.308967,57.228787,0.256771,-66.181266,5,-0.726072
4,1640.461863,1.916788,3.460290,51.703375,2.463790,-65.143663,5,8.131680
...,...,...,...,...,...,...,...,...
1037,4296.426459,5.375810,6.937947,55.810467,1.698025,-65.819509,8,16.304176
1038,54.530919,0.619207,-1.925819,48.168606,1.357223,-65.385243,8,-4.525675
1039,1636.535078,4.548172,2.886253,47.775525,1.278439,-56.312543,8,6.782694
1040,,1.806045,-3.015213,48.352707,4.627906,-57.722688,5,-7.085751


In [4]:
df_test

Unnamed: 0,target,X1,X2,X3,X4,X5,X6,X7
0,3.436244,0.210854,-2.451307,51.239996,4.641751,-67.874319,5,-5.760571
1,1525.839412,2.157483,4.337776,43.828794,8.070219,-68.405526,8,10.193773
2,455.600191,5.166359,-0.452615,43.931305,1.349625,-66.240021,0,-1.063645
3,0.619759,3.276641,-3.607201,50.631046,2.473542,-69.971690,8,-8.476921
4,10.996472,-1.867485,-2.408863,58.660224,2.826219,-53.647149,5,-5.660827
...,...,...,...,...,...,...,...,...
256,555.785223,5.099614,-0.648889,54.770941,1.975509,-61.664039,8,-1.524890
257,624.578115,0.220241,1.410877,42.895015,0.639779,-66.077209,0,3.315561
258,,1.066213,-5.569022,51.318035,1.994699,-64.081511,0,-13.087202
259,,-2.260013,-10.854081,38.671378,1.065288,-66.732946,0,-25.507091


In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1042 entries, 0 to 1041
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   target  715 non-null    float64
 1   X1      1042 non-null   float64
 2   X2      1031 non-null   float64
 3   X3      1042 non-null   float64
 4   X4      1042 non-null   float64
 5   X5      1042 non-null   float64
 6   X6      1042 non-null   int64  
 7   X7      1042 non-null   float64
dtypes: float64(7), int64(1)
memory usage: 65.2 KB


In [6]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 261 entries, 0 to 260
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   target  180 non-null    float64
 1   X1      261 non-null    float64
 2   X2      261 non-null    float64
 3   X3      261 non-null    float64
 4   X4      261 non-null    float64
 5   X5      261 non-null    float64
 6   X6      261 non-null    int64  
 7   X7      261 non-null    float64
dtypes: float64(7), int64(1)
memory usage: 16.4 KB


In [7]:
df_train = df_train.dropna().reset_index(drop=True)
df_test = df_test.dropna().reset_index(drop=True)

In [8]:
cat_col = ["X6"]
num_col = [x for x in df_train.columns if x not in cat_col + ["target"]]
num_col

['X1', 'X2', 'X3', 'X4', 'X5', 'X7']

In [9]:
X_train = X_train = df_train[cat_col + num_col]
y_train = df_train["target"]
X_test = df_test[cat_col + num_col]
y_test = df_test["target"]

In [10]:
def create_pipeline(**params):
    scaler = StandardScaler()
    num_transformer = make_pipeline(scaler)
    
    preprocessor = ColumnTransformer(
        transformers=[("num", num_transformer, num_col)],
        remainder="passthrough"
    )
    
    regressor = RandomForestRegressor(random_state=42, **params)
    pipe = make_pipeline(preprocessor, regressor)
    return pipe

In [11]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'max_depth': trial.suggest_int('max_depth', 4, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 150),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 2, 60),
    }
    pipe = create_pipeline(**params)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    
    return mean_squared_error(y_test, y_pred, squared=False)



In [12]:
study = opt.create_study(direction='minimize')
study.optimize(objective, n_trials=100, n_jobs=-1)

trial = study.best_trial

[32m[I 2023-07-04 19:37:00,163][0m A new study created in memory with name: no-name-02b710ac-f23f-48aa-8153-fb92db3f3ccc[0m
[32m[I 2023-07-04 19:37:02,800][0m Trial 3 finished with value: 514.4835257955148 and parameters: {'n_estimators': 64, 'max_depth': 24, 'min_samples_split': 149, 'min_samples_leaf': 31}. Best is trial 3 with value: 514.4835257955148.[0m
[32m[I 2023-07-04 19:37:03,159][0m Trial 10 finished with value: 507.9115077087597 and parameters: {'n_estimators': 66, 'max_depth': 41, 'min_samples_split': 128, 'min_samples_leaf': 5}. Best is trial 10 with value: 507.9115077087597.[0m
[32m[I 2023-07-04 19:37:03,778][0m Trial 6 finished with value: 510.97331021694504 and parameters: {'n_estimators': 107, 'max_depth': 29, 'min_samples_split': 125, 'min_samples_leaf': 51}. Best is trial 10 with value: 507.9115077087597.[0m
[32m[I 2023-07-04 19:37:04,734][0m Trial 2 finished with value: 345.68461198709673 and parameters: {'n_estimators': 134, 'max_depth': 26, 'min_samp

[32m[I 2023-07-04 19:37:33,116][0m Trial 32 finished with value: 323.97477147298054 and parameters: {'n_estimators': 919, 'max_depth': 36, 'min_samples_split': 59, 'min_samples_leaf': 19}. Best is trial 24 with value: 177.82094807565807.[0m
[32m[I 2023-07-04 19:37:34,054][0m Trial 35 finished with value: 273.6049635523942 and parameters: {'n_estimators': 725, 'max_depth': 50, 'min_samples_split': 29, 'min_samples_leaf': 16}. Best is trial 24 with value: 177.82094807565807.[0m
[32m[I 2023-07-04 19:37:34,899][0m Trial 38 finished with value: 305.5317962098505 and parameters: {'n_estimators': 720, 'max_depth': 50, 'min_samples_split': 61, 'min_samples_leaf': 13}. Best is trial 24 with value: 177.82094807565807.[0m
[32m[I 2023-07-04 19:37:35,275][0m Trial 36 finished with value: 273.6049635523942 and parameters: {'n_estimators': 725, 'max_depth': 47, 'min_samples_split': 22, 'min_samples_leaf': 16}. Best is trial 24 with value: 177.82094807565807.[0m
[32m[I 2023-07-04 19:37:35

[32m[I 2023-07-04 19:38:00,104][0m Trial 73 finished with value: 172.2521594413721 and parameters: {'n_estimators': 389, 'max_depth': 42, 'min_samples_split': 15, 'min_samples_leaf': 7}. Best is trial 73 with value: 172.2521594413721.[0m
[32m[I 2023-07-04 19:38:00,640][0m Trial 65 finished with value: 176.63121192048635 and parameters: {'n_estimators': 624, 'max_depth': 42, 'min_samples_split': 15, 'min_samples_leaf': 8}. Best is trial 73 with value: 172.2521594413721.[0m
[32m[I 2023-07-04 19:38:00,695][0m Trial 71 finished with value: 320.4296862663859 and parameters: {'n_estimators': 645, 'max_depth': 41, 'min_samples_split': 14, 'min_samples_leaf': 25}. Best is trial 73 with value: 172.2521594413721.[0m
[32m[I 2023-07-04 19:38:01,518][0m Trial 75 finished with value: 176.71069871241613 and parameters: {'n_estimators': 404, 'max_depth': 42, 'min_samples_split': 13, 'min_samples_leaf': 8}. Best is trial 73 with value: 172.2521594413721.[0m
[32m[I 2023-07-04 19:38:01,782]

In [13]:
pipe = create_pipeline(**study.best_params)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

In [14]:
pipe

In [15]:
print(f"A RMSE é: {mean_squared_error(y_test, y_pred, squared=False)}\nA R2 é: {r2_score(y_test, y_pred)}"
     )

A RMSE é: 137.8592561332619
A R2 é: 0.9880903327896267


In [16]:
pipe.named_steps["randomforestregressor"].feature_importances_

array([2.01329398e-02, 4.87168952e-01, 2.31014833e-03, 4.14472325e-03,
       1.09691017e-03, 4.84689214e-01, 4.57112196e-04])

In [17]:
pipe[0].get_feature_names_out()

array(['num__X1', 'num__X2', 'num__X3', 'num__X4', 'num__X5', 'num__X7',
       'remainder__X6'], dtype=object)

In [18]:
ziped_feat_importance = zip(pipe.named_steps["randomforestregressor"].feature_importances_, pipe[0].get_feature_names_out())
ziped_feat_importance = sorted(ziped_feat_importance, key=lambda x: x[0], reverse=True)
for importance_tuple in ziped_feat_importance:
    importance, column = importance_tuple
    print(f"A Feature {column} tem: {importance:.2f}% de importancia para a target")

A Feature num__X2 tem: 0.49% de importancia para a target
A Feature num__X7 tem: 0.48% de importancia para a target
A Feature num__X1 tem: 0.02% de importancia para a target
A Feature num__X4 tem: 0.00% de importancia para a target
A Feature num__X3 tem: 0.00% de importancia para a target
A Feature num__X5 tem: 0.00% de importancia para a target
A Feature remainder__X6 tem: 0.00% de importancia para a target


In [19]:
df_train.corr()["target"][:]

target    1.000000
X1        0.514485
X2        0.938318
X3        0.057441
X4       -0.011001
X5       -0.023407
X6        0.040508
X7        0.938318
Name: target, dtype: float64