In [24]:
import pandas as pd
import matplotlib.pyplot as plt

from src.graficos import plot_comparar_metricas_modelos, plot_residuos
from src.modelos import organiza_resultados, treinar_e_validar_modelo_regressao, grid_search_cv_regressor

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, PowerTransformer, QuantileTransformer
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge
from sklearn.dummy import DummyRegressor
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor

import seaborn as sns

RANDOM_STATE = 42

In [25]:
# Read the dataset from its parquet file (path is relative to the notebook)
# and show it as the cell output.
caminho_parquet = "../dados/diabete_categorizado.parquet"
df = pd.read_parquet(caminho_parquet)
df


Unnamed: 0,idade,sexo,imc,pressao_media,colesterol_total,ldl,hdl,triglicerides,glicose,target,colesterol_hdl_cat
0,59,2,32.1,101.00,157,93.2,38.0,4.8598,87,151,4-5
1,48,1,21.6,87.00,183,103.2,70.0,3.8918,69,75,2-3
2,72,2,30.5,93.00,156,93.6,41.0,4.6728,85,141,4-5
3,24,1,25.3,84.00,198,131.4,40.0,4.8903,89,206,4-5
4,50,1,23.0,101.00,192,125.4,52.0,4.2905,80,135,4-5
...,...,...,...,...,...,...,...,...,...,...,...
437,60,2,28.2,112.00,185,113.8,42.0,4.9836,93,178,4-5
438,47,2,24.9,75.00,225,166.0,42.0,4.4427,102,104,4-5
439,60,2,24.9,99.67,162,106.6,43.0,4.1271,95,132,4-5
440,36,1,30.0,95.00,201,125.2,42.0,5.1299,85,220,4-5


In [26]:
# Print the frame summary: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   idade               442 non-null    int32   
 1   sexo                442 non-null    int32   
 2   imc                 442 non-null    float64 
 3   pressao_media       442 non-null    float64 
 4   colesterol_total    442 non-null    int32   
 5   ldl                 442 non-null    float64 
 6   hdl                 442 non-null    float64 
 7   triglicerides       442 non-null    float64 
 8   glicose             442 non-null    int32   
 9   target              442 non-null    int32   
 10  colesterol_hdl_cat  442 non-null    category
dtypes: category(1), float64(5), int32(5)
memory usage: 26.6 KB


In [27]:
# Split the frame into the response (y) and the predictor matrix (X).
y = df["target"]
X = df.drop(columns="target")

In [28]:
# Columns grouped by the transformation each one receives.
colunas_power_transform = ["imc", "ldl", "hdl", "colesterol_total"]
coluna_target = ["target"]
coluna_ordinal_encoder = ["colesterol_hdl_cat"]
coluna_one_hot_encoder = ["sexo"]

# Every column not claimed by one of the groups above (nor the target) goes to
# the standard scaler. The exclusion set is built once; iterating df.columns
# keeps the frame's original column order.
colunas_ja_atribuidas = {
    *colunas_power_transform,
    *coluna_target,
    *coluna_ordinal_encoder,
    *coluna_one_hot_encoder,
}
coluna_standard_scaler = [
    nome for nome in df.columns if nome not in colunas_ja_atribuidas
]

coluna_standard_scaler

['idade', 'pressao_media', 'triglicerides', 'glicose']

In [29]:
# One inner list per ordinally-encoded column, listing its categories in the
# order passed to OrdinalEncoder(categories=...).
categorias_ordinal_encoder = [["2-3", "4-5", "6+"]]

In [30]:
# Three alternative feature-preprocessing strategies, later compared against
# each other through the grid search's "regressor__preprocessor" entry.
#
# NOTE: the step name "stander_scaler" (sic) is kept as-is on purpose: step
# names are part of the estimator's parameter names, so renaming it could
# break code elsewhere that addresses the step by name.

# 1) Encode only the categorical columns; every other column is passed through
#    unchanged (remainder='passthrough').
preprocessamento_categoricas = ColumnTransformer(
    transformers=[
        ("ordinal_encoder", OrdinalEncoder(categories=categorias_ordinal_encoder), coluna_ordinal_encoder),
        ("one_hot_encoder", OneHotEncoder(drop='if_binary'), coluna_one_hot_encoder),
    ],
    remainder='passthrough',
)

# 2) Same encoders, plus a StandardScaler on every column of X that is not one
#    of the two categorical columns.
preprocessamento_simples = ColumnTransformer(
    transformers=[
        ("ordinal_encoder", OrdinalEncoder(categories=categorias_ordinal_encoder), coluna_ordinal_encoder),
        ("one_hot_encoder", OneHotEncoder(drop='if_binary'), coluna_one_hot_encoder),
        ("stander_scaler", StandardScaler(), X.columns.difference(coluna_ordinal_encoder + coluna_one_hot_encoder)),
    ],
    remainder='passthrough',
)

# 3) Box-Cox power transform on colunas_power_transform, StandardScaler on
#    coluna_standard_scaler, plus the same two encoders. No `remainder` is
#    given here, so only the columns listed explicitly are used.
#    Box-Cox requires strictly positive inputs (see scikit-learn docs).
preprocessamento_completo = ColumnTransformer([
    ("power_transform", PowerTransformer(method='box-cox'), colunas_power_transform),
    ("stander_scaler", StandardScaler(), coluna_standard_scaler),
    ("ordinal_encoder", OrdinalEncoder(categories=categorias_ordinal_encoder), coluna_ordinal_encoder),
    ("one_hot_encoder", OneHotEncoder(drop='if_binary'), coluna_one_hot_encoder),
])

# Transformer applied to the target: maps it to a normal distribution using
# 20 quantiles. random_state is pinned to the notebook-wide RANDOM_STATE so any
# randomness inside the transformer (e.g. subsampling) is reproducible.
target_transformer = QuantileTransformer(
    n_quantiles=20,
    output_distribution='normal',
    random_state=RANDOM_STATE,
)

In [31]:
# Base Ridge regressor with default settings; its alpha is tuned by the grid
# search through the "regressor__reg__alpha" entry of param_grid.
ridge_regressor = Ridge()

In [38]:
# Search space: 6 alpha values x 3 preprocessors x 2 target-transformer options
# (with / without the quantile transform) = 36 candidates.
# NOTE(review): the key prefixes presumably address steps of the estimator
# assembled inside grid_search_cv_regressor - confirm against src.modelos.
param_grid = dict(
    regressor__reg__alpha=[0.01, 0.05, 0.1, 1.0, 5, 10],
    regressor__preprocessor=[
        preprocessamento_categoricas,
        preprocessamento_simples,
        preprocessamento_completo,
    ],
    transformer=[target_transformer, None],
)


In [39]:
# Build the search object with the project helper. The preprocessor and target
# transformer passed here are the starting configuration; param_grid also lists
# alternatives for both.
# NOTE(review): the returned object is used below via fit / best_score_ /
# best_estimator_, so it is presumably a scikit-learn GridSearchCV - confirm.
grid_search = grid_search_cv_regressor(
    regressor=ridge_regressor,
    preprocessor=preprocessamento_categoricas,
    target_transformer=target_transformer,
    param_grid=param_grid,
)
grid_search

In [40]:
# Run the search on the full X / y. The log below reports
# 5 folds x 36 candidates = 180 fits.
grid_search.fit(X, y)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [41]:
# Score of the best candidate found by the search.
# NOTE(review): the value is negative, so the scorer is presumably a negated
# error metric - confirm which one in src.modelos.grid_search_cv_regressor.
grid_search.best_score_

-54.64487631919926

In [42]:
# Display the estimator configuration selected as best by the search.
grid_search.best_estimator_

## scores