In [None]:
import pandas as pd
import plotly.express as px
from src.utils import dataframe_coeficientes

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, PowerTransformer, QuantileTransformer
from sklearn.model_selection import cross_validate, KFold
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor

import seaborn as sns

# Seed reused by the randomized steps of this notebook (e.g. the shuffled KFold split).
RANDOM_STATE = 42

In [19]:

# Load the dataset from its parquet file (path is relative to the notebook) and display it.
df = pd.read_parquet(path='../dados/diabete_categorizado.parquet')
df

Unnamed: 0,idade,sexo,imc,pressao_media,colesterol_total,ldl,hdl,triglicerides,glicose,target,colesterol_hdl_cat
0,59,2,32.1,101.00,157,93.2,38.0,4.8598,87,151,4-5
1,48,1,21.6,87.00,183,103.2,70.0,3.8918,69,75,2-3
2,72,2,30.5,93.00,156,93.6,41.0,4.6728,85,141,4-5
3,24,1,25.3,84.00,198,131.4,40.0,4.8903,89,206,4-5
4,50,1,23.0,101.00,192,125.4,52.0,4.2905,80,135,4-5
...,...,...,...,...,...,...,...,...,...,...,...
437,60,2,28.2,112.00,185,113.8,42.0,4.9836,93,178,4-5
438,47,2,24.9,75.00,225,166.0,42.0,4.4427,102,104,4-5
439,60,2,24.9,99.67,162,106.6,43.0,4.1271,95,132,4-5
440,36,1,30.0,95.00,201,125.2,42.0,5.1299,85,220,4-5


In [20]:
# Column groups: each list is routed to a different preprocessing step.
coluna_target = ["target"]
colunas_power_transform = ["imc", "ldl", "hdl", "colesterol_total"]
coluna_ordinal_encoder = ["colesterol_hdl_cat"]
coluna_one_hot_encoder = ["sexo"]

# Every column not claimed by one of the groups above falls back to standard scaling.
# The frame's original column order is preserved.
_colunas_ja_atribuidas = {
    *colunas_power_transform,
    *coluna_target,
    *coluna_ordinal_encoder,
    *coluna_one_hot_encoder,
}
coluna_standard_scaler = [
    nome for nome in df.columns if nome not in _colunas_ja_atribuidas
]

coluna_standard_scaler

['idade', 'pressao_media', 'triglicerides', 'glicose']

In [21]:
# Category labels in the order the OrdinalEncoder should rank them.
# The outer list holds one inner list per ordinally encoded column.
categorias_ordinal_encoder = [["2-3", "4-5", "6+"]]

In [22]:
# Separate the response variable from the predictors.
y = df['target']
X = df.drop('target', axis='columns')

In [None]:
# Apply a dedicated transformer to each column group. Columns not listed in any
# group are dropped by ColumnTransformer's default `remainder`.
# NOTE(review): the step names "stander_scaler" and "prepocessor" are misspelled; they are
# kept as-is because other code may look the steps up by these exact names.
preprocessamento = ColumnTransformer(
    transformers=[
        (
            "power_transform",
            PowerTransformer(method='box-cox'),  # box-cox requires strictly positive inputs
            colunas_power_transform,
        ),
        (
            "stander_scaler",
            StandardScaler(),
            coluna_standard_scaler,
        ),
        (
            "ordinal_encoder",
            OrdinalEncoder(categories=categorias_ordinal_encoder),
            coluna_ordinal_encoder,
        ),
        (
            "one_hot_encoder",
            OneHotEncoder(drop='if_binary'),
            coluna_one_hot_encoder,
        ),
    ],
)

# Preprocessing followed by an ordinary least-squares regression.
pipeline = Pipeline(
    steps=[
        ("prepocessor", preprocessamento),
        ("reg", LinearRegression()),
    ],
)

pipeline

In [24]:
# Wrap the whole pipeline so the target is mapped towards a normal distribution
# before fitting, and predictions are mapped back to the original target scale.
regressor = TransformedTargetRegressor(
    regressor=pipeline,
    transformer=QuantileTransformer(
        n_quantiles=20,
        output_distribution='normal',
        # Seed the transformer's subsampling so the target transform stays reproducible,
        # consistent with the seeded KFold split used for evaluation.
        random_state=RANDOM_STATE,
    ),
)
regressor

In [25]:
# 5-fold cross-validation with a seeded shuffle so the folds are reproducible.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# Evaluate the target-transformed regressor on each fold. The error metrics use
# scikit-learn's "neg_" convention, so values closer to zero are better.
scores = cross_validate(
    estimator=regressor,
    X=X,
    y=y,
    scoring=[
        'r2',
        'neg_mean_absolute_error',
        'neg_mean_squared_error',
        'neg_root_mean_squared_error',
    ],
    cv=kf,
)
scores

{'fit_time': array([0.11263871, 0.02866459, 0.02931333, 0.03306437, 0.05451679]),
 'score_time': array([0.01735187, 0.02140069, 0.00325584, 0.01324105, 0.00698161]),
 'test_r2': array([0.45111025, 0.57091426, 0.38387465, 0.57951993, 0.39804149]),
 'test_neg_mean_absolute_error': array([-42.0372685 , -40.89694828, -47.25776736, -42.49541009,
        -46.10165081]),
 'test_neg_mean_squared_error': array([-2908.10118168, -2676.89837283, -3353.52629815, -2829.98033076,
        -3363.49168702]),
 'test_neg_root_mean_squared_error': array([-53.9268132 , -51.73875117, -57.90963908, -53.19755944,
        -57.99561783])}

In [26]:
# Tabulate the cross-validation results: one row per fold, one column per timing/metric.
scores_df = pd.DataFrame.from_dict(scores)

scores_df

Unnamed: 0,fit_time,score_time,test_r2,test_neg_mean_absolute_error,test_neg_mean_squared_error,test_neg_root_mean_squared_error
0,0.112639,0.017352,0.45111,-42.037269,-2908.101182,-53.926813
1,0.028665,0.021401,0.570914,-40.896948,-2676.898373,-51.738751
2,0.029313,0.003256,0.383875,-47.257767,-3353.526298,-57.909639
3,0.033064,0.013241,0.57952,-42.49541,-2829.980331,-53.197559
4,0.054517,0.006982,0.398041,-46.101651,-3363.491687,-57.995618
