In [1]:
import warnings
warnings.filterwarnings("ignore")

import time
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import pandas as pd

# xgboost
from xgboost import XGBClassifier

# sklearn
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, KFold, cross_validate, cross_val_score, RandomizedSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# scipy: paired t-test and distributions for the hyperparameter search
from scipy.stats import ttest_rel
from scipy.stats import randint, uniform, loguniform


# Local imports
# setup_path() is called before the src.* imports below; presumably it makes the
# project root importable from the notebook folder — TODO confirm in setup_notebook.
from setup_notebook import setup_path
setup_path()
# NOTE(review): star imports hide which names these modules inject into the
# notebook namespace; explicit imports would make the dependencies visible.
from src.model_utils import *
from src.preprocess_utils_diab import *
# Print the wall-clock start time and keep a timestamp in `start_inicial`
# (presumably used later to report total elapsed time — not visible here).
print("\n#Processo iniciado em:", time.strftime("%H:%M:%S"))
start_inicial = time.time()


#Processo iniciado em: 17:01:31


In [4]:
# Parent of the current working directory; the paths below are built from it.
BASE = Path.cwd().parent

# Load the joblib bundle and keep the object stored under its 'preprocessador' key.
PP2 = joblib.load(BASE/'src'/'preprocess_diabetes_v1.2.joblib')['preprocessador']

# DATA_DIR used to be defined only in a commented-out line, so on a fresh kernel
# the reads below raised NameError unless another import happened to provide it.
# Keep an already-defined DATA_DIR if there is one, otherwise fall back to the
# location from the commented-out definition.
# NOTE(review): confirm that the pre-split CSVs really live under data/raw.
DATA_DIR = globals().get("DATA_DIR", BASE / "data" / "raw")

# Pre-split features and targets; targets are flattened to 1-D arrays.
X_train = pd.read_csv(DATA_DIR/"X_train_raw.csv")
X_test  = pd.read_csv(DATA_DIR/"X_test_raw.csv")
y_train = pd.read_csv(DATA_DIR/"y_train_raw.csv").values.ravel()
y_test  = pd.read_csv(DATA_DIR/"y_test_raw.csv").values.ravel()

# Scoring metric name (scikit-learn scorer string).
mtd_scoring='roc_auc'



In [5]:
# Names of every object-dtype column in the training frame; the cells below
# treat these as the categorical features.
cat_cols = [name for name in X_train.select_dtypes(include="object").columns]
print("Categorical columns:", cat_cols)


Categorical columns: ['gender', 'ethnicity', 'education_level', 'income_level', 'smoking_status', 'employment_status']


In [9]:
# Prepare XGBoost features: replace each categorical column with integer codes.
# LabelEncoder is imported in the notebook's top import cell.
X_xgb = X_train.copy()
X_test_xgb = X_test.copy()

# One fitted encoder per column, kept so the same mapping can be reused on new
# data or inverted back to the original labels later.
label_encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    # Fit on the training column only, then apply that same mapping to the test
    # column. transform() raises ValueError if the test column has an unseen label.
    X_xgb[col] = le.fit_transform(X_xgb[col])
    X_test_xgb[col] = le.transform(X_test_xgb[col])
    label_encoders[col] = le

X_xgb[cat_cols]

Unnamed: 0,gender,ethnicity,education_level,income_level,smoking_status,employment_status
0,0,2,1,1,2,0
1,0,2,1,3,1,0
2,1,4,0,2,1,0
3,1,4,3,2,2,0
4,1,4,1,4,2,0
...,...,...,...,...,...,...
489995,0,0,1,2,2,0
489996,0,4,1,3,0,0
489997,1,1,1,1,0,1
489998,1,4,3,2,2,1


In [20]:
# LightGBM can handle native category types
X_lgb = X_train.copy()
X_test_lgb = X_test.copy()
for col in cat_cols:
    X_lgb[col] = X_lgb[col].astype("category")
    # Cast the test column with the *training* categorical dtype so both frames
    # share the same category list and therefore the same integer codes.
    # Casting each frame independently can give different codes when the two
    # sets of observed levels differ. Levels unseen in training become NaN.
    X_test_lgb[col] = X_test_lgb[col].astype(X_lgb[col].dtype)


# Compare the deep memory footprint of the categorical columns before and after the cast.
mem_lgb = X_lgb[cat_cols].memory_usage(deep=True).sum()
mem_raw = X_train[cat_cols].memory_usage(deep=True).sum()

print(f"Memória X_lgb (category): {mem_lgb / 1024**2:.2f} MB")
print(f"Memória X_train (object): {mem_raw / 1024**2:.2f} MB")
print(f"Redução: {(1 - mem_lgb / mem_raw) * 100:.1f}%")

Memória X_lgb (category): 2.81 MB
Memória X_train (object): 179.29 MB
Redução: 98.4%


In [28]:
# Build three alternative encodings of the training frame so their memory
# footprints can be compared:
#   X_le - categorical columns replaced by LabelEncoder integer codes
#   X_as - categorical columns cast to pandas' `category` dtype
#   X_on - categorical columns replaced by one-hot indicator columns
# LabelEncoder and OneHotEncoder are imported in the notebook's top import cell.
X_le = X_train.copy()
X_as = X_train.copy()
X_on = X_train.copy()
le = LabelEncoder()

for col in cat_cols:
    X_as[col] = X_as[col].astype("category")
    # fit_transform refits the encoder on every column, so one instance is enough here.
    X_le[col] = le.fit_transform(X_le[col])

# One-hot encoding: dense output so it can be wrapped in a DataFrame directly.
enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ohe_cols = None  # filled in after fitting

X_cat_ohe = enc.fit_transform(X_on[cat_cols])
ohe_cols = enc.get_feature_names_out(cat_cols)

# Reuse X_on's index so the concat below aligns row by row.
X_cat_ohe = pd.DataFrame(
    X_cat_ohe,
    columns=ohe_cols,
    index=X_on.index
)

X_on = pd.concat(
    [X_on.drop(columns=cat_cols), X_cat_ohe],
    axis=1
)

# Sanity check: the encoding must not add or drop rows.
assert len(X_on) == len(X_train), "one-hot encoding changed the number of rows"

# Show the one-hot encoded frame as the cell output.
X_on


Unnamed: 0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,...,income_level_Lower-Middle,income_level_Middle,income_level_Upper-Middle,smoking_status_Current,smoking_status_Former,smoking_status_Never,employment_status_Employed,employment_status_Retired,employment_status_Student,employment_status_Unemployed
0,56,2,90,9.6,6.2,4.5,26.4,0.88,102,82,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,70,1,65,8.1,6.9,5.3,25.8,0.84,129,76,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,52,2,90,6.7,5.8,7.3,31.1,0.85,139,80,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,37,4,150,6.6,6.9,6.3,22.1,0.81,107,77,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,31,1,76,8.2,5.8,4.7,29.6,0.89,114,80,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
489995,52,1,135,6.9,7.4,5.2,29.7,0.86,123,64,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
489996,56,1,29,8.9,6.2,7.8,25.8,0.86,123,77,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
489997,65,1,107,8.6,7.3,3.3,22.6,0.81,112,90,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
489998,49,4,51,5.7,5.9,3.5,25.8,0.88,107,71,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [29]:
# Deep memory footprint (string payloads included) of each encoded variant of X_train.
mem_le = X_le.memory_usage(deep=True).sum()
mem_on = X_on.memory_usage(deep=True).sum()
mem_as = X_as.memory_usage(deep=True).sum()

# The labels name the encoding each frame actually uses. The previous labels
# called the label-encoded frame "category" and the one-hot frame "object".
print(f"Memória X_le (label encoded): {mem_le / 1024**2:.2f} MB")
print(f"Memória X_on (one-hot): {mem_on / 1024**2:.2f} MB")
print(f"Memória X_as (category): {mem_as / 1024**2:.2f} MB")

Memória X_le (category): 89.72 MB
Memória X_on (object): 157.01 MB
Memória X_as (category): 70.10 MB


In [31]:
# Display X_le, the copy of X_train whose categorical columns were replaced by
# LabelEncoder integer codes.
X_le

Unnamed: 0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,...,triglycerides,gender,ethnicity,education_level,income_level,smoking_status,employment_status,family_history_diabetes,hypertension_history,cardiovascular_history
0,56,2,90,9.6,6.2,4.5,26.4,0.88,102,82,...,87,0,2,1,1,2,0,0,0,0
1,70,1,65,8.1,6.9,5.3,25.8,0.84,129,76,...,135,0,2,1,3,1,0,0,0,0
2,52,2,90,6.7,5.8,7.3,31.1,0.85,139,80,...,145,1,4,0,2,1,0,0,0,0
3,37,4,150,6.6,6.9,6.3,22.1,0.81,107,77,...,93,1,4,3,2,2,0,0,0,0
4,31,1,76,8.2,5.8,4.7,29.6,0.89,114,80,...,96,1,4,1,4,2,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
489995,52,1,135,6.9,7.4,5.2,29.7,0.86,123,64,...,177,0,0,1,2,2,0,0,0,0
489996,56,1,29,8.9,6.2,7.8,25.8,0.86,123,77,...,148,0,4,1,3,0,0,1,1,0
489997,65,1,107,8.6,7.3,3.3,22.6,0.81,112,90,...,110,1,1,1,1,0,1,0,1,0
489998,49,4,51,5.7,5.9,3.5,25.8,0.88,107,71,...,103,1,4,3,2,2,1,0,0,0
