### Balanceamento

In [52]:
# carregar dataset
import numpy as np
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from utils import readDataframe_parquet
from utils import transformData



# Load the raw heart-disease dataset from the local parquet file.
dados = pd.read_parquet('data/heart_disease.parquet')

# Run the project's transformData helper (per the original note: adds dummy
# columns and type conversions).
df_transformed = transformData(dados)

# Only the last expression of a cell is rendered, so the preview has to be
# displayed explicitly; a bare .head() here would be silently discarded.
display(df_transformed.head())
df_transformed.shape

(253680, 58)

In [66]:
# Columns removed from the feature matrix: the target itself plus the
# features left out of this model.
# NOTE(review): five of the Education dummies are dropped while
# 'Education_Nunca foi a escola (ou apenas foi à pré-escola)' stays in x --
# confirm this is intentional.
colunas_removidas = [
    'HeartDiseaseorAttack',
    'Education_College 1-3',
    'Education_College 4 ou mais',
    'Education_Grades 1-8',
    'Education_Grades 12 ou GED',
    'Education_Grades 9-11',
    'MentHlth',
    'PhysHlth',
]
x = df_transformed.drop(columns=colunas_removidas)
y = df_transformed['HeartDiseaseorAttack']

In [67]:
# Class distribution of the target over the full dataset, before splitting.
y.value_counts()

HeartDiseaseorAttack
0.0    229787
1.0     23893
Name: count, dtype: int64

In [68]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split
# reproducible.
# NOTE(review): the split is not stratified; with a target this imbalanced,
# consider stratify=y so both splits keep the same class ratio.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

In [69]:
# Class distribution of the target inside the training split.
y_train.value_counts()

HeartDiseaseorAttack
0.0    153915
1.0     16050
Name: count, dtype: int64

In [70]:
# Class counts in the training split before any resampling.
contagem_antes = Counter(y_train)
print('Contagem antes do balanceamento:', contagem_antes)

Contagem antes do balanceamento: Counter({0.0: 153915, 1.0: 16050})


In [71]:
# Shrink the majority class with RandomUnderSampler. Only the training split
# is resampled; x_test / y_test are left as they were.
undersample = RandomUnderSampler(random_state=42)
x_train_balanced, y_train_balanced = undersample.fit_resample(x_train, y_train)

# Class counts after balancing.
contagem_depois = Counter(y_train_balanced)
print('Contagem após o balanceamento:', contagem_depois)

Contagem após o balanceamento: Counter({0.0: 16050, 1.0: 16050})


In [72]:
# Class distribution of the target after undersampling the training split.
y_train_balanced.value_counts()

HeartDiseaseorAttack
0.0    16050
1.0    16050
Name: count, dtype: int64

In [73]:

# Fit a logistic regression on the undersampled training data.
logistica = LogisticRegression(
    random_state=1,
    max_iter=200,
    penalty='l2',
    tol=0.0001,
    C=1,
    solver='lbfgs',
)
logistica.fit(x_train_balanced, y_train_balanced)

# Predict on the test split, which was not resampled.
# (The bare `previsores_logistica` expression that used to follow was removed:
# a mid-cell expression is never rendered, so it had no effect.)
previsores_logistica = logistica.predict(x_test)

# Per-class precision / recall / f1 on the test split.
print(classification_report(y_test, previsores_logistica))



              precision    recall  f1-score   support

         0.0       0.97      0.75      0.85     75872
         1.0       0.25      0.80      0.38      7843

    accuracy                           0.75     83715
   macro avg       0.61      0.77      0.61     83715
weighted avg       0.90      0.75      0.80     83715



In [61]:
# NOTE(review): this cell carries execution count 61 although it sits after
# cell 73, i.e. it was run out of order. DataFrame.value_counts() counts
# identical full rows across every column -- confirm this is still wanted
# here (a single-column count may have been the intent).
df_transformed.value_counts()

HeartDiseaseorAttack  HighBP  HighChol  CholCheck  Smoker  Stroke  PhysActivity  Fruits  Veggies  HvyAlcoholConsump  AnyHealthcare  NoDocbcCost  MentHlth  PhysHlth  DiffWalk  Sex  BMI_Abaixo do peso  BMI_Obesidade Grau I  BMI_Obesidade Grau II  BMI_Obesidade Grau III  BMI_Peso normal  BMI_Peso normal baixo  BMI_Sobrepeso  Diabetes_Diabético  Diabetes_Não possui diabetes  Diabetes_Pré-diabético  GenHlth_Boa  GenHlth_Execelente  GenHlth_Moderada  GenHlth_Pobre  GenHlth_Ruim  Age_18-24  Age_25-29  Age_30-34  Age_35-39  Age_40-44  Age_45-49  Age_50-54  Age_55-59  Age_60-64  Age_65-69  Age_70-74  Age_75-79  Age_Mais de 80  Education_College 1-3  Education_College 4 ou mais  Education_Grades 1-8  Education_Grades 12 ou GED  Education_Grades 9-11  Education_Nunca foi a escola (ou apenas foi à pré-escola)  Income_$10000-$14000  Income_$15000-$19999  Income_$20000-$24999  Income_$25000-$34999  Income_$35000-$49999  Income_$50000-$74999  Income_$75000 ou mais  Income_Menos de $10000
0.0         

In [74]:
import pickle
from pathlib import Path

# Persist the fitted model with pickle.
# Note: pickle files must only ever be loaded back from trusted sources.
caminho_modelo = Path('Models/regressao2_model.pkl')

# Create the target folder first; opening the file for writing raises
# FileNotFoundError when 'Models/' does not exist yet.
caminho_modelo.parent.mkdir(parents=True, exist_ok=True)

with caminho_modelo.open('wb') as f:
    pickle.dump(logistica, f)
