In [16]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
import pandas as pd

# Load both encoded datasets (the original one and the resampled one); in each
# file the first CSV column is used as the DataFrame index.
df, resampled_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('encoded_data.csv', 'resampled_encoded_data.csv')
)

## Usando os dados originais

In [17]:
# Baseline: default Gaussian Naive Bayes on the original data.
#
# Features and target are derived WITHOUT mutating `df`. The previous form
# (`X = df` followed by `X.pop("diabetes")`) removed the column from `df`
# itself, so re-running this cell, or any later cell that looks up "diabetes"
# in `df`, failed with a KeyError.
X = df.drop(columns="diabetes")
y = df["diabetes"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out rows.
y_pred = classifier.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.97      0.93      0.95     18104
           1       0.32      0.51      0.39      1160

    accuracy                           0.90     19264
   macro avg       0.64      0.72      0.67     19264
weighted avg       0.93      0.90      0.91     19264



## Buscando os melhores parâmetros

In [None]:
# Hyper-parameter search for Gaussian Naive Bayes on the original data.
#
# This cell reuses X_train / X_test / y_train / y_test produced by the baseline
# cell of the previous section (test_size=0.2, random_state=42). Rebuilding them
# here with `X = df; y = X.pop("diabetes")` raised a KeyError whenever the
# baseline cell had already popped that column out of `df`, and an identical
# split carries no extra information anyway.
classifier = GaussianNB()

# `priors` needs exactly one probability per class. The target is binary
# (classes 0 and 1 in the report above), so the former three-entry candidate
# [0.2, 0.3, 0.5] could never be fitted. Candidates are listed in class order
# [P(class 0), P(class 1)]; None keeps the priors estimated from the data.
# The concrete values are starting points and can be tuned.
param_grid = {'priors': [None, [0.5, 0.5], [0.7, 0.3]],
              'var_smoothing': [1e-9, 1e-8, 1e-7]}

# Classes are heavily imbalanced (18104 vs 1160 rows in the held-out set), so
# candidates are ranked by macro-averaged F1 rather than the default accuracy,
# which mostly rewards predicting the majority class.
grid_search = GridSearchCV(estimator=classifier, param_grid=param_grid, cv=5,
                           scoring='f1_macro')
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print(best_params)

# Evaluate the refitted best candidate on the held-out rows.
y_pred = best_model.predict(X_test)

report = classification_report(y_test, y_pred)
print(report)

## Testando a modelagem com os dados balanceados

In [18]:
# Baseline: default Gaussian Naive Bayes on the resampled (balanced) data.
#
# Features and target are derived WITHOUT mutating `resampled_df`. The previous
# form (`X = resampled_df` followed by `X.pop("diabetes")`) removed the column
# from `resampled_df` itself, so a second run of this cell raised a KeyError.
X = resampled_df.drop(columns="diabetes")
y = resampled_df["diabetes"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

classifier = GaussianNB()

classifier.fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out rows.
y_pred = classifier.predict(X_test)

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.79      0.89      0.84      1187
           1       0.86      0.75      0.80      1109

    accuracy                           0.82      2296
   macro avg       0.83      0.82      0.82      2296
weighted avg       0.83      0.82      0.82      2296



## Aplicando o GridSearch

In [None]:
# Hyper-parameter search for Gaussian Naive Bayes on the resampled data.
# Uses `classifier`, X_train / y_train and X_test / y_test from the cell above;
# GridSearchCV fits clones, so the already-fitted `classifier` is left untouched.
#
# `priors` needs exactly one probability per class. The target is binary
# (classes 0 and 1 in the report above), so the former three-entry candidate
# [0.2, 0.3, 0.5] could never be fitted. Candidates are listed in class order
# [P(class 0), P(class 1)]; None keeps the priors estimated from the data.
# The concrete values are starting points and can be tuned.
param_grid = {'priors': [None, [0.4, 0.6], [0.6, 0.4]],
              'var_smoothing': [1e-9, 1e-8, 1e-7]}

grid_search = GridSearchCV(estimator=classifier, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print(best_params)

# Evaluate the refitted best candidate on the held-out rows.
y_pred = best_model.predict(X_test)

report = classification_report(y_test, y_pred)
print(report)