In [1]:
# Ensure project root is on sys.path for imports
import sys
import os
sys.path.insert(0, os.path.abspath('..'))
print('root on sys.path:', sys.path[0])

root on sys.path: g:\FIAP-Pos-data-analytics\Pos_Data_Analytics_Curso\Challenges_Fases\Arthur_challenge_4\fiap-tech4


In [2]:
# Libs
import pickle
import pandas as pd

from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    StratifiedKFold,
    KFold,
)
from sklearn.metrics import (
    accuracy_score, f1_score, recall_score,
    precision_score, confusion_matrix, classification_report
)
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator
from joblib import dump, load

# Models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# local modules
from src.models.train_pipeline import train_model
from src.models.production_pipeline import load_model, preprocess_input, predict_from_input
from src.utils.validation import cross_validate_model
# optional: model_tuning may not be present in this workspace
try:
    from src.model_tuning import tune_model
except Exception:
    tune_model = None

## Extrair dataframe para modelo

In [3]:
df_model_final = pd.read_csv('../data/df_model_final.csv')
df_model_final

Unnamed: 0,hist_familiar_obes,cons_altas_cal_freq,cons_verduras,refeicoes_principais_dia,lancha_entre_ref_bin,fuma,agua_dia,controle_calorias,ativ_fisica_bin,uso_tecnologia,cons_alcool_bin,trasporte_bin,target_obesidade
0,1,0,2.0,3.0,0,0,2.0,0,0,1.0,0,1,0
1,1,0,3.0,3.0,0,1,3.0,1,1,0.0,0,1,0
2,1,0,2.0,3.0,0,0,2.0,0,1,1.0,1,1,0
3,0,0,3.0,3.0,0,0,2.0,0,1,0.0,1,0,0
4,0,0,2.0,1.0,0,0,2.0,0,0,0.0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,1,1,3.0,3.0,0,0,2.0,0,1,1.0,0,1,1
2107,1,1,3.0,3.0,0,0,2.0,0,0,1.0,0,1,1
2108,1,1,3.0,3.0,0,0,2.0,0,0,1.0,0,1,1
2109,1,1,3.0,3.0,0,0,3.0,0,0,1.0,0,1,1


In [4]:
df_model_final.columns

Index(['hist_familiar_obes', 'cons_altas_cal_freq', 'cons_verduras',
       'refeicoes_principais_dia', 'lancha_entre_ref_bin', 'fuma', 'agua_dia',
       'controle_calorias', 'ativ_fisica_bin', 'uso_tecnologia',
       'cons_alcool_bin', 'trasporte_bin', 'target_obesidade'],
      dtype='object')

In [5]:
X = df_model_final.drop('target_obesidade', axis=1)
y = df_model_final['target_obesidade']

### Tunando o Modelo XGBoost

In [7]:
train_model(
    X,
    y,
    model=DecisionTreeClassifier(),
    save_model=False,
    model_name='decision_tree_base_model',
    save_type='joblib'
)


==== TEST METRICS ====
Accuracy: 0.7728706624605678
F1: 0.7725315419733195
Recall: 0.7728706624605678
Precision: 0.786199043254886

Confusion Matrix:
 [[240 102]
 [ 42 250]]

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.70      0.77       342
           1       0.71      0.86      0.78       292

    accuracy                           0.77       634
   macro avg       0.78      0.78      0.77       634
weighted avg       0.79      0.77      0.77       634



(Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('num',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(strategy='median')),
                                                                   ('scaler',
                                                                    StandardScaler())]),
                                                   Index(['hist_familiar_obes', 'cons_altas_cal_freq', 'cons_verduras',
        'refeicoes_principais_dia', 'lancha_entre_ref_bin', 'fuma', 'agua_dia',
        'controle_calorias', 'ativ_fisica_bin', 'uso_tecnologia',
        'cons_alcool_bin', 'trasporte_bin'],
       dtype='object')),
                                                  ('cat',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(stra

In [21]:
param_dist_decision_tree = {
    "criterion": ["gini", "entropy"],
    "max_depth": [None, 10, 20, 30, 40, 50],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": [None, "sqrt", "log2"]
}

base_decision_tree = DecisionTreeClassifier(random_state=42)
best_decision_tree = tune_model(base_model=base_decision_tree, param_dist=param_dist_decision_tree, X=X, y=y)

✅ Best CV score: 0.8086
✅ Best params:
  - model__min_samples_split: 2
  - model__min_samples_leaf: 2
  - model__max_features: None
  - model__max_depth: 20
  - model__criterion: entropy


In [22]:
train_model(
    X,
    y,
    model=best_decision_tree,
    save_model=False,
    model_name='decision_tree_tuned_model',
    save_type='joblib'
)


==== TEST METRICS ====
Accuracy: 0.7665615141955836
F1: 0.7662129736948006
Recall: 0.7665615141955836
Precision: 0.7797564205041217

Confusion Matrix:
 [[238 104]
 [ 44 248]]

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.70      0.76       342
           1       0.70      0.85      0.77       292

    accuracy                           0.77       634
   macro avg       0.77      0.77      0.77       634
weighted avg       0.78      0.77      0.77       634



(Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('num',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(strategy='median')),
                                                                   ('scaler',
                                                                    StandardScaler())]),
                                                   Index(['hist_familiar_obes', 'cons_altas_cal_freq', 'cons_verduras',
        'refeicoes_principais_dia', 'lancha_entre_ref_bin', 'fuma', 'agua_dia',
        'controle_calorias', 'ativ_fisica_bin', 'uso_tecnologia',
        'cons_alcool_bin', 'trasporte_bin'],
       dtype='object')),
                                                  ('cat',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(stra

In [8]:
train_model(
    X,
    y,
    model=XGBClassifier(),
    save_model=False,
    model_name='xgb_model',
    save_type='joblib'
)


==== TEST METRICS ====
Accuracy: 0.7917981072555205
F1: 0.7911635982604224
Recall: 0.7917981072555205
Precision: 0.8090651152199887

Confusion Matrix:
 [[242 100]
 [ 32 260]]

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.71      0.79       342
           1       0.72      0.89      0.80       292

    accuracy                           0.79       634
   macro avg       0.80      0.80      0.79       634
weighted avg       0.81      0.79      0.79       634



(Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('num',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(strategy='median')),
                                                                   ('scaler',
                                                                    StandardScaler())]),
                                                   Index(['hist_familiar_obes', 'cons_altas_cal_freq', 'cons_verduras',
        'refeicoes_principais_dia', 'lancha_entre_ref_bin', 'fuma', 'agua_dia',
        'controle_calorias', 'ativ_fisica_bin', 'uso_tecnologia'...
                                feature_types=None, feature_weights=None,
                                gamma=None, grow_policy=None,
                                importance_type=None,
                                interaction_constraints=None, learning_rate=None,
                   

In [25]:
param_dist_xgb = {
    "n_estimators": [200, 400, 600, 800, 1000, 1200],
    "learning_rate": [0.05, 0.1, 0.2, 0.3, 0.4, 0.5],
    "max_depth": [3, 4, 6, 8, 10],
    "subsample": [0.5, 0.7, 0.8, 1.0],
    "colsample_bytree": [0.5, 0.7, 0.8, 1.0],
}

base_xgb = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

best_xgb = tune_model(base_xgb, param_dist_xgb, X, y)

✅ Best CV score: 0.8134
✅ Best params:
  - model__subsample: 0.8
  - model__n_estimators: 200
  - model__max_depth: 6
  - model__learning_rate: 0.3
  - model__colsample_bytree: 0.7


In [26]:
train_model(
    X,
    y,
    model=best_xgb,
    save_model=True,
    model_name='xgb_model',
    save_type='joblib'
)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



==== TEST METRICS ====
Accuracy: 0.7839116719242902
F1: 0.7832989140784172
Recall: 0.7839116719242902
Precision: 0.8004872854446239

Confusion Matrix:
 [[240 102]
 [ 35 257]]

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.70      0.78       342
           1       0.72      0.88      0.79       292

    accuracy                           0.78       634
   macro avg       0.79      0.79      0.78       634
weighted avg       0.80      0.78      0.78       634


✅ Modelo salvo em: xgb_model.joblib


(Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('num',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(strategy='median')),
                                                                   ('scaler',
                                                                    StandardScaler())]),
                                                   Index(['hist_familiar_obes', 'cons_altas_cal_freq', 'cons_verduras',
        'refeicoes_principais_dia', 'lancha_entre_ref_bin', 'fuma', 'agua_dia',
        'controle_calorias', 'ativ_fisica_bin', 'uso_tecnologia'...
                                feature_types=None, feature_weights=None,
                                gamma=None, grow_policy=None,
                                importance_type=None,
                                interaction_constraints=None, learning_rate=0.3,
                    

In [13]:
train_model(X,
            y,
            model=RandomForestClassifier(min_samples_split=10, n_estimators=100, random_state=42),
            save_model=False,
            model_name='random_forest_final',
            save_type='joblib')


==== TEST METRICS ====
Accuracy: 0.7917981072555205
F1: 0.7912507803978203
Recall: 0.7917981072555205
Precision: 0.8081389468457707

Confusion Matrix:
 [[243  99]
 [ 33 259]]

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.71      0.79       342
           1       0.72      0.89      0.80       292

    accuracy                           0.79       634
   macro avg       0.80      0.80      0.79       634
weighted avg       0.81      0.79      0.79       634



(Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('num',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(strategy='median')),
                                                                   ('scaler',
                                                                    StandardScaler())]),
                                                   Index(['hist_familiar_obes', 'cons_altas_cal_freq', 'cons_verduras',
        'refeicoes_principais_dia', 'lancha_entre_ref_bin', 'fuma', 'agua_dia',
        'controle_calorias', 'ativ_fisica_bin', 'uso_tecnologia',
        'cons_alcool_bin', 'trasporte_bin'],
       dtype='object')),
                                                  ('cat',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(stra

In [14]:
param_dist_random = {
    "n_estimators": [100, 200, 300, 400, 500],
    "max_depth": [None, 10, 20, 30, 40, 50],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True, False]
}

base_random = RandomForestClassifier(random_state=42)
best_random = tune_model(base_random, param_dist_random, X, y)

✅ Best CV score: 0.8119
✅ Best params:
  - model__n_estimators: 400
  - model__min_samples_split: 10
  - model__min_samples_leaf: 1
  - model__max_depth: 30
  - model__bootstrap: False


In [15]:
train_model(X,
            y,
            model=best_random,
            save_model=False,
            model_name='random_forest_final',
            save_type='joblib')


==== TEST METRICS ====
Accuracy: 0.7902208201892744
F1: 0.7896259530834271
Recall: 0.7902208201892744
Precision: 0.8069762660035323

Confusion Matrix:
 [[242 100]
 [ 33 259]]

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.71      0.78       342
           1       0.72      0.89      0.80       292

    accuracy                           0.79       634
   macro avg       0.80      0.80      0.79       634
weighted avg       0.81      0.79      0.79       634



(Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('num',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(strategy='median')),
                                                                   ('scaler',
                                                                    StandardScaler())]),
                                                   Index(['hist_familiar_obes', 'cons_altas_cal_freq', 'cons_verduras',
        'refeicoes_principais_dia', 'lancha_entre_ref_bin', 'fuma', 'agua_dia',
        'controle_calorias', 'ativ_fisica_bin', 'uso_tecnologia',
        'cons_alcool_bin', 'trasporte_bin'],
       dtype='object')),
                                                  ('cat',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(stra

# Testando Modelo em Produção

In [7]:
sample_input = {
    'Gender': 'Female',
    'Age': 26,
    'Height': 1.56,
    'Weight': 102.0,
    'family_history': 'yes', # Histórico familiar de obesidade
    'FAVC': 'yes', # Consumo de alimentos com alto valor calórico
    'FCVC': 1, # Frequência de consumo de vegetais
    'NCP': 3, # Número de refeições principais por dia
    'CAEC': 'Sometimes', # Consumo de alimentos entre as refeições
    'SMOKE': 'yes', 
    'CH2O': 1, # Consumo de água por dia (litros)
    'SCC': 'no', # Monitoramento do consumo calórico diaro
    'FAF': 0, # Frequência de atividade física
    'TUE': 2, # Tempo de uso de tecnologia por dia (horas)
    'CALC': 'Sometimes', # Consumo de bebidas alcoólicas
    'MTRANS': 'Public_Transportation' # Meio de transporte predominante
}


In [8]:
res = preprocess_input(sample_input)

In [9]:
print(type(res))
print(res)

<class 'pandas.core.frame.DataFrame'>
   hist_familiar_obes  cons_altas_cal_freq  cons_verduras  \
0                   1                    1              1   

   refeicoes_principais_dia  lancha_entre_ref_bin  fuma  agua_dia  \
0                         3                     0     1         1   

   controle_calorias  ativ_fisica_bin  uso_tecnologia  cons_alcool_bin  \
0                  0                0               2                0   

   trasporte_bin  
0              1  


In [12]:
model = load_model('../src/models/random_forest_final.joblib')
print('model type:', type(model))
print('model:', model)

pred = predict_from_input(model, sample_input)
print('prediction:', pred)

model type: <class 'sklearn.pipeline.Pipeline'>
model: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['hist_familiar_obes', 'cons_altas_cal_freq', 'cons_verduras',
       'refeicoes_principais_dia', 'lancha_entre_ref_bin', 'fuma', 'agua_dia',
       'controle_calorias', 'ativ_fisica_bin', 'uso_tecnologia',
       'cons_alcool_bin', 'trasporte_bin'],
      dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                            

In [13]:
res

Unnamed: 0,hist_familiar_obes,cons_altas_cal_freq,cons_verduras,refeicoes_principais_dia,lancha_entre_ref_bin,fuma,agua_dia,controle_calorias,ativ_fisica_bin,uso_tecnologia,cons_alcool_bin,trasporte_bin
0,1,1,1,3,0,1,1,0,0,2,0,1


In [11]:
model = load_model('xgb_model.joblib')

resultado = predict_from_input(model, sample_input)

print(resultado)

{'mensagem': '⚠️ Há indícios de que pode ter obesidade.', 'probabilidade': 'Probabilidade estimada: 71.53%'}


# Funções

#### ==========================================================
## FUNÇÃO: Cross-Validation
#### ==========================================================

In [44]:
def cross_validate_model(X, y, model, cv=5, stratified=True):
    import warnings
    from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold
    from sklearn.preprocessing import StandardScaler
    from sklearn.compose import ColumnTransformer
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import Pipeline

    if stratified:
        kf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    else:
        kf = KFold(n_splits=cv, shuffle=True, random_state=42)

    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object', 'category']).columns

    numeric_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent"))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features)
        ]
    )

    pipe = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", model)
    ])

    scoring_methods = ["accuracy", "f1_weighted", "recall_weighted", "precision_weighted"]

    results = {}

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        for metric in scoring_methods:
            score = cross_val_score(pipe, X, y, cv=kf, scoring=metric)
            results[metric] = score
            print(f"{metric}: {score.mean():.4f} ± {score.std():.4f}")

    return results


### Treino e Validação Cruzada dos Modelos.

In [10]:
display(res)
display(X)

Unnamed: 0,hist_familiar_obes,cons_altas_cal_freq,cons_verduras,refeicoes_principais_dia,lancha_entre_ref_bin,fuma,agua_dia,controle_calorias,ativ_fisica_bin,uso_tecnologia,cons_alcool_bin,trasporte_bin
0,1,1,1,3,1,0,2.0,0,0,2,1,1


Unnamed: 0,hist_familiar_obes,cons_altas_cal_freq,cons_verduras,refeicoes_principais_dia,lancha_entre_ref_bin,fuma,agua_dia,controle_calorias,ativ_fisica_bin,uso_tecnologia,cons_alcool_bin,trasporte_bin
0,1,0,2.0,3.0,0,0,2.0,0,0,1.0,0,1
1,1,0,3.0,3.0,0,1,3.0,1,1,0.0,0,1
2,1,0,2.0,3.0,0,0,2.0,0,1,1.0,1,1
3,0,0,3.0,3.0,0,0,2.0,0,1,0.0,1,0
4,0,0,2.0,1.0,0,0,2.0,0,0,0.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
2106,1,1,3.0,3.0,0,0,2.0,0,1,1.0,0,1
2107,1,1,3.0,3.0,0,0,2.0,0,0,1.0,0,1
2108,1,1,3.0,3.0,0,0,2.0,0,0,1.0,0,1
2109,1,1,3.0,3.0,0,0,3.0,0,0,1.0,0,1


In [29]:
### Treinar Modelo Floresta Aleatória.
rf = train_model(X, y, model=RandomForestClassifier(n_estimators=100, random_state=42), save_model=True, model_name="random_forest_final", save_type="joblib")


==== TEST METRICS ====
Accuracy: 0.7902208201892744
F1: 0.7893434022704443
Recall: 0.7902208201892744
Precision: 0.8098718234376651

Confusion Matrix:
 [[239 103]
 [ 30 262]]

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.70      0.78       342
           1       0.72      0.90      0.80       292

    accuracy                           0.79       634
   macro avg       0.80      0.80      0.79       634
weighted avg       0.81      0.79      0.79       634


✅ Modelo salvo em: random_forest_final.joblib


In [28]:
# Validação Cruzada Floresta Aleatória.
rf_results = cross_validate_model(X, y, model=RandomForestClassifier(n_estimators=100, random_state=42), cv=5, stratified=True)

accuracy: 0.8091 ± 0.0063
f1_weighted: 0.8092 ± 0.0064
recall_weighted: 0.8091 ± 0.0063
precision_weighted: 0.8166 ± 0.0044


In [None]:
# Treinar Modelo Regressão Logistica.
lr = train_model(X, y, model=LogisticRegression(max_iter=1000, random_state=42), save_model=False, model_name="logistic_regression_model", save_type="joblib")


==== TEST METRICS ====
Accuracy: 0.7318611987381703
F1: 0.727647312229901
Recall: 0.7318611987381703
Precision: 0.7684894443096336

Confusion Matrix:
 [[203 139]
 [ 31 261]]

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.59      0.70       342
           1       0.65      0.89      0.75       292

    accuracy                           0.73       634
   macro avg       0.76      0.74      0.73       634
weighted avg       0.77      0.73      0.73       634



In [None]:
# Validação Cruzada Regressão Logística.
lr_results = cross_validate_model(X, y, model=LogisticRegression(max_iter=1000, random_state=42), cv=5, stratified=True)

In [None]:
# Treinar Modelo SVM.
svm = train_model(X, y, model=SVC(probability=True, random_state=42), save_model=False, model_name="svm_model", save_type="joblib")


==== TEST METRICS ====
Accuracy: 0.7634069400630915
F1: 0.7604454992543802
Recall: 0.7634069400630915
Precision: 0.7974087958827194

Confusion Matrix:
 [[217 125]
 [ 25 267]]

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.63      0.74       342
           1       0.68      0.91      0.78       292

    accuracy                           0.76       634
   macro avg       0.79      0.77      0.76       634
weighted avg       0.80      0.76      0.76       634



In [None]:
# Validação Cruzada SVM.
svm_results = cross_validate_model(X, y, model=SVC(probability=True, random_state=42), cv=5, stratified=True)

In [None]:
# Treinar Modelo Decision Tree.
dt = train_model(X, y, model=DecisionTreeClassifier(random_state=42), save_model=False, model_name="decision_tree_model", save_type="joblib")


==== TEST METRICS ====
Accuracy: 0.7649842271293376
F1: 0.7645917884826388
Recall: 0.7649842271293376
Precision: 0.7785376569356345

Confusion Matrix:
 [[237 105]
 [ 44 248]]

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.69      0.76       342
           1       0.70      0.85      0.77       292

    accuracy                           0.76       634
   macro avg       0.77      0.77      0.76       634
weighted avg       0.78      0.76      0.76       634



In [None]:
# Validação Cruzada Decision Tree.
dt_results = cross_validate_model(X, y, model=DecisionTreeClassifier(random_state=42), cv=5, stratified=True)

In [47]:
# Treinar Modelo XGBoost.
xb = train_model(X, y, model=XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42), save_model=False, model_name="xgboost_model", save_type="joblib")


==== TEST METRICS ====
Accuracy: 0.7917981072555205
F1: 0.7911635982604224
Recall: 0.7917981072555205
Precision: 0.8090651152199887

Confusion Matrix:
 [[242 100]
 [ 32 260]]

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.71      0.79       342
           1       0.72      0.89      0.80       292

    accuracy                           0.79       634
   macro avg       0.80      0.80      0.79       634
weighted avg       0.81      0.79      0.79       634



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [48]:
# Validação Cruzada XGBoost.
xb_results = cross_validate_model(X, y, model=XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42), cv=5, stratified=True)

accuracy: 0.8096 ± 0.0066
f1_weighted: 0.8097 ± 0.0067
recall_weighted: 0.8096 ± 0.0066
precision_weighted: 0.8165 ± 0.0049


In [None]:
# Treinar Modelo CatBoost.
ct = train_model(X, y, model=CatBoostClassifier(verbose=0, random_state=42), save_model=False, model_name="catboost_model", save_type="joblib")


==== TEST METRICS ====
Accuracy: 0.777602523659306
F1: 0.7765634928639306
Recall: 0.777602523659306
Precision: 0.7977936620903788

Confusion Matrix:
 [[234 108]
 [ 33 259]]

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.68      0.77       342
           1       0.71      0.89      0.79       292

    accuracy                           0.78       634
   macro avg       0.79      0.79      0.78       634
weighted avg       0.80      0.78      0.78       634



In [None]:
# Validação Cruzada CatBoost.
ct_results = cross_validate_model(X, y, model=CatBoostClassifier(verbose=0, random_state=42), cv=5, stratified=True)

In [None]:
# Treinar Modelo LightGBM.
lb = train_model(X, y, model=LGBMClassifier(random_state=42, verbose=False), save_model=False, model_name="lightgbm_model", save_type="joblib")

[LightGBM] [Info] Number of positive: 680, number of negative: 797
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0,000452 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 41
[LightGBM] [Info] Number of data points in the train set: 1477, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0,460393 -> initscore=-0,158762
[LightGBM] [Info] Start training from score -0,158762

==== TEST METRICS ====
Accuracy: 0.7886435331230284
F1: 0.7879066519566961
Recall: 0.7886435331230284
Precision: 0.8067641368902686

Confusion Matrix:
 [[240 102]
 [ 32 260]]

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.70      0.78       342
           1       0.72      0.89      0.80       292

    accuracy                           0.79       634
   macro avg       0.80      0



In [None]:
# Treinar Modelo Gaussian Naive Bayes.
gnbs = train_model(X, y, model=GaussianNB(), save_model=False, model_name="gaussian_nb_model", save_type="joblib")


==== TEST METRICS ====
Accuracy: 0.7271293375394322
F1: 0.7182356981921748
Recall: 0.7271293375394322
Precision: 0.7900644017655872

Confusion Matrix:
 [[185 157]
 [ 16 276]]

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.54      0.68       342
           1       0.64      0.95      0.76       292

    accuracy                           0.73       634
   macro avg       0.78      0.74      0.72       634
weighted avg       0.79      0.73      0.72       634



In [None]:
# Validação Cruzada Gaussian Naive Bayes.
gnbs_results = cross_validate_model(X, y, model=GaussianNB(), cv=5, stratified=True)

In [None]:
# Treinar Modelo K-Nearest Neighbors.
kn = train_model(X, y, model=KNeighborsClassifier(), save_model=False, model_name="knn_model", save_type="joblib")


==== TEST METRICS ====
Accuracy: 0.7618296529968455
F1: 0.7604693242507645
Recall: 0.7618296529968455
Precision: 0.7833977640489249

Confusion Matrix:
 [[227 115]
 [ 36 256]]

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.66      0.75       342
           1       0.69      0.88      0.77       292

    accuracy                           0.76       634
   macro avg       0.78      0.77      0.76       634
weighted avg       0.78      0.76      0.76       634



In [None]:
# Validação Cruzada K-Nearest Neighbors.
kn_results = cross_validate_model(X, y, model=KNeighborsClassifier(), cv=5, stratified=True)

In [None]:
# Treinar Modelo Gradient Boosting.
gbc = train_model(X, y, model=GradientBoostingClassifier(random_state=42), save_model=False, model_name="gradient_boosting_model", save_type="joblib")


==== TEST METRICS ====
Accuracy: 0.7681388012618297
F1: 0.7672777996869977
Recall: 0.7681388012618297
Precision: 0.7860628385985088

Confusion Matrix:
 [[233 109]
 [ 38 254]]

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.68      0.76       342
           1       0.70      0.87      0.78       292

    accuracy                           0.77       634
   macro avg       0.78      0.78      0.77       634
weighted avg       0.79      0.77      0.77       634



In [None]:
# Validação Cruzada Gradient Boosting.
gbc_results = cross_validate_model(X, y, model=GradientBoostingClassifier(random_state=42), cv=5, stratified=True)

accuracy: 0.7935 ± 0.0072
f1_weighted: 0.7931 ± 0.0073
recall_weighted: 0.7935 ± 0.0072
precision_weighted: 0.8071 ± 0.0118
