In [1]:
# Ensure project root is on sys.path for imports
import sys
import os
sys.path.insert(0, os.path.abspath('..'))
print('root on sys.path:', sys.path[0])

root on sys.path: g:\FIAP-Pos-data-analytics\Pos_Data_Analytics_Curso\Challenges_Fases\Arthur_challenge_4\fiap-tech4


In [19]:
# Libs
import pandas as pd

# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier


# local modules
from src.models.train_pipeline import train_model
from src.models.production_pipeline import load_model, preprocess_input, predict_from_input
from src.utils.validation import cross_validate_model
# optional: model_tuning may not be present in this workspace
try:
    from src.model_tuning import tune_model
except Exception:
    tune_model = None

## Extrair dataframe para modelo

In [3]:
df_model_final = pd.read_csv('../data/df_model_final.csv')
df_model_final

Unnamed: 0,hist_familiar_obes,cons_altas_cal_freq,cons_verduras,refeicoes_principais_dia,lancha_entre_ref_bin,fuma,agua_dia,controle_calorias,ativ_fisica_bin,uso_tecnologia,cons_alcool_bin,trasporte_bin,target_obesidade
0,1,0,2.0,3.0,0,0,2.0,0,0,1.0,0,1,0
1,1,0,3.0,3.0,0,1,3.0,1,1,0.0,0,1,0
2,1,0,2.0,3.0,0,0,2.0,0,1,1.0,1,1,0
3,0,0,3.0,3.0,0,0,2.0,0,1,0.0,1,0,0
4,0,0,2.0,1.0,0,0,2.0,0,0,0.0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,1,1,3.0,3.0,0,0,2.0,0,1,1.0,0,1,1
2107,1,1,3.0,3.0,0,0,2.0,0,0,1.0,0,1,1
2108,1,1,3.0,3.0,0,0,2.0,0,0,1.0,0,1,1
2109,1,1,3.0,3.0,0,0,3.0,0,0,1.0,0,1,1


In [4]:
df_model_final.columns

Index(['hist_familiar_obes', 'cons_altas_cal_freq', 'cons_verduras',
       'refeicoes_principais_dia', 'lancha_entre_ref_bin', 'fuma', 'agua_dia',
       'controle_calorias', 'ativ_fisica_bin', 'uso_tecnologia',
       'cons_alcool_bin', 'trasporte_bin', 'target_obesidade'],
      dtype='object')

In [5]:
X = df_model_final.drop('target_obesidade', axis=1)
y = df_model_final['target_obesidade']

# Testando os Modelos Machine Learning

In [None]:
# Trainoing a base Decision Tree model
train_model(
    X,
    y,
    model=DecisionTreeClassifier(),
    save_model=False,
    model_name='decision_tree_base_model',
    save_type='joblib'
)


==== TEST METRICS ====
Accuracy: 0.7728706624605678
F1: 0.7725315419733195
Recall: 0.7728706624605678
Precision: 0.786199043254886

Confusion Matrix:
 [[240 102]
 [ 42 250]]

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.70      0.77       342
           1       0.71      0.86      0.78       292

    accuracy                           0.77       634
   macro avg       0.78      0.78      0.77       634
weighted avg       0.79      0.77      0.77       634



(Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('num',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(strategy='median')),
                                                                   ('scaler',
                                                                    StandardScaler())]),
                                                   Index(['hist_familiar_obes', 'cons_altas_cal_freq', 'cons_verduras',
        'refeicoes_principais_dia', 'lancha_entre_ref_bin', 'fuma', 'agua_dia',
        'controle_calorias', 'ativ_fisica_bin', 'uso_tecnologia',
        'cons_alcool_bin', 'trasporte_bin'],
       dtype='object')),
                                                  ('cat',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(stra

In [None]:
# Hiper parametro com o tune_model function
param_dist_decision_tree = {
    "criterion": ["gini", "entropy"],
    "max_depth": [None, 10, 20, 30, 40, 50],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": [None, "sqrt", "log2"]
}

base_decision_tree = DecisionTreeClassifier(random_state=42)
best_decision_tree = tune_model(base_decision_tree, param_dist_decision_tree, X, y)

✅ Best CV score: 0.8086
✅ Best params:
  - model__min_samples_split: 2
  - model__min_samples_leaf: 2
  - model__max_features: None
  - model__max_depth: 20
  - model__criterion: entropy


In [None]:
# Treino do modelo Decision Tree com os melhores hiper parametros
train_model(
    X,
    y,
    model=best_decision_tree,
    save_model=False,
    model_name='decision_tree_tuned_model',
    save_type='joblib'
)


==== TEST METRICS ====
Accuracy: 0.7665615141955836
F1: 0.7662129736948006
Recall: 0.7665615141955836
Precision: 0.7797564205041217

Confusion Matrix:
 [[238 104]
 [ 44 248]]

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.70      0.76       342
           1       0.70      0.85      0.77       292

    accuracy                           0.77       634
   macro avg       0.77      0.77      0.77       634
weighted avg       0.78      0.77      0.77       634



(Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('num',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(strategy='median')),
                                                                   ('scaler',
                                                                    StandardScaler())]),
                                                   Index(['hist_familiar_obes', 'cons_altas_cal_freq', 'cons_verduras',
        'refeicoes_principais_dia', 'lancha_entre_ref_bin', 'fuma', 'agua_dia',
        'controle_calorias', 'ativ_fisica_bin', 'uso_tecnologia',
        'cons_alcool_bin', 'trasporte_bin'],
       dtype='object')),
                                                  ('cat',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(stra

In [None]:
# Cross-validation com o modelo Decision Tree
cross_validate_model(
                    X,
                    y,
                    model=DecisionTreeClassifier(),
                    cv=5,
                    stratified=True)

accuracy: 0.8001 ± 0.0087
f1_weighted: 0.7983 ± 0.0062
recall_weighted: 0.7987 ± 0.0068
precision_weighted: 0.8076 ± 0.0059


{'accuracy': array([0.79196217, 0.80805687, 0.81279621, 0.79620853, 0.79146919]),
 'f1_weighted': array([0.79439751, 0.80581815, 0.80594992, 0.79177858, 0.79380066]),
 'recall_weighted': array([0.79196217, 0.80805687, 0.8056872 , 0.79383886, 0.79383886]),
 'precision_weighted': array([0.80494305, 0.81412466, 0.81443396, 0.79914514, 0.80541804])}

In [None]:
# Treino do modelo XGBoost
train_model(
    X,
    y,
    model=XGBClassifier(),
    save_model=False,
    model_name='xgb_model',
    save_type='joblib'
)


==== TEST METRICS ====
Accuracy: 0.7917981072555205
F1: 0.7911635982604224
Recall: 0.7917981072555205
Precision: 0.8090651152199887

Confusion Matrix:
 [[242 100]
 [ 32 260]]

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.71      0.79       342
           1       0.72      0.89      0.80       292

    accuracy                           0.79       634
   macro avg       0.80      0.80      0.79       634
weighted avg       0.81      0.79      0.79       634



(Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('num',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(strategy='median')),
                                                                   ('scaler',
                                                                    StandardScaler())]),
                                                   Index(['hist_familiar_obes', 'cons_altas_cal_freq', 'cons_verduras',
        'refeicoes_principais_dia', 'lancha_entre_ref_bin', 'fuma', 'agua_dia',
        'controle_calorias', 'ativ_fisica_bin', 'uso_tecnologia'...
                                feature_types=None, feature_weights=None,
                                gamma=None, grow_policy=None,
                                importance_type=None,
                                interaction_constraints=None, learning_rate=None,
                   

In [None]:
# Hiper parametro com o tune_model function
param_dist_xgb = {
    "n_estimators": [200, 400, 600, 800, 1000, 1200],
    "learning_rate": [0.05, 0.1, 0.2, 0.3, 0.4, 0.5],
    "max_depth": [3, 4, 6, 8, 10],
    "subsample": [0.5, 0.7, 0.8, 1.0],
    "colsample_bytree": [0.5, 0.7, 0.8, 1.0],
}

base_xgb = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

best_xgb = tune_model(base_xgb, param_dist_xgb, X, y)

✅ Best CV score: 0.8134
✅ Best params:
  - model__subsample: 0.8
  - model__n_estimators: 200
  - model__max_depth: 6
  - model__learning_rate: 0.3
  - model__colsample_bytree: 0.7


In [None]:
# Treino do modelo XGBoost com os melhores hiper parametros
train_model(
    X,
    y,
    model=best_xgb,
    save_model=True,
    model_name='xgb_model',
    save_type='joblib'
)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



==== TEST METRICS ====
Accuracy: 0.7839116719242902
F1: 0.7832989140784172
Recall: 0.7839116719242902
Precision: 0.8004872854446239

Confusion Matrix:
 [[240 102]
 [ 35 257]]

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.70      0.78       342
           1       0.72      0.88      0.79       292

    accuracy                           0.78       634
   macro avg       0.79      0.79      0.78       634
weighted avg       0.80      0.78      0.78       634


✅ Modelo salvo em: xgb_model.joblib


(Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('num',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(strategy='median')),
                                                                   ('scaler',
                                                                    StandardScaler())]),
                                                   Index(['hist_familiar_obes', 'cons_altas_cal_freq', 'cons_verduras',
        'refeicoes_principais_dia', 'lancha_entre_ref_bin', 'fuma', 'agua_dia',
        'controle_calorias', 'ativ_fisica_bin', 'uso_tecnologia'...
                                feature_types=None, feature_weights=None,
                                gamma=None, grow_policy=None,
                                importance_type=None,
                                interaction_constraints=None, learning_rate=0.3,
                    

In [None]:
# Cross-validation com o modelo XGBoost
cross_validate_model(
                    X,
                    y,
                    model=XGBClassifier(),
                    cv=5,
                    stratified=True)

accuracy: 0.8096 ± 0.0066
f1_weighted: 0.8097 ± 0.0067
recall_weighted: 0.8096 ± 0.0066
precision_weighted: 0.8165 ± 0.0049


{'accuracy': array([0.80851064, 0.81516588, 0.81753555, 0.7985782 , 0.80805687]),
 'f1_weighted': array([0.80875692, 0.81534028, 0.81772999, 0.79861552, 0.80794369]),
 'recall_weighted': array([0.80851064, 0.81516588, 0.81753555, 0.7985782 , 0.80805687]),
 'precision_weighted': array([0.81378861, 0.82271252, 0.81837462, 0.80847652, 0.81903644])}

In [None]:
# Treino do modelo Random Forest
train_model(X,
            y,
            model=RandomForestClassifier(min_samples_split=10, n_estimators=100, random_state=42),
            save_model=False,
            model_name='random_forest_final',
            save_type='joblib')


==== TEST METRICS ====
Accuracy: 0.7917981072555205
F1: 0.7912507803978203
Recall: 0.7917981072555205
Precision: 0.8081389468457707

Confusion Matrix:
 [[243  99]
 [ 33 259]]

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.71      0.79       342
           1       0.72      0.89      0.80       292

    accuracy                           0.79       634
   macro avg       0.80      0.80      0.79       634
weighted avg       0.81      0.79      0.79       634



(Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('num',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(strategy='median')),
                                                                   ('scaler',
                                                                    StandardScaler())]),
                                                   Index(['hist_familiar_obes', 'cons_altas_cal_freq', 'cons_verduras',
        'refeicoes_principais_dia', 'lancha_entre_ref_bin', 'fuma', 'agua_dia',
        'controle_calorias', 'ativ_fisica_bin', 'uso_tecnologia',
        'cons_alcool_bin', 'trasporte_bin'],
       dtype='object')),
                                                  ('cat',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(stra

In [None]:
# Hiper parametro com o tune_model function
param_dist_random = {
    "n_estimators": [100, 200, 300, 400, 500],
    "max_depth": [None, 10, 20, 30, 40, 50],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True, False]
}

base_random = RandomForestClassifier(random_state=42)
best_random = tune_model(base_random, param_dist_random, X, y)

✅ Best CV score: 0.8119
✅ Best params:
  - model__n_estimators: 400
  - model__min_samples_split: 10
  - model__min_samples_leaf: 1
  - model__max_depth: 30
  - model__bootstrap: False


In [None]:
# Treino do modelo Random Forest com os melhores hiper parametros
train_model(X,
            y,
            model=best_random,
            save_model=False,
            model_name='random_forest_final',
            save_type='joblib')


==== TEST METRICS ====
Accuracy: 0.7902208201892744
F1: 0.7896259530834271
Recall: 0.7902208201892744
Precision: 0.8069762660035323

Confusion Matrix:
 [[242 100]
 [ 33 259]]

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.71      0.78       342
           1       0.72      0.89      0.80       292

    accuracy                           0.79       634
   macro avg       0.80      0.80      0.79       634
weighted avg       0.81      0.79      0.79       634



(Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('num',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(strategy='median')),
                                                                   ('scaler',
                                                                    StandardScaler())]),
                                                   Index(['hist_familiar_obes', 'cons_altas_cal_freq', 'cons_verduras',
        'refeicoes_principais_dia', 'lancha_entre_ref_bin', 'fuma', 'agua_dia',
        'controle_calorias', 'ativ_fisica_bin', 'uso_tecnologia',
        'cons_alcool_bin', 'trasporte_bin'],
       dtype='object')),
                                                  ('cat',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(stra

In [None]:
# Cross-validation com o modelo Random Forest
cross_validate_model(
                    X,
                    y,
                    model=RandomForestClassifier(),
                    cv=5,
                    stratified=True)

accuracy: 0.8081 ± 0.0079
f1_weighted: 0.8139 ± 0.0069
recall_weighted: 0.8110 ± 0.0040
precision_weighted: 0.8150 ± 0.0064


{'accuracy': array([0.80851064, 0.80805687, 0.82227488, 0.7985782 , 0.80331754]),
 'f1_weighted': array([0.81330349, 0.80832975, 0.82710505, 0.80829949, 0.81268583]),
 'recall_weighted': array([0.80851064, 0.81042654, 0.81753555, 0.81279621, 0.8056872 ]),
 'precision_weighted': array([0.81549317, 0.81539658, 0.82564873, 0.80555949, 0.81313308])}

# Testando Modelo em Produção

In [None]:
# Exemplo de input para predição
sample_input = {
    'Gender': 'Female',
    'Age': 26,
    'Height': 1.56,
    'Weight': 102.0,
    'family_history': 'yes', # Histórico familiar de obesidade
    'FAVC': 'yes', # Consumo de alimentos com alto valor calórico
    'FCVC': 1, # Frequência de consumo de vegetais
    'NCP': 3, # Número de refeições principais por dia
    'CAEC': 'Sometimes', # Consumo de alimentos entre as refeições
    'SMOKE': 'yes', 
    'CH2O': 1, # Consumo de água por dia (litros)
    'SCC': 'no', # Monitoramento do consumo calórico diaro
    'FAF': 0, # Frequência de atividade física
    'TUE': 2, # Tempo de uso de tecnologia por dia (horas)
    'CALC': 'Sometimes', # Consumo de bebidas alcoólicas
    'MTRANS': 'Public_Transportation' # Meio de transporte predominante
}


In [None]:
# Preprocessamento do input de amostra
res = preprocess_input(sample_input)

In [13]:
res

Unnamed: 0,hist_familiar_obes,cons_altas_cal_freq,cons_verduras,refeicoes_principais_dia,lancha_entre_ref_bin,fuma,agua_dia,controle_calorias,ativ_fisica_bin,uso_tecnologia,cons_alcool_bin,trasporte_bin
0,1,1,1,3,0,1,1,0,0,2,0,1


In [None]:
# Carregamento do modelo para predição
model = load_model('../src/models/random_forest_final.joblib')
print('model type:', type(model))
print('model:', model)

# Predição a partir do input de amostra
pred = predict_from_input(model, sample_input)
print('prediction:', pred)

model type: <class 'sklearn.pipeline.Pipeline'>
model: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['hist_familiar_obes', 'cons_altas_cal_freq', 'cons_verduras',
       'refeicoes_principais_dia', 'lancha_entre_ref_bin', 'fuma', 'agua_dia',
       'controle_calorias', 'ativ_fisica_bin', 'uso_tecnologia',
       'cons_alcool_bin', 'trasporte_bin'],
      dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                            

In [None]:
# Carregamento do modelo XGBoost para predição
model = load_model('xgb_model.joblib')

resultado = predict_from_input(model, sample_input)

print(resultado)

{'mensagem': '⚠️ Há indícios de que pode ter obesidade.', 'probabilidade': 'Probabilidade estimada: 71.53%'}
