In [14]:
%load_ext kedro.ipython
import pandas as pd
from pycaret.classification import *
from sklearn.metrics import log_loss, f1_score

df_train = pd.read_parquet("../data/processed/base_train.parquet")
df_test = pd.read_parquet("../data/processed/base_test.parquet")

print("Treino:", df_train.info)
print("Teste: ", df_test.info)


The kedro.ipython extension is already loaded. To reload it, use:
  %reload_ext kedro.ipython
Treino: <bound method DataFrame.info of            lat       lon  minutes_remaining  period  playoffs  shot_distance  \
7006   34.0343 -118.1288                  1       2         0             14   
20760  33.9723 -118.0808                  0       3         0             20   
15143  33.8713 -118.1728                 11       3         0             19   
10506  34.0443 -118.2698                 11       2         0              0   
21377  33.8543 -118.2698                 11       3         0             19   
...        ...       ...                ...     ...       ...            ...   
29740  34.0523 -118.0718                  9       3         1             19   
23272  34.0443 -118.2698                  0       1         0              0   
6023   34.0523 -118.0838                  2       3         0             18   
1104   34.0403 -118.3618                  3       3         0     

In [4]:
import pandas as pd
import mlflow
import mlflow.sklearn
from pycaret.classification import setup, create_model, tune_model, predict_model, finalize_model, get_config
from sklearn.metrics import log_loss, f1_score
from imblearn.over_sampling import SMOTE

# 1. Carregar os dados
df_train = pd.read_parquet("../data/processed/base_train.parquet")
df_test  = pd.read_parquet("../data/processed/base_test.parquet")

X_test = df_test.drop("shot_made_flag", axis=1)
y_test = df_test["shot_made_flag"]

# 2. Iniciar run do MLflow
with mlflow.start_run(run_name="MelhorModeloComPrints"):

    setup(
    data=df_train,
    target="shot_made_flag",
    session_id=42,
    normalize=True,                     # Normaliza os dados
    transformation=True,               # Aplica transformações (log, raiz etc)
    fix_imbalance=True,                # Ativa o balanceamento
    fix_imbalance_method=SMOTE(),      # SMOTE personalizado
    remove_outliers=True,              # Remove outliers que atrapalham
    remove_multicollinearity=True,     # Remove variáveis colineares
    multicollinearity_threshold=0.95,  # Threshold de colinearidade
    fold=10,                           # Cross-validation com 10 folds
    fold_shuffle=True,                 # Shuffle nos folds
    html=False,
    verbose=False
)

    # 4. Treinar modelos
    print("🔧 Treinando modelo: Regressão Logística")
    lr_model = tune_model(create_model("lr"), optimize="F1")

    print("🔧 Treinando modelo: Árvore de Decisão")
    dt_model = tune_model(create_model("dt"), optimize="F1")

    # 5. Transformar X_test
    pipeline = get_config("pipeline")
    X_test_transformed = pipeline.transform(X_test)

    # 6. Avaliação Logística
    y_pred_lr = lr_model.predict(X_test_transformed)
    y_proba_lr = lr_model.predict_proba(X_test_transformed)
    f1_lr = f1_score(y_test, y_pred_lr)
    loss_lr = log_loss(y_test, y_proba_lr)

    # 7. Avaliação Árvore
    y_pred_dt = dt_model.predict(X_test_transformed)
    y_proba_dt = dt_model.predict_proba(X_test_transformed)
    f1_dt = f1_score(y_test, y_pred_dt)
    loss_dt = log_loss(y_test, y_proba_dt)

    # 8. Prints e comparações
    print(f"📊 Logística - F1 Score: {f1_lr:.4f} | Log Loss: {loss_lr:.4f}")
    print(f"📊 Árvore    - F1 Score: {f1_dt:.4f} | Log Loss: {loss_dt:.4f}")

    # 9. Log de métricas
    mlflow.log_metrics({
        "f1_score_logistica": f1_lr,
        "log_loss_logistica": loss_lr,
        "f1_score_arvore": f1_dt,
        "log_loss_arvore": loss_dt
    })

    # 10. Escolher modelo vencedor
    modelo_vencedor = "Logistica" if f1_lr > f1_dt else "Arvore"
    modelo_final = lr_model if modelo_vencedor == "Logistica" else dt_model

    mlflow.log_param("modelo_vencedor", modelo_vencedor)
    mlflow.sklearn.log_model(modelo_final, artifact_path="modelo_vencedor")

    print(f"✅ Modelo vencedor: {modelo_vencedor}")

# 11. Pronto para uso
print("🏁 Fim do processo. Você pode agora aplicar esse modelo na produção.")


🔧 Treinando modelo: Regressão Logística


                                                         

      Accuracy     AUC  Recall   Prec.      F1   Kappa     MCC
Fold                                                          
0       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
1       0.5880  0.6124  0.5055  0.5781  0.5394  0.1698  0.1710
2       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
3       0.5757  0.5889  0.4557  0.5691  0.5061  0.1421  0.1448
4       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
5       0.5528  0.5640  0.4594  0.5366  0.4950  0.0981  0.0990
6       0.5915  0.6151  0.5074  0.5826  0.5424  0.1767  0.1781
7       0.5581  0.5939  0.4641  0.5443  0.5010  0.1089  0.1101
8       0.5757  0.6012  0.4825  0.5659  0.5209  0.1444  0.1459
9       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
Mean    0.3442  0.3575  0.2875  0.3377  0.3105  0.0840  0.0849
Std     0.2812  0.2922  0.2353  0.2760  0.2539  0.0721  0.0729


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


                                                         

      Accuracy     AUC  Recall   Prec.      F1   Kappa     MCC
Fold                                                          
0       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
1       0.5863  0.6124  0.5055  0.5756  0.5383  0.1664  0.1675
2       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
3       0.5757  0.5887  0.4539  0.5694  0.5051  0.1420  0.1448
4       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
5       0.5546  0.5644  0.4594  0.5390  0.4960  0.1015  0.1025
6       0.5924  0.6170  0.5129  0.5828  0.5456  0.1788  0.1800
7       0.5607  0.5943  0.4641  0.5478  0.5025  0.1141  0.1153
8       0.5722  0.6015  0.4788  0.5616  0.5169  0.1373  0.1388
9       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
Mean    0.3442  0.3578  0.2875  0.3376  0.3104  0.0840  0.0849
Std     0.2812  0.2925  0.2354  0.2759  0.2539  0.0717  0.0724
🔧 Treinando modelo: Árvore de Decisão


                                                         

      Accuracy     AUC  Recall   Prec.      F1   Kappa     MCC
Fold                                                          
0       0.5431  0.5203  0.6089  0.5181  0.5598  0.0913  0.0926
1       0.5572  0.5426  0.5627  0.5342  0.5481  0.1147  0.1148
2       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
3       0.5185  0.5019  0.5554  0.4959  0.5239  0.0400  0.0403
4       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
5       0.5414  0.5287  0.5793  0.5173  0.5466  0.0856  0.0862
6       0.5440  0.5177  0.6181  0.5186  0.5640  0.0937  0.0953
7       0.5185  0.5081  0.5691  0.4968  0.5305  0.0410  0.0414
8       0.5484  0.5255  0.6077  0.5238  0.5627  0.1011  0.1023
9       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
Mean    0.3771  0.3645  0.4101  0.3605  0.3835  0.0567  0.0573
Std     0.2471  0.2388  0.2692  0.2362  0.2514  0.0435  0.0439


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


                                                         

Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
      Accuracy     AUC  Recall   Prec.      F1   Kappa     MCC
Fold                                                          
0       0.5282  0.5122  0.1624  0.5176  0.2472  0.0251  0.0340
1       0.5158  0.5001  0.1568  0.4775  0.2361  0.0003  0.0004
2       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
3       0.5185  0.5028  0.1605  0.4860  0.2413  0.0058  0.0077
4       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
5       0.5018  0.4854  0.1273  0.4259  0.1960 -0.0302 -0.0418
6       0.5167  0.5014  0.1661  0.4813  0.2469  0.0028  0.0037
7       0.5088  0.4929  0.1326  0.4528  0.2051 -0.0145 -0.0203
8       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
9       0.5093  0.4932  0.1365  0.4540  0.2099 -0.0140 -0.0193
Mean    0.3599  0.3488  0.1042  0.3295  0.1583 -0.0025 -0.0036
Std     0.2357  0.2284  0.0694  



✅ Modelo vencedor: Arvore
🏁 Fim do processo. Você pode agora aplicar esse modelo na produção.
