In [1]:
%load_ext kedro.ipython
import pandas as pd
from pycaret.classification import *
from sklearn.metrics import log_loss, f1_score

df_train = pd.read_parquet("../data/processed/base_train.parquet")
df_test = pd.read_parquet("../data/processed/base_test.parquet")

print("Treino:", df_train.info)
print("Teste: ", df_test.info)


Treino: <bound method DataFrame.info of            lat       lon  minutes_remaining  period  playoffs  shot_distance  \
0      34.0343 -118.1288                  1       2         0             14   
1      33.9723 -118.0808                  0       3         0             20   
2      33.8713 -118.1728                 11       3         0             19   
3      34.0443 -118.2698                 11       2         0              0   
4      33.8543 -118.2698                 11       3         0             19   
...        ...       ...                ...     ...       ...            ...   
16223  34.0523 -118.0718                  9       3         1             19   
16224  34.0443 -118.2698                  0       1         0              0   
16225  34.0523 -118.0838                  2       3         0             18   
16226  34.0403 -118.3618                  3       3         0              9   
16227  33.8573 -118.3958                  6       1         1             22   


In [2]:
import pandas as pd
import mlflow
import mlflow.sklearn
from pycaret.classification import setup, create_model, tune_model, predict_model, finalize_model, get_config
from sklearn.metrics import log_loss, f1_score
from imblearn.over_sampling import SMOTE

# 1. Carregar os dados
df_train = pd.read_parquet("../data/processed/base_train.parquet")
df_test  = pd.read_parquet("../data/processed/base_test.parquet")

X_test = df_test.drop("shot_made_flag", axis=1)
y_test = df_test["shot_made_flag"]

# 2. Iniciar run do MLflow
with mlflow.start_run(run_name="MelhorModeloComPrints"):

    setup(
    data=df_train,
    target="shot_made_flag",
    session_id=42,
    normalize=True,                     # Normaliza os dados
    transformation=True,               # Aplica transformações (log, raiz etc)
    fix_imbalance=True,                # Ativa o balanceamento
    fix_imbalance_method=SMOTE(),      # SMOTE personalizado
    remove_outliers=True,              # Remove outliers que atrapalham
    remove_multicollinearity=True,     # Remove variáveis colineares
    multicollinearity_threshold=0.95,  # Threshold de colinearidade
    fold=10,                           # Cross-validation com 10 folds
    fold_shuffle=True,                 # Shuffle nos folds
    html=False,
    verbose=False
)

    # 4. Treinar modelos
    print("🔧 Treinando modelo: Regressão Logística")
    lr_model = tune_model(create_model("lr"), optimize="F1")

    print("🔧 Treinando modelo: Árvore de Decisão")
    dt_model = tune_model(create_model("dt"), optimize="F1")

    # 5. Transformar X_test
    pipeline = get_config("pipeline")
    X_test_transformed = pipeline.transform(X_test)

    # 6. Avaliação Logística
    y_pred_lr = lr_model.predict(X_test_transformed)
    y_proba_lr = lr_model.predict_proba(X_test_transformed)
    f1_lr = f1_score(y_test, y_pred_lr)
    loss_lr = log_loss(y_test, y_proba_lr)

    # 7. Avaliação Árvore
    y_pred_dt = dt_model.predict(X_test_transformed)
    y_proba_dt = dt_model.predict_proba(X_test_transformed)
    f1_dt = f1_score(y_test, y_pred_dt)
    loss_dt = log_loss(y_test, y_proba_dt)

    # 8. Prints e comparações
    print(f"📊 Logística - F1 Score: {f1_lr:.4f} | Log Loss: {loss_lr:.4f}")
    print(f"📊 Árvore    - F1 Score: {f1_dt:.4f} | Log Loss: {loss_dt:.4f}")

    # 9. Log de métricas
    mlflow.log_metrics({
        "f1_score_logistica": f1_lr,
        "log_loss_logistica": loss_lr,
        "f1_score_arvore": f1_dt,
        "log_loss_arvore": loss_dt
    })

    # 10. Escolher modelo vencedor
    modelo_vencedor = "Logistica" if f1_lr > f1_dt else "Arvore"
    modelo_final = lr_model if modelo_vencedor == "Logistica" else dt_model

    mlflow.log_param("modelo_vencedor", modelo_vencedor)
    mlflow.sklearn.log_model(modelo_final, artifact_path="modelo_vencedor")

    print(f"✅ Modelo vencedor: {modelo_vencedor}")

# 11. Pronto para uso
print("🏁 Fim do processo. Você pode agora aplicar esse modelo na produção.")


🔧 Treinando modelo: Regressão Logística


                                                         

      Accuracy     AUC  Recall   Prec.      F1   Kappa     MCC
Fold                                                          
0       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
1       0.5889  0.6123  0.5074  0.5789  0.5408  0.1716  0.1728
2       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
3       0.5783  0.5881  0.4613  0.5721  0.5107  0.1477  0.1503
4       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
5       0.5546  0.5631  0.4613  0.5388  0.4970  0.1016  0.1026
6       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
7       0.5590  0.5945  0.4641  0.5455  0.5015  0.1107  0.1118
8       0.5722  0.6011  0.4807  0.5613  0.5179  0.1375  0.1388
9       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
Mean    0.2853  0.2959  0.2375  0.2797  0.2568  0.0669  0.0676
Std     0.2854  0.2961  0.2378  0.2799  0.2570  0.0693  0.0700


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


                                                         

Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
      Accuracy     AUC  Recall   Prec.      F1   Kappa     MCC
Fold                                                          
0       0.5810  0.6049  0.4557  0.5771  0.5093  0.1524  0.1556
1       0.5880  0.6126  0.5074  0.5777  0.5403  0.1699  0.1711
2       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
3       0.5766  0.5887  0.4576  0.5701  0.5077  0.1440  0.1467
4       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
5       0.5519  0.5641  0.4631  0.5352  0.4965  0.0967  0.0975
6       0.5889  0.6169  0.5092  0.5786  0.5417  0.1717  0.1729
7       0.5555  0.5936  0.4604  0.5411  0.4975  0.1036  0.1046
8       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
9       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
Mean    0.3442  0.3581  0.2853  0.3380  0.3093  0.0838  0.0848
Std     0.2813  0.2927  0.2337  

                                                         

      Accuracy     AUC  Recall   Prec.      F1   Kappa     MCC
Fold                                                          
0       0.5396  0.5188  0.6015  0.5150  0.5549  0.0840  0.0851
1       0.5528  0.5297  0.5886  0.5281  0.5567  0.1082  0.1089
2       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
3       0.5246  0.5092  0.5701  0.5016  0.5337  0.0530  0.0534
4       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
5       0.5158  0.4971  0.5720  0.4936  0.5299  0.0363  0.0368
6       0.5467  0.5203  0.5996  0.5217  0.5579  0.0973  0.0983
7       0.5370  0.5301  0.5838  0.5138  0.5466  0.0774  0.0781
8       0.5299  0.5039  0.5948  0.5071  0.5475  0.0649  0.0658
9       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
Mean    0.3746  0.3609  0.4110  0.3581  0.3827  0.0521  0.0526
Std     0.2455  0.2365  0.2693  0.2346  0.2507  0.0393  0.0396


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


                                                         

Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
      Accuracy     AUC  Recall   Prec.      F1   Kappa     MCC
Fold                                                          
0       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
1       0.4894  0.5023  0.7841  0.4786  0.5944  0.0045  0.0056
2       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
3       0.5185  0.5028  0.1605  0.4860  0.2413  0.0058  0.0077
4       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
5       0.5018  0.4854  0.1273  0.4259  0.1960 -0.0302 -0.0418
6       0.5167  0.5014  0.1661  0.4813  0.2469  0.0028  0.0037
7       0.5088  0.4929  0.1326  0.4528  0.2051 -0.0145 -0.0203
8       0.5123  0.4963  0.1326  0.4645  0.2063 -0.0076 -0.0107
9       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
Mean    0.3048  0.2981  0.1503  0.2789  0.1690 -0.0039 -0.0056
Std     0.2489  0.2435  0.2222  