In [14]:
import pandas as pd
import mlflow
import mlflow.sklearn
from pycaret.classification import setup, create_model, tune_model, predict_model, finalize_model, get_config
from sklearn.metrics import log_loss, f1_score
from imblearn.over_sampling import SMOTE

# 1. Load the training and test datasets from the processed-data folder
df_train = pd.read_parquet("../data/processed/base_train.parquet")
df_test = pd.read_parquet("../data/processed/base_test.parquet")

# Separate the test frame into its target column and the remaining features
y_test = df_test["shot_made_flag"]
X_test = df_test.drop(columns=["shot_made_flag"])

# 2. Train, evaluate and compare two classifiers inside a single MLflow run
def _evaluate_on_holdout(model, features, target):
    """Score a fitted classifier on hold-out data.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        features: Feature matrix in the form the estimator expects.
        target: True labels aligned row-by-row with ``features``.

    Returns:
        Tuple ``(predictions, probabilities, f1, loss)`` where ``f1`` is the
        F1 score of ``predictions`` and ``loss`` is the log loss of
        ``probabilities``, both measured against ``target``.
    """
    predictions = model.predict(features)
    probabilities = model.predict_proba(features)
    return (
        predictions,
        probabilities,
        f1_score(target, predictions),
        log_loss(target, probabilities),
    )


with mlflow.start_run(run_name="MelhorModeloComPrints"):

    # 3. Configure the PyCaret experiment
    # NOTE(review): the recorded output of this cell shows all-zero metrics in
    # several CV folds, which suggests those fold fits are failing and being
    # scored as 0 -- worth investigating the preprocessing options below.
    setup(
        data=df_train,
        target="shot_made_flag",
        session_id=42,
        normalize=True,                     # Scale the features
        transformation=True,                # Apply a power transformation
        fix_imbalance=True,                 # Enable class re-balancing
        fix_imbalance_method=SMOTE(),       # Use an explicit SMOTE instance
        remove_outliers=True,               # Drop outliers
        remove_multicollinearity=True,      # Drop collinear features
        multicollinearity_threshold=0.95,   # Collinearity cut-off
        fold=10,                            # 10-fold cross-validation
        fold_shuffle=True,                  # Shuffle rows before folding
        html=False,
        verbose=False,
    )

    # 4. Train and tune both candidate models, optimising for F1
    print("🔧 Treinando modelo: Regressão Logística")
    lr_model = tune_model(create_model("lr"), optimize="F1")

    print("🔧 Treinando modelo: Árvore de Decisão")
    dt_model = tune_model(create_model("dt"), optimize="F1")

    # 5. Run the test features through the preprocessing pipeline that
    #    PyCaret stored in its config, so both models are scored on the
    #    same transformed matrix
    pipeline = get_config("pipeline")
    X_test_transformed = pipeline.transform(X_test)

    # 6. Evaluate the logistic regression on the test set
    y_pred_lr, y_proba_lr, f1_lr, loss_lr = _evaluate_on_holdout(
        lr_model, X_test_transformed, y_test
    )

    # 7. Evaluate the decision tree on the test set
    y_pred_dt, y_proba_dt, f1_dt, loss_dt = _evaluate_on_holdout(
        dt_model, X_test_transformed, y_test
    )

    # 8. Print the side-by-side comparison
    print(f"📊 Logística - F1 Score: {f1_lr:.4f} | Log Loss: {loss_lr:.4f}")
    print(f"📊 Árvore    - F1 Score: {f1_dt:.4f} | Log Loss: {loss_dt:.4f}")

    # 9. Log the test-set metrics to the active run
    mlflow.log_metrics({
        "f1_score_arvore": f1_dt,
        "f1_score_logistica": f1_lr,
        "log_loss_logistica": loss_lr,
        "log_loss_arvore": loss_dt,
    })

    # 10. Pick the winner with a composite score that rewards F1 (weight 0.7)
    #     and penalises log loss (weight 0.3). This is done while the run is
    #     still open so the decision is recorded alongside the raw metrics.
    score_lr = (0.7 * f1_lr) - (0.3 * loss_lr)
    score_dt = (0.7 * f1_dt) - (0.3 * loss_dt)

    # Strict comparison: on a tie the decision tree is selected.
    if score_lr > score_dt:
        modelo_vencedor = "Logistica"
        modelo_final = lr_model
    else:
        modelo_vencedor = "Arvore"
        modelo_final = dt_model

    mlflow.log_metrics({
        "score_logistica": score_lr,
        "score_arvore": score_dt,
    })
    mlflow.set_tag("modelo_vencedor", modelo_vencedor)

print(f"✅ Modelo vencedor: {modelo_vencedor}")



🔧 Treinando modelo: Regressão Logística


                                                         

      Accuracy     AUC  Recall   Prec.      F1   Kappa     MCC
Fold                                                          
0       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
1       0.5863  0.6132  0.5074  0.5753  0.5392  0.1665  0.1676
2       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
3       0.5757  0.5884  0.4557  0.5691  0.5061  0.1421  0.1448
4       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
5       0.5537  0.5641  0.4594  0.5378  0.4955  0.0998  0.1008
6       0.5951  0.6160  0.5111  0.5869  0.5464  0.1838  0.1853
7       0.5572  0.5943  0.4622  0.5433  0.4995  0.1071  0.1082
8       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
9       0.5947  0.6212  0.5332  0.5827  0.5568  0.1848  0.1854
Mean    0.3463  0.3597  0.2929  0.3395  0.3144  0.0884  0.0892
Std     0.2830  0.2941  0.2403  0.2776  0.2574  0.0769  0.0775


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


                                                         

Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
      Accuracy     AUC  Recall   Prec.      F1   Kappa     MCC
Fold                                                          
0       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
1       0.5836  0.6121  0.5000  0.5729  0.5340  0.1608  0.1620
2       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
3       0.5748  0.5892  0.4539  0.5681  0.5046  0.1403  0.1430
4       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
5       0.5537  0.5646  0.4594  0.5378  0.4955  0.0998  0.1008
6       0.5907  0.6148  0.5111  0.5807  0.5437  0.1753  0.1765
7       0.5590  0.5948  0.4622  0.5457  0.5005  0.1105  0.1117
8       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
9       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
Mean    0.2862  0.2976  0.2387  0.2805  0.2578  0.0687  0.0694
Std     0.2864  0.2978  0.2392  

                                                         

      Accuracy     AUC  Recall   Prec.      F1   Kappa     MCC
Fold                                                          
0       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
1       0.5502  0.5248  0.5775  0.5261  0.5506  0.1023  0.1028
2       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
3       0.5423  0.5297  0.5683  0.5185  0.5423  0.0864  0.0868
4       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
5       0.5396  0.5240  0.5923  0.5152  0.5511  0.0833  0.0841
6       0.5660  0.5424  0.6494  0.5374  0.5881  0.1381  0.1409
7       0.5440  0.5386  0.6059  0.5197  0.5595  0.0926  0.0938
8       0.5423  0.5180  0.6335  0.5173  0.5695  0.0913  0.0935
9       0.5322  0.5053  0.6052  0.5085  0.5527  0.0700  0.0712
Mean    0.3816  0.3683  0.4232  0.3643  0.3914  0.0664  0.0673
Std     0.2500  0.2413  0.2780  0.2386  0.2565  0.0465  0.0472


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


                                                         

Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
      Accuracy     AUC  Recall   Prec.      F1   Kappa     MCC
Fold                                                          
0       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
1       0.5062  0.4952  0.2565  0.4680  0.3313 -0.0097 -0.0108
2       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
3       0.5185  0.5028  0.1605  0.4860  0.2413  0.0058  0.0077
4       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
5       0.5018  0.4854  0.1273  0.4259  0.1960 -0.0302 -0.0418
6       0.5167  0.5014  0.1661  0.4813  0.2469  0.0028  0.0037
7       0.5088  0.4929  0.1326  0.4528  0.2051 -0.0145 -0.0203
8       0.5123  0.4963  0.1326  0.4645  0.2063 -0.0076 -0.0107
9       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
Mean    0.3064  0.2974  0.0976  0.2779  0.1427 -0.0053 -0.0072
Std     0.2502  0.2429  0.0868  