In [None]:
# Importar Bibliotecas Necessárias
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, f1_score
import joblib
import mlflow
from pycaret.classification import *

# Definir função para divisão de dados
def split_data(data, test_size=0.2, target='shot_made_flag', random_state=42):
    """
    Split a dataset into train and test sets, stratified on the target.

    Parameters:
        data: pandas DataFrame holding the feature columns and the target column.
        test_size: forwarded to ``train_test_split`` (default 0.2).
        target: name of the label column; every other column is treated as a
            feature (default 'shot_made_flag').
        random_state: seed forwarded to ``train_test_split`` so the split is
            repeatable (default 42).

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        KeyError: if ``target`` is not a column of ``data``.
    """
    # Features: everything except the label column
    X = data.drop(target, axis=1)
    # Target: the label column on its own
    y = data[target]

    # Random split that keeps the class proportions of ``y`` in both partitions
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

# Definir função para salvar dados em formato Parquet
def save_data(X, y, file_path):
    """
    Save features and target side by side in a single Parquet file.

    Parameters:
        X: pandas DataFrame with the feature columns.
        y: pandas Series (or DataFrame) with the target, concatenated to ``X``
            column-wise.
        file_path: destination of the Parquet file. Its parent directory is
            created when missing; a bare file name is written to the current
            working directory.
    """
    # os.path.dirname() returns '' for a bare file name and os.makedirs('')
    # raises FileNotFoundError, so only create a directory when there is one.
    target_dir = os.path.dirname(file_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Put the features and the target in one table
    data = pd.concat([X, y], axis=1)

    # Write the table as Parquet without the index
    data.to_parquet(file_path, index=False)

# Run the whole training step inside one MLflow run. Using the run as a context
# manager ends it even when a step below raises, so a failed execution does not
# leave an active run behind for the next cell/script.
with mlflow.start_run(run_name="Treinamento"):
    # Load the pre-processed development dataset
    data_file = 'Proj_Kobe-Bryant-Shot-Selection/data/processed/dataset_kobe_dev.parquet'
    df_kobe = pd.read_parquet(data_file)

    # Split into train and test sets with the stratified helper defined above
    # (80/20, random_state=42), so both partitions keep the class proportions.
    X_train, X_test, y_train, y_test = split_data(df_kobe, test_size=0.2)

    # Persist both partitions as Parquet files
    save_data(X_train, y_train, "Proj_Kobe-Bryant-Shot-Selection/data/processed/base_train.parquet")
    save_data(X_test, y_test, "Proj_Kobe-Bryant-Shot-Selection/data/processed/base_test.parquet")

    # Fraction of the rows that actually ended up in the test set
    test_percentage = len(X_test) / len(df_kobe)

    # Record the split parameters and sizes in MLflow
    mlflow.log_param("test_percentage", test_percentage)
    mlflow.log_metric("train_dataset_size", len(X_train))
    mlflow.log_metric("test_dataset_size", len(X_test))

    # Initialise the PyCaret experiment on the training partition
    setup(data=pd.concat([X_train, y_train], axis=1), target='shot_made_flag')

    # Train a logistic regression model
    lr_model = create_model('lr')

    # Predicted probability of the positive class on the held-out test set.
    # NOTE(review): the estimator is called directly on the raw X_test, which
    # bypasses any preprocessing configured by setup(); PyCaret's
    # predict_model() would apply it -- confirm this is intended.
    lr_predictions_proba = lr_model.predict_proba(X_test)
    lr_predictions_proba_positive = lr_predictions_proba[:, 1]

    # Log loss of the logistic regression, computed from the probabilities
    log_loss_test = log_loss(y_test, lr_predictions_proba_positive)
    mlflow.log_metric("log_loss_test", log_loss_test)

    # Let PyCaret compare the candidate classifiers and return the best one
    classification_model = compare_models()

    # Hard class predictions of the selected model (used for the F1 score)
    classification_model_predictions = classification_model.predict(X_test)

    # Log loss must be computed from probabilities: feeding it hard 0/1 labels
    # only yields the clipped extreme penalty for every misclassified row.
    # Some estimators (e.g. the ridge classifier) expose no predict_proba; in
    # that case the metric is skipped rather than logged with a bogus value.
    if hasattr(classification_model, "predict_proba"):
        classification_model_proba_positive = classification_model.predict_proba(X_test)[:, 1]
        log_loss_classification = log_loss(y_test, classification_model_proba_positive)
        mlflow.log_metric("log_loss_classification", log_loss_classification)
    else:
        log_loss_classification = None

    # F1 score of the selected model on the test set
    f1_score_classification = f1_score(y_test, classification_model_predictions)
    mlflow.log_metric("f1_score_classification", f1_score_classification)

Unnamed: 0,Description,Value
0,Session id,6721
1,Target,shot_made_flag
2,Target type,Binary
3,Original data shape,"(16228, 7)"
4,Transformed data shape,"(16228, 7)"
5,Transformed train set shape,"(11359, 7)"
6,Transformed test set shape,"(4869, 7)"
7,Numeric features,6
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5801,0.5973,0.4649,0.574,0.5138,0.1514,0.154
1,0.5889,0.6061,0.5018,0.58,0.5381,0.1712,0.1727
2,0.5713,0.5934,0.5166,0.5545,0.5349,0.1382,0.1385
3,0.5819,0.5974,0.4852,0.573,0.5255,0.1563,0.158
4,0.5581,0.5728,0.4649,0.5431,0.501,0.1087,0.1098
5,0.5977,0.6268,0.5148,0.5899,0.5498,0.1892,0.1906
6,0.5731,0.5864,0.4908,0.56,0.5231,0.1397,0.1407
7,0.5757,0.5938,0.4567,0.5701,0.5072,0.1426,0.1453
8,0.5678,0.5912,0.5028,0.5526,0.5265,0.1306,0.1311
9,0.5824,0.5985,0.4723,0.5766,0.5193,0.1565,0.1589


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,0.5879,0.5947,0.3877,0.6076,0.4729,0.161,0.1722,1.912
gbc,Gradient Boosting Classifier,0.5867,0.5964,0.3986,0.6017,0.4791,0.1594,0.1689,4.851
lr,Logistic Regression,0.5777,0.5964,0.4871,0.5674,0.5239,0.1484,0.15,0.482
ridge,Ridge Classifier,0.5774,0.0,0.4889,0.5666,0.5247,0.1481,0.1495,0.18
lda,Linear Discriminant Analysis,0.5771,0.5969,0.4895,0.566,0.5248,0.1474,0.1488,0.339
nb,Naive Bayes,0.5758,0.5939,0.5074,0.5618,0.533,0.1462,0.1469,0.24
qda,Quadratic Discriminant Analysis,0.5734,0.5988,0.5308,0.5558,0.5428,0.1433,0.1436,0.28
rf,Random Forest Classifier,0.5524,0.5632,0.5199,0.5322,0.5258,0.1021,0.1022,7.892
et,Extra Trees Classifier,0.5489,0.5574,0.5356,0.527,0.531,0.0966,0.0966,6.179
knn,K Neighbors Classifier,0.5355,0.5446,0.4924,0.5141,0.5029,0.0674,0.0675,0.542


Processing:   0%|          | 0/61 [00:00<?, ?it/s]