In [None]:
#cd Proj_Kobe-Bryant-Shot-Selection
#pip install -r requirements.txt

In [None]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime,
# since `google.colab` is only importable there).
from google.colab import drive
drive.mount('/content/drive')

# Importar Bibliotecas Necessárias:

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.metrics import f1_score
import mlflow
import mlflow.sklearn
from pycaret.classification import *


In [None]:
#print(mlflow.__version__)

In [None]:
# Start the MLflow run for this session.
# Close any run left open by a previous execution of this cell first:
# mlflow.start_run() raises when a run is already active, which made the
# cell fail when re-run in the same kernel. This mirrors the guard used by
# the later MLflow cells.
if mlflow.active_run():
    mlflow.end_run()

mlflow.start_run()

<ActiveRun: >

# Carregar os Dados Brutos do arquivo CSV

In [None]:
# Read the raw comma-separated shot log and preview its first five rows.
file_path = 'Proj_Kobe-Bryant-Shot-Selection/data/raw/data.csv'
df_kobe = pd.read_csv(file_path)
df_kobe.head(5)

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,minutes_remaining,period,...,shot_type,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id
0,Jump Shot,Jump Shot,10,20000012,33.9723,167,72,-118.1028,10,1,...,2PT Field Goal,Right Side(R),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,1
1,Jump Shot,Jump Shot,12,20000012,34.0443,-157,0,-118.4268,10,1,...,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,2
2,Jump Shot,Jump Shot,35,20000012,33.9093,-101,135,-118.3708,7,1,...,2PT Field Goal,Left Side Center(LC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,3
3,Jump Shot,Jump Shot,43,20000012,33.8693,138,175,-118.1318,6,1,...,2PT Field Goal,Right Side Center(RC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,4
4,Driving Dunk Shot,Dunk,155,20000012,34.0443,0,0,-118.2698,6,2,...,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,LAL @ POR,POR,5


In [None]:
# Report how many rows were loaded.
print("Número de linhas no DataFrame:", len(df_kobe.index))

Número de linhas no DataFrame: 30697


# 6. b. Pré-processamento dos Dados processed_data

In [None]:
# Columns retained for modelling: six numeric features plus the target.
columns_to_keep = ['lat', 'lon', 'minutes_remaining', 'period', 'playoffs', 'shot_distance', 'shot_made_flag']

# Select the two-point attempts and the retained columns in a single step.
is_two_pointer = df_kobe['shot_type'] == '2PT Field Goal'
df_kobe = df_kobe.loc[is_two_pointer, columns_to_keep]

def preprocess_data(raw_data):
    """Drop incomplete rows and cast the target column to integer.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Frame that contains a ``shot_made_flag`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame in which every row holding at least one missing value
        has been removed and ``shot_made_flag`` is stored as ``int``.
        ``raw_data`` itself is not modified.

    Raises
    ------
    KeyError
        If ``raw_data`` has no ``shot_made_flag`` column.
    """
    # dropna() removes every row with a missing value, including rows whose
    # shot_made_flag is missing, so no imputation of the target is needed
    # afterwards. The former chained `fillna(0, inplace=True)` was a no-op
    # and relied on a deprecated pandas pattern.
    # astype() returns a new frame, so the caller's data is left untouched.
    return raw_data.dropna().astype({'shot_made_flag': int})

# Clean the filtered frame, then report how many rows remain.
df_kobe = preprocess_data(df_kobe)

print("Número de linhas no DataFrame após o pré-processamento:", len(df_kobe.index))

Número de linhas no DataFrame após o pré-processamento: 20285


# 6. b. Transformação em Parquet e Dimensão resultante do dataset

In [None]:
# Persist the filtered, cleaned frame as a Parquet file.
output_file = 'Proj_Kobe-Bryant-Shot-Selection/data/processed/data_filtered.parquet'

# Create the target folder first (as save_data does for the train/test
# files); writing fails on a fresh checkout where data/processed is absent.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
df_kobe.to_parquet(output_file)

# Show the (rows, columns) shape of the frame that was written.
print("Dimensão do DataFrame filtrado:", df_kobe.shape)

Dimensão do DataFrame filtrado: (20285, 7)


# 6. c. Divisão dos Dados em Conjuntos de Treino (80%) e Teste (20%)

In [None]:
def split_data(data, test_size=0.2, target_col='shot_made_flag', random_state=42):
    """Split a frame into stratified train and test sets.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the feature columns and the target column.
    test_size : float, default 0.2
        Passed to ``train_test_split`` as ``test_size``.
    target_col : str, default 'shot_made_flag'
        Name of the target column; every other column is used as a feature.
    random_state : int, default 42
        Seed passed to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as returned by
        ``train_test_split``.
    """
    X = data.drop(columns=target_col)
    y = data[target_col]

    # Stratify on the target so both subsets keep its class proportions.
    return train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

if __name__ == "__main__":
    # Produce the train/test partitions from the cleaned frame (20% held out).
    X_train, X_test, y_train, y_test = split_data(df_kobe, test_size=0.2)

# 6. c. Armazenar os datasets resultantes em arquivos Parquet

In [None]:
def save_data(X, y, file_path):
    """Write features and target side by side into one Parquet file.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns.
    y : pandas.Series
        Target column; concatenated to the right of ``X``.
    file_path : str or os.PathLike
        Destination file. Missing parent folders are created.
    """
    # os.path.dirname() returns '' for a bare filename, and os.makedirs('')
    # raises FileNotFoundError, so only create a folder when there is one.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Column-wise concatenation aligns X and y on their shared index.
    data = pd.concat([X, y], axis=1)

    # The index is not written to the file.
    data.to_parquet(file_path, index=False)

if __name__ == "__main__":

    # Write the training partition, then the test partition, to Parquet.
    save_data(
        X=X_train,
        y=y_train,
        file_path="Proj_Kobe-Bryant-Shot-Selection/data/processed/base_train.parquet",
    )
    save_data(
        X=X_test,
        y=y_test,
        file_path="Proj_Kobe-Bryant-Shot-Selection/data/processed/base_test.parquet",
    )


# 6. d. Registrar os parâmetros (% de teste) e as métricas (tamanho de cada base) no MLflow

In [None]:
# Separate the label column from the model inputs.
target = df_kobe['shot_made_flag']
features = df_kobe.drop(columns='shot_made_flag')

if __name__ == "__main__":
    # Close any run that is still active before opening a new one.
    if mlflow.active_run():
        mlflow.end_run()

    # Use the run as a context manager so it is closed even if one of the
    # logging calls raises; a bare start_run()/end_run() pair would leave
    # the run active in that case.
    with mlflow.start_run():
        # Fraction of the cleaned dataset that ended up in the test split.
        test_percentage = len(X_test) / len(df_kobe)

        # Record the split proportion as a parameter and the size of each
        # partition as metrics.
        mlflow.log_param("test_percentage", test_percentage)
        mlflow.log_metric("train_dataset_size", len(X_train))
        mlflow.log_metric("test_dataset_size", len(X_test))


# 7. a, b, c, d. Implementar o pipeline de treinamento do modelo com o MLflow usando o nome "Treinamento"

In [None]:
# Close any run that is still active before opening the training run.
if mlflow.active_run():
    mlflow.end_run()

# The context manager ends the run even if training or scoring raises.
with mlflow.start_run(run_name="Treinamento"):
    # Split into train and test sets. Stratifying on the target with the same
    # test size and seed as split_data() reproduces the partitions that were
    # saved to base_train/base_test, instead of silently training on a
    # different, non-stratified split.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, stratify=target, random_state=42
    )

    # Configure PyCaret once for both models. A fixed session_id makes the
    # internal train/validation split and the cross-validation folds
    # repeatable; without it every execution used a new random session.
    setup(
        data=pd.concat([X_train, y_train], axis=1),
        target='shot_made_flag',
        session_id=42,
    )

    # --- Logistic regression ---
    lr_model = create_model('lr')

    # Column 1 of predict_proba is taken as the positive-class probability.
    lr_predictions_proba = lr_model.predict_proba(X_test)
    lr_predictions_proba_positive = lr_predictions_proba[:, 1]

    # Log loss must be computed from probabilities, not class labels.
    log_loss_test = log_loss(y_test, lr_predictions_proba_positive)
    mlflow.log_metric("log_loss_test", log_loss_test)

    # --- Best classifier found by PyCaret ---
    classification_model = compare_models()

    # Hard class predictions are the right input for the F1 score.
    classification_model_predictions = classification_model.predict(X_test)
    f1_score_classification = f1_score(y_test, classification_model_predictions)
    mlflow.log_metric("f1_score_classification", f1_score_classification)

    # Log loss was previously computed from the hard 0/1 predictions, which
    # does not measure probabilistic quality. Use predicted probabilities
    # when the selected estimator exposes them.
    if hasattr(classification_model, "predict_proba"):
        classification_model_proba_positive = classification_model.predict_proba(X_test)[:, 1]
        log_loss_classification = log_loss(y_test, classification_model_proba_positive)
        mlflow.log_metric("log_loss_classification", log_loss_classification)
    else:
        # The selected estimator has no predict_proba, so a meaningful log
        # loss cannot be computed; the metric is not logged.
        log_loss_classification = None


Unnamed: 0,Description,Value
0,Session id,3150
1,Target,shot_made_flag
2,Target type,Binary
3,Original data shape,"(16228, 7)"
4,Transformed data shape,"(16228, 7)"
5,Transformed train set shape,"(11359, 7)"
6,Transformed test set shape,"(4869, 7)"
7,Numeric features,6
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5827,0.6031,0.4613,0.5787,0.5133,0.1562,0.1593
1,0.6048,0.6315,0.5424,0.5939,0.567,0.2048,0.2055
2,0.5977,0.6264,0.5055,0.5918,0.5453,0.1886,0.1904
3,0.588,0.5957,0.5055,0.5781,0.5394,0.1698,0.171
4,0.5704,0.5944,0.4834,0.5574,0.5178,0.134,0.1351
5,0.5537,0.584,0.4576,0.538,0.4945,0.0996,0.1007
6,0.5792,0.5814,0.4788,0.5714,0.521,0.151,0.1529
7,0.5766,0.584,0.5083,0.5633,0.5344,0.148,0.1487
8,0.5871,0.5933,0.4862,0.5815,0.5296,0.1669,0.1691
9,0.5568,0.5778,0.4502,0.5434,0.4924,0.1053,0.1067


Unnamed: 0,Description,Value
0,Session id,7086
1,Target,shot_made_flag
2,Target type,Binary
3,Original data shape,"(16228, 7)"
4,Transformed data shape,"(16228, 7)"
5,Transformed train set shape,"(11359, 7)"
6,Transformed test set shape,"(4869, 7)"
7,Numeric features,6
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,0.5905,0.5915,0.3624,0.622,0.4578,0.1643,0.1799,1.909
gbc,Gradient Boosting Classifier,0.5898,0.5909,0.3806,0.6142,0.4697,0.1643,0.1771,4.272
lr,Logistic Regression,0.5773,0.5972,0.4805,0.5683,0.5206,0.1473,0.1491,0.452
ridge,Ridge Classifier,0.5765,0.0,0.4824,0.5667,0.521,0.1458,0.1474,0.19
lda,Linear Discriminant Analysis,0.5761,0.5974,0.482,0.5663,0.5206,0.145,0.1467,0.251
nb,Naive Bayes,0.5755,0.5952,0.5014,0.5626,0.5301,0.1453,0.1462,0.199
qda,Quadratic Discriminant Analysis,0.5741,0.5961,0.5115,0.5594,0.5342,0.1433,0.1439,0.197
lightgbm,Light Gradient Boosting Machine,0.5729,0.5887,0.4588,0.5651,0.5063,0.1371,0.1395,642.119
rf,Random Forest Classifier,0.5588,0.5703,0.5123,0.5401,0.5256,0.1138,0.114,11.713
et,Extra Trees Classifier,0.5534,0.5639,0.5257,0.5326,0.529,0.1045,0.1045,5.799
