In [1]:
# %pip install pandas
# %pip install scikit-learn
# %pip install pycaret
# %pip install mlflow

In [3]:
import pandas  as pd

from sklearn.model_selection import train_test_split
from pycaret.regression import *
from pycaret.classification import *
from pycaret.regression import RegressionExperiment
from sklearn.metrics import log_loss 
# from pycaret.regression import *

In [1]:
seed = 123

In [4]:
# Carrega o dataset e exibe os dados
raw_df = pd.read_csv('../Data/01_raw/kobe_dataset.csv', delimiter=',')
# raw_df = Catalog.load('raw_kobe_dataset')

print('Dados Originais. {} amostras e {} colunas.'.format(raw_df.shape[0], raw_df.shape[1]))

Dados Originais. 30697 amostras e 25 colunas.


In [12]:
list(raw_df.columns.values)

['action_type',
 'combined_shot_type',
 'game_event_id',
 'game_id',
 'lat',
 'loc_x',
 'loc_y',
 'lon',
 'minutes_remaining',
 'period',
 'playoffs',
 'season',
 'seconds_remaining',
 'shot_distance',
 'shot_made_flag',
 'shot_type',
 'shot_zone_area',
 'shot_zone_basic',
 'shot_zone_range',
 'team_id',
 'team_name',
 'game_date',
 'matchup',
 'opponent',
 'shot_id']

In [5]:
raw_df.isnull().sum()

action_type              0
combined_shot_type       0
game_event_id            0
game_id                  0
lat                      0
loc_x                    0
loc_y                    0
lon                      0
minutes_remaining        0
period                   0
playoffs                 0
season                   0
seconds_remaining        0
shot_distance            0
shot_made_flag        5000
shot_type                0
shot_zone_area           0
shot_zone_basic          0
shot_zone_range          0
team_id                  0
team_name                0
game_date                0
matchup                  0
opponent                 0
shot_id                  0
dtype: int64

In [5]:
conformed_df = raw_df.dropna()
print('Dados de trabalho. {} amostras e {} colunas.'.format(conformed_df.shape[0], conformed_df.shape[1]))

Dados de trabalho. 25697 amostras e 25 colunas.


In [6]:
conformed_df = conformed_df.query('shot_type == "2PT Field Goal"')
print('Dados de trabalho. {} amostras e {} colunas.'.format(conformed_df.shape[0], conformed_df.shape[1]))

Dados de trabalho. 20285 amostras e 25 colunas.


In [15]:
list(conformed_df['shot_made_flag'].value_counts().items())

[(0.0, 10602), (1.0, 9683)]

- [x] Observe que há dados faltantes na base de dados! As linhas que possuem dados faltantes devem ser desconsideradas
- [x] Você também irá filtrar os dados onde o valor de shot_type for igual à 2PT Field Goal.
- [x] Ainda, para esse exercício serão apenas consideradas as colunas: 
  - lat
  - lng
  - minutes_remaining
  - period
  - playoffs
  - shot_distance

In [18]:
X = conformed_df[[
    'lat',
    'lon',
    'minutes_remaining',
    'period',
    'playoffs',
    'shot_distance'
]];

y = conformed_df[['shot_made_flag']]

print('Dados de trabalho, {} amostras e {} colunas. Labels, {} amostras.'.format(X.shape[0], X.shape[1], y.shape[0]))

Dados de trabalho, 20285 amostras e 6 colunas. Labels, 20285 amostras.


- [x] Separe os dados em treino (80%) e teste (20 %) usando uma escolha aleatória e estratificada.

In [17]:
train, test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=seed, stratify=y)

- [x] Com os dados separados para treinamento, treine um modelo com regressão logística do sklearn usando a biblioteca pyCaret.
- [x] Com os dados separados para treinamento, treine um modelo de classificação do sklearn usando a biblioteca pyCaret.
- [x] Selecione agora os dados da base de dados original onde shot_type for igual à 3PT Field Goal (será uma nova base de dados)
- [] Através da biblioteca requests, aplique o modelo treinado. Pode usar o MLFlow Models?  

## Regressão

In [11]:
regression_xp = RegressionExperiment()
regression_xp.setup(data = train,
                    session_id = seed,
                    fold = 10,
                    test_data = test,
                    n_jobs = -2,
                    fold_strategy = 'stratifiedkfold',
                    experiment_name = 'regression_xp',
                    log_experiment='mlflow')
regression_xp.add_metric('logloss', 'Log Loss', log_loss, greater_is_better = False)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,shot_distance
2,Target type,Regression
3,Original data shape,"(20285, 6)"
4,Transformed data shape,"(20285, 6)"
5,Transformed train set shape,"(16228, 6)"
6,Transformed test set shape,"(4057, 6)"
7,Numeric features,5
8,Preprocess,True
9,Imputation type,simple


Name                                                       Log Loss
Display Name                                               Log Loss
Score Function            <function log_loss at 0x000001A7A9B32680>
Scorer               make_scorer(log_loss, greater_is_better=False)
Target                                                         pred
Args                                                             {}
Greater is Better                                             False
Custom                                                         True
Name: logloss, dtype: object

In [12]:
best_regression = regression_xp.compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,Log Loss,TT (Sec)
et,Extra Trees Regressor,0.1083,0.0876,0.2552,0.9985,0.0257,0.0145,0.0,0.318
rf,Random Forest Regressor,0.1136,0.0953,0.2721,0.9983,0.0281,0.0147,0.0,0.275
lightgbm,Light Gradient Boosting Machine,0.1936,0.1234,0.3231,0.9978,0.0388,0.0302,0.0,0.234
dt,Decision Tree Regressor,0.0988,0.1433,0.3556,0.9975,0.0384,0.0127,0.0,0.225
gbr,Gradient Boosting Regressor,0.3185,0.2489,0.4879,0.9956,0.0921,0.0571,0.0,0.231
knn,K Neighbors Regressor,0.6804,1.4377,1.1968,0.9748,0.144,0.0905,0.0,0.234
ada,AdaBoost Regressor,1.8169,4.9567,2.2241,0.913,0.7367,0.2314,0.0,0.229
lar,Least Angle Regression,4.1534,25.9665,5.0947,0.5444,0.9643,0.519,0.0,0.233
br,Bayesian Ridge,4.1535,25.9665,5.0947,0.5444,0.9644,0.519,0.0,0.222
lr,Linear Regression,4.1534,25.9666,5.0948,0.5444,0.9643,0.519,0.0,0.758


## Classificação

In [13]:
classification_xp = ClassificationExperiment()
classification_xp.setup(
    data = train,
    session_id = seed,
    fold = 10,
    test_data = test,
    n_jobs = -2,
    fold_strategy = 'stratifiedkfold',
    log_experiment='mlflow',
    experiment_name = 'classification_xp')
classification_xp.add_metric('logloss', 'Log Loss', log_loss, greater_is_better = False)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,shot_distance
2,Target type,Multiclass
3,Target mapping,"0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 26: 24, 50: 25"
4,Original data shape,"(20285, 6)"
5,Transformed data shape,"(20285, 6)"
6,Transformed train set shape,"(16228, 6)"
7,Transformed test set shape,"(4057, 6)"
8,Numeric features,5
9,Preprocess,True


Name                                                       Log Loss
Display Name                                               Log Loss
Score Function            <function log_loss at 0x000001A7A9B32680>
Scorer               make_scorer(log_loss, greater_is_better=False)
Target                                                         pred
Args                                                             {}
Greater is Better                                             False
Multiclass                                                     True
Custom                                                         True
Name: logloss, dtype: object

In [14]:
# exp.add_metric('f1_score_custom', 'Custom F1 Score', f1_score_custom)

In [15]:
best_classification = classification_xp.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Log Loss,TT (Sec)
dt,Decision Tree Classifier,0.9,0.1888,0.9,0.9011,0.8997,0.891,0.8911,0.0,0.268
rf,Random Forest Classifier,0.8331,0.1987,0.8331,0.8337,0.8315,0.818,0.8181,0.0,1.011
gbc,Gradient Boosting Classifier,0.8316,0.1984,0.8316,0.8347,0.8316,0.8165,0.8167,0.0,0.772
et,Extra Trees Classifier,0.7658,0.198,0.7658,0.7666,0.7645,0.7447,0.7448,0.0,2.706
knn,K Neighbors Classifier,0.4129,0.1667,0.4129,0.4007,0.4021,0.357,0.3573,0.0,0.386
nb,Naive Bayes,0.3892,0.1783,0.3892,0.4065,0.3738,0.3316,0.3349,0.0,0.258
ada,Ada Boost Classifier,0.2913,0.1385,0.2913,0.2017,0.2161,0.2181,0.2554,0.0,0.277
lda,Linear Discriminant Analysis,0.2788,0.1548,0.2788,0.1347,0.1712,0.1441,0.1625,0.0,0.252
lr,Logistic Regression,0.2706,0.1409,0.2706,0.1052,0.1461,0.1248,0.1482,0.0,0.31
ridge,Ridge Classifier,0.2683,0.0,0.2683,0.0962,0.1336,0.1115,0.1434,0.0,0.252


Verificar se o log loss 0 não é causado pela ausencia do createmodel do exp

In [16]:
# !mlflow ui

In [17]:
classification_xp.save_model(best_classification, 'best_classification_model');

Transformation Pipeline and Model Successfully Saved


In [18]:
# classification_loaded_model = load_model('classification_best_model_pipeline')

In [19]:
# classification_loaded_model = assign_model(classification_loaded_model, custom_metric = {'Log Loss': log_loss_custom, 'F1 Score': f1_score_custom})
# evaluate_model(lr)

---

In [20]:
three_pt_field_goal_df = conformed_df.query('shot_type == "3PT Field Goal"')