# Package & data

In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
import lightgbm as lgb

import joblib
import time



In [32]:
data = pd.read_csv("../data/processed/data_preprocessed.csv")
data.head()

Unnamed: 0,PLAYER_YEAR_START,PLAYER_GAME_AGE,PLAYER_HEIGHT,PLAYER_WEIGHT,PLAYER_BORN_YEAR,PLAYER_EXP,GAME_PERIOD,GAME_PERIODE_SECOND_REMAINGING,SHOT_DISTANCE,SHOT_MADE_FLAG,...,SHOT_ZONE_AREA_Center(C),SHOT_ZONE_AREA_Left_Side_Center(LC),SHOT_ZONE_AREA_Left_Side(L),SHOT_ZONE_AREA_Right_Side_Center(RC),SHOT_ZONE_AREA_Right_Side(R),SHOT_ZONE_RANGE_16-24_ft.,SHOT_ZONE_RANGE_24+_ft.,SHOT_ZONE_RANGE_8-16_ft.,SHOT_ZONE_RANGE_Back_Court_Shot,SHOT_ZONE_RANGE_Less_Than_8_ft.
0,-1.788159,0.235575,-0.766211,-0.682981,-1.140319,0.609749,1.336712,1.595207,0.3084,1,...,False,False,False,False,True,False,False,True,False,False
1,-1.788159,0.235575,-0.766211,-0.682981,-1.140319,0.609749,1.336712,1.088236,0.205792,0,...,False,False,False,False,True,False,False,True,False,False
2,-1.788159,0.235575,-0.766211,-0.682981,-1.140319,0.609749,1.336712,-0.772253,-0.204643,0,...,False,False,True,False,False,False,False,True,False,False
3,-1.788159,0.235575,-0.766211,-0.682981,-1.140319,0.609749,1.336712,-1.164438,-0.717686,0,...,True,False,False,False,False,False,False,False,False,True
4,-1.788159,0.235575,-0.766211,-0.682981,-1.140319,0.609749,1.336712,-1.451403,0.205792,0,...,False,False,False,False,True,False,False,True,False,False


In [33]:
## Ce code est à utiliser pour tester les codes suivant avec peu de ligne
data = data.sample(n = 5000, random_state=42)


## Entrainement de modèle

In [34]:

# Séparation features et target
features = data.drop('SHOT_MADE_FLAG', axis=1)
target = data['SHOT_MADE_FLAG']

# Division des matrices pour l'ensemble de données complet
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, stratify = target)

# Save the training and testing data
data_split = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test
}

joblib.dump(data_split, '../data/processed/data_split.joblib')


['../data/processed/data_split.joblib']

In [35]:
# Créer une fonction d'entrainement de model
def train_model(model, param_grid, X_train, y_train, X_test, y_test, model_name):
    start_time_search_params = time.time()
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    end_time_search_params = time.time()
    execution_time_search_params = end_time_search_params - start_time_search_params

    best_model = model.set_params(**best_params)
    start_time_training = time.time()
    best_model.fit(X_train, y_train)
    end_time_training = time.time()
    execution_time_training = end_time_training - start_time_training

    y_pred = best_model.predict(X_test)

    model_data = {
        'best_params': best_params,
        'best_model': best_model,
        'y_pred': y_pred,
        'confusion_matrix': confusion_matrix(y_test, y_pred),
        'classification_report': classification_report(y_test, y_pred),
        'execution_time_search_params': execution_time_search_params,
        'execution_time_training': execution_time_training,
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1_score': f1_score(y_test, y_pred),
        'auc': roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])
    }

    joblib.dump(model_data, f'../models/{model_name}.joblib')


In [None]:
# Logistic Regression
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'solver': ['liblinear', 'saga']
}
logistic_model = LogisticRegression()
train_model(logistic_model, param_grid, X_train, y_train, X_test, y_test, 'logistic_model')


In [42]:
# Decision tree
param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
decision_tree_model = DecisionTreeClassifier()
train_model(decision_tree_model, param_grid, X_train, y_train, X_test, y_test, 'decision_tree_model')


In [41]:
# Gradient Bossting
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 4],
    'subsample': [0.5, 0.6],
    'learning_rate': [0.02, 0.03]
}
gradient_boosting_model = GradientBoostingClassifier()
train_model(gradient_boosting_model, param_grid, X_train, y_train, X_test, y_test, 'gradient_boosting_model')


In [None]:
# xgboost
xgboost_model

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 4],
    'subsample': [0.5, 0.6],
    'learning_rate': [0.02, 0.03],
    'gamma': [0.1, 0.2],
    'colsample_bytree': [0.5, 0.6]
}
xgboost_model = XGBClassifier()
train_model(xgboost_model, param_grid, X_train, y_train, X_test, y_test, 'xgboost_model')


In [43]:
# Ada Boost
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.2],
    'algorithm': ['SAMME', 'SAMME.R']
}
adaboost_model = AdaBoostClassifier()
train_model(adaboost_model, param_grid, X_train, y_train, X_test, y_test, 'adaboost_model')

In [44]:
# LightGBM
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 4],
    'learning_rate': [0.02, 0.03],
    'subsample': [0.5, 0.6],
    'objective': ['binary'],
    'metric': ['binary_error']
}
lightgbm_model = lgb.LGBMClassifier()
train_model(lightgbm_model, param_grid, X_train, y_train, X_test, y_test, 'lightgbm_model')

In [None]:
# CatBoost

# catBoost

train_pool = Pool(data=X_train, label=y_train)
test_pool = Pool(data=X_test)

# Search hyperparameters
param_dist_catboost = {
    'iterations': [100, 200],
    'depth': [3, 4],
    'learning_rate': [0.02, 0.03],
    'subsample': [0.5, 0.6],
    'custom_metric': ['Accuracy', 'F1']
}

# Train the model on the training data with the best hyperparameters found
catboost = CatBoostClassifier()
random_search = RandomizedSearchCV(
    catboost, param_distributions=param_dist_catboost,
    cv=StratifiedKFold(n_splits=5), scoring='accuracy', n_jobs=-1,
    random_state=42, n_iter=10  # Ajustez n_iter au besoin
)
random_search.fit(train_pool)
best_params = random_search.best_params_

# Train the model on the training data with the best hyperparameters found
best_model = CatBoostClassifier(**best_params)
start_time_training = time.time()
best_model.fit(train_pool)
end_time_training = time.time()
execution_time_training = end_time_training - start_time_training

# Make predictions on the test data
y_pred = best_model.predict(test_pool)

# Save model and metrics
model_data = {
    'best_params': best_params,
    'best_model': best_model,
    'confusion_matrix': confusion_matrix(y_test, y_pred),
    'classification_report': classification_report(y_test, y_pred),
    'execution_time_training': execution_time_training,
    'accuracy': accuracy_score(y_test, y_pred),
    'precision': precision_score(y_test, y_pred),
    'recall': recall_score(y_test, y_pred),
    'f1_score': f1_score(y_test, y_pred),
    'auc': roc_auc_score(y_test, best_model.predict_proba(test_pool)[:, 1])
}
joblib.dump(model_data, '../models/catboost_model.joblib')
