In [47]:
# import libraries
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

import mlflow
from pprint import pprint
mlflow.xgboost.autolog(disable=True)


from helpers import convert_numerical_columns_to_float

In [24]:
# import data
data_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))+"/data/"
train = convert_numerical_columns_to_float(pd.read_csv(data_path+"nba_logreg_train.csv"))
train.head()

Unnamed: 0,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
0,71.0,10.1,3.2,1.3,3.2,39.6,0.1,0.5,21.9,0.6,0.7,83.7,0.1,0.9,0.9,1.7,0.5,0.1,0.7,1.0
1,78.0,27.8,8.5,3.6,7.4,48.5,0.5,1.6,32.8,0.8,1.1,74.1,1.2,5.0,6.2,1.2,1.0,1.4,0.7,1.0
2,52.0,15.4,4.5,1.8,3.6,49.5,0.0,0.0,0.0,1.0,1.4,70.3,1.2,2.1,3.3,0.3,0.2,0.6,1.0,1.0
3,82.0,19.6,7.4,3.1,7.3,41.9,0.0,0.3,4.3,1.2,1.5,80.6,0.6,1.4,2.0,3.9,1.0,0.3,1.7,1.0
4,75.0,33.5,11.5,4.4,9.5,45.9,0.1,0.2,35.7,2.7,3.8,70.1,2.5,4.9,7.4,1.0,0.8,1.4,1.4,1.0


In [3]:
# define features and target
X_train = train.drop('TARGET_5Yrs', axis=1)
y_train = train['TARGET_5Yrs']

In [4]:
# setting the experiment 
mlflow.set_tracking_uri("http://20.224.70.229:5000/")
mlflow.set_experiment("nba-investment-experiment")

<Experiment: artifact_location='mlflow-artifacts:/666877338244795257', creation_time=1673441340627, experiment_id='666877338244795257', last_update_time=1673441340627, lifecycle_stage='active', name='nba-investment-experiment', tags={}>

In [5]:
# initialize mlflow client
client = mlflow.tracking.MlflowClient()

# Search for experiments
experiments = client.search_experiments()

# Print the experiment ID and name for each experiment
for experiment in experiments:
    print(f"Experiment ID: {experiment.experiment_id}")
    print(f"Experiment name: {experiment.name}")

Experiment ID: 666877338244795257
Experiment name: nba-investment-experiment


In [6]:
# split the training data into train and validation sets using stratified sampling so that we can preserve the same distribution of the target variable in the validation set
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

In [7]:
# Convert the data to DMatrix objects
train_dmatrix = xgb.DMatrix(X_train, label=y_train)
validation_dmatrix = xgb.DMatrix(X_val, label=y_val)

# Define the objective function for the hyperparameter optimization
def objective(params):
    with mlflow.start_run():
        # Set the model and the search space in the run metadata
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)

        # Train the XGBoost model using the specified hyperparameters
        booster = xgb.train(
            params=params, # Hyperparameters
            dtrain=train_dmatrix, # Training data
            num_boost_round=1000, # Train for 1000 rounds
            evals=[(validation_dmatrix, 'validation')], # Evaluate on the validation data at each iteration of training 
            early_stopping_rounds=50 # Stop training if the validation score does not improve for 50 rounds
        )
        
        # Make predictions on the validation data
        y_pred = booster.predict(validation_dmatrix).round()
        
        # Calculate the evaluation scores
        accuracy = accuracy_score(y_val, y_pred)
        precision = precision_score(y_val, y_pred)
        recall = recall_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred)
        
        # Log the evaluation scores to MLFlow
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)

    return {'loss': 1 - f1, 'status': STATUS_OK} # Minimize the negative F1 score

# Define the search space for the hyperparameters
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 200, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'binary:logistic',
    'seed': 42
}


# Perform the hyperparameter optimization using the Tree Parzen Estimator algorithm
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

[0]	validation-logloss:0.64598                        
[1]	validation-logloss:0.61731                        
[2]	validation-logloss:0.59906                        
[3]	validation-logloss:0.58854                        
[4]	validation-logloss:0.58306                        
[5]	validation-logloss:0.58138                        
[6]	validation-logloss:0.57430                        
[7]	validation-logloss:0.56916                        
[8]	validation-logloss:0.57814                        
[9]	validation-logloss:0.57899                        
[10]	validation-logloss:0.58212                       
[11]	validation-logloss:0.58314                       
[12]	validation-logloss:0.58315                       
[13]	validation-logloss:0.58864                       
[14]	validation-logloss:0.59042                       
[15]	validation-logloss:0.59528                       
[16]	validation-logloss:0.59935                       
[17]	validation-logloss:0.60073                       
[18]	valid

In [8]:
best_result["max_depth"] = int(best_result["max_depth"])
best_result["objective"] = "binary:logistic"
best_result["seed"] = 42
best_result

{'learning_rate': 0.24241131195414645,
 'max_depth': 162,
 'min_child_weight': 6.887410436175841,
 'reg_alpha': 0.07379133674398361,
 'reg_lambda': 0.03281453396858762,
 'objective': 'binary:logistic',
 'seed': 42}

In [13]:
with mlflow.start_run():
    # Set the model and the search space in the run metadata
    mlflow.log_params(best_result)
    
    # Train the XGBoost model using the specified hyperparameters
    booster = xgb.train(
        params=best_result, # Hyperparameters
        dtrain=train_dmatrix, # Training data
        num_boost_round=1000, # Train for 1000 rounds
        evals=[(validation_dmatrix, 'validation')], # Evaluate on the validation data at each iteration of training
        early_stopping_rounds=50 # Stop training if the validation score does not improve for 50 rounds
    )
    
    # Make predictions on the validation data
    y_pred = booster.predict(validation_dmatrix).round()
    
    # Calculate the evaluation scores
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    
    # Log the evaluation scores to MLFlow
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)
        
    # mlflow.log_artifact("mymodel", artifact_path="model")  
    # mlflow.framework.log_model(model_object, artifact_path="model")  
    mlflow.xgboost.log_model(booster, "model")    

[0]	validation-logloss:0.65221
[1]	validation-logloss:0.62333
[2]	validation-logloss:0.60019
[3]	validation-logloss:0.59137
[4]	validation-logloss:0.58864
[5]	validation-logloss:0.58226
[6]	validation-logloss:0.58116
[7]	validation-logloss:0.58127
[8]	validation-logloss:0.58203
[9]	validation-logloss:0.58002
[10]	validation-logloss:0.58225
[11]	validation-logloss:0.58815
[12]	validation-logloss:0.58621
[13]	validation-logloss:0.58494
[14]	validation-logloss:0.58998
[15]	validation-logloss:0.58720
[16]	validation-logloss:0.59090
[17]	validation-logloss:0.58984
[18]	validation-logloss:0.59200
[19]	validation-logloss:0.59893
[20]	validation-logloss:0.59560
[21]	validation-logloss:0.60084
[22]	validation-logloss:0.60224
[23]	validation-logloss:0.59833
[24]	validation-logloss:0.59548
[25]	validation-logloss:0.59917
[26]	validation-logloss:0.60421
[27]	validation-logloss:0.60234
[28]	validation-logloss:0.60431
[29]	validation-logloss:0.59994
[30]	validation-logloss:0.59983
[31]	validation-lo



In [52]:
logged_model = 'runs:/25d40fe0311a4314b7bfa6cfd9669a8e/model'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

# convert the loaded model to xgboost model
xgb_model = mlflow.xgboost.load_model(logged_model)

In [66]:
def convert_run_id_logged_model(run_id):
    return "runs:/" + run_id + "/model"

def test_model(logged_model, X_test, y_test):
    """
    this function tests the model on the test data and prints the evaluation metrics in order to go to production
    :param name: name of the model
    :param stage: stage of the model
    :param X_test: test features
    :param y_test: test target
    :return: None 
    """
    loaded_model = mlflow.pyfunc.load_model(logged_model)
    y_test_pred = loaded_model.predict(X_test).round()
    
    return (accuracy_score(y_test, y_test_pred), precision_score(y_test, y_test_pred), recall_score(y_test, y_test_pred), f1_score(y_test, y_test_pred))

In [67]:
test_data = convert_numerical_columns_to_float(pd.read_csv("/Users/abdessamadbaahmed/Desktop/livrable_mp_data/data/nba_logreg_test.csv"))
X_test = test_data.drop("TARGET_5Yrs", axis=1)
y_test = test_data["TARGET_5Yrs"]
test_data.head()

Unnamed: 0,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
0,82.0,33.0,17.1,7.0,14.3,49.0,0.0,0.1,20.0,3.0,4.4,68.7,3.7,4.2,8.0,2.4,1.2,0.5,3.0,1.0
1,74.0,26.4,7.8,3.1,7.4,41.6,0.1,0.7,21.2,1.5,2.3,65.9,0.3,1.6,1.9,4.5,0.7,0.1,2.2,1.0
2,67.0,16.9,5.1,2.0,3.4,59.6,0.0,0.0,0.0,1.1,1.8,57.7,1.6,2.9,4.5,0.6,0.5,0.3,0.8,0.0
3,79.0,26.3,10.8,4.2,8.5,49.9,0.0,0.1,25.0,2.3,3.2,73.1,2.2,3.5,5.7,2.2,0.7,0.7,1.5,1.0
4,82.0,20.8,8.5,3.1,6.3,49.1,0.1,0.4,36.7,2.1,2.7,77.2,1.4,2.6,4.0,1.3,0.8,0.8,1.3,1.0


In [68]:
test_model(convert_run_id_logged_model("716a8294e086457c93c610eed77c5d76"), X_test, y_test)

(0.7611940298507462, 0.7865168539325843, 0.8433734939759037, 0.813953488372093)

In [46]:
for mv in client.search_model_versions("name='nba-investment-predictor'"):
    pprint(dict(mv), indent=4)

{   'creation_timestamp': 1673454359195,
    'current_stage': 'Staging',
    'description': '',
    'last_updated_timestamp': 1673454688911,
    'name': 'nba-investment-predictor',
    'run_id': '25d40fe0311a4314b7bfa6cfd9669a8e',
    'run_link': '',
    'source': 'mlflow-artifacts:/666877338244795257/25d40fe0311a4314b7bfa6cfd9669a8e/artifacts/model',
    'status': 'READY',
    'status_message': '',
    'tags': {},
    'user_id': '',
    'version': '1'}
{   'creation_timestamp': 1673460340691,
    'current_stage': 'None',
    'description': '',
    'last_updated_timestamp': 1673460340691,
    'name': 'nba-investment-predictor',
    'run_id': '716a8294e086457c93c610eed77c5d76',
    'run_link': '',
    'source': 'mlflow-artifacts:/666877338244795257/716a8294e086457c93c610eed77c5d76/artifacts/model',
    'status': 'READY',
    'status_message': '',
    'tags': {},
    'user_id': '',
    'version': '2'}


In [58]:
candidate_model_1 = mlflow.xgboost.load_model(convert_run_id_to_uri("25d40fe0311a4314b7bfa6cfd9669a8e"))
candidate_model_2 = mlflow.xgboost.load_model(convert_run_id_to_uri("716a8294e086457c93c610eed77c5d76"))

In [8]:
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import numpy as np


# Define the objective function for the hyperparameter optimization
def objective(params, X_train, y_train, k=5):

    X_train = X_train.values
    y_train = y_train.values

    # Convert the data to DMatrix objects
    train_dmatrix = xgb.DMatrix(X_train, label=y_train)

    with mlflow.start_run():
        # Set the model and the search space in the run metadata
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)

        # Create a KFold object for cross-validation
        kf = KFold(n_splits=k, shuffle=True, random_state=42)

        # Initialize the evaluation scores
        accuracy = []
        precision = []
        recall = []
        f1 = []

        # Iterate over the folds
        for train_index, val_index in kf.split(X_train):
            # Split the data into training and validation sets
            X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
            y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

            # Convert the data to DMatrix objects
            train_fold_dmatrix = xgb.DMatrix(X_train_fold, label=y_train_fold)
            val_fold_dmatrix = xgb.DMatrix(X_val_fold, label=y_val_fold)

            # Train the XGBoost model using the specified hyperparameters
            booster = xgb.train(
                params=params,
                dtrain=train_fold_dmatrix,
                num_boost_round=1000,
                evals=[(val_fold_dmatrix, 'validation')],
                early_stopping_rounds=50
            ) 
            
            # Make predictions on the validation data
            y_pred = booster.predict(val_fold_dmatrix).round()

            # Calculate the evaluation scores for the fold
            accuracy.append(accuracy_score(y_val_fold, y_pred))
            precision.append(precision_score(y_val_fold, y_pred))
            recall.append(recall_score(y_val_fold, y_pred))
            f1.append(f1_score(y_val_fold, y_pred))

        # Calculate the mean evaluation scores over all the folds
        mean_accuracy = np.mean(accuracy)
        mean_precision = np.mean(precision)
        mean_recall = np.mean(recall)
        mean_f1 = np.mean(f1)
        
        # Log the evaluation scores to MLFlow
        mlflow.log_metric("accuracy", mean_accuracy)
        mlflow.log_metric("precision", mean_precision)
        mlflow.log_metric("recall", mean_recall)
        mlflow.log_metric("f1_score", mean_f1)

    return {'loss': 1 - mean_f1, 'status': STATUS_OK}

In [9]:
# Define the search space for the hyperparameters
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'binary:logistic',
    'seed': 42
}

In [10]:
# Perform the hyperparameter optimization using the Tree Parzen Estimator algorithm
def objective_cv(params):
    return objective(k=5, X_train=X_train, y_train=y_train, params=params)

best_result = fmin(
    fn=objective_cv,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]