In [13]:
# import libraries
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

import mlflow

In [14]:
# import data
data_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))+"/data/"
train = pd.read_csv(data_path+"nba_logreg_train.csv")
train.head()

Unnamed: 0,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
0,71,10.1,3.2,1.3,3.2,39.6,0.1,0.5,21.9,0.6,0.7,83.7,0.1,0.9,0.9,1.7,0.5,0.1,0.7,1
1,78,27.8,8.5,3.6,7.4,48.5,0.5,1.6,32.8,0.8,1.1,74.1,1.2,5.0,6.2,1.2,1.0,1.4,0.7,1
2,52,15.4,4.5,1.8,3.6,49.5,0.0,0.0,0.0,1.0,1.4,70.3,1.2,2.1,3.3,0.3,0.2,0.6,1.0,1
3,82,19.6,7.4,3.1,7.3,41.9,0.0,0.3,4.3,1.2,1.5,80.6,0.6,1.4,2.0,3.9,1.0,0.3,1.7,1
4,75,33.5,11.5,4.4,9.5,45.9,0.1,0.2,35.7,2.7,3.8,70.1,2.5,4.9,7.4,1.0,0.8,1.4,1.4,1


In [15]:
# define features and target
X_train = train.drop('TARGET_5Yrs', axis=1)
y_train = train['TARGET_5Yrs']

In [17]:
# setting the experiment 
mlflow.set_tracking_uri("http://20.224.70.229:5000/")
mlflow.set_experiment("nba-investment-experiment")

<Experiment: artifact_location='/mlflow_server/0', creation_time=None, experiment_id='0', last_update_time=None, lifecycle_stage='active', name='nba-investment-experiment', tags={}>

In [25]:
# initialize mlflow client
#client = mlflow.tracking.MlflowClient()

# Search for experiments
#experiments = client.search_experiments()

# Print the experiment ID and name for each experiment
#for experiment in experiments:
#    print(f"Experiment ID: {experiment.experiment_id}")
#    print(f"Experiment name: {experiment.name}")

In [26]:
# split the training data into train and validation sets using stratified sampling so that we can preserve the same distribution of the target variable in the validation set
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

In [27]:
# Convert the data to DMatrix objects
train_dmatrix = xgb.DMatrix(X_train, label=y_train)
validation_dmatrix = xgb.DMatrix(X_val, label=y_val)

# Define the objective function for the hyperparameter optimization
def objective(params):
    with mlflow.start_run():
        # Set the model and the search space in the run metadata
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)

        # Train the XGBoost model using the specified hyperparameters
        booster = xgb.train(
            params=params, # Hyperparameters
            dtrain=train_dmatrix, # Training data
            num_boost_round=1000, # Train for 1000 rounds
            evals=[(validation_dmatrix, 'validation')], # Evaluate on the validation data at each iteration of training 
            early_stopping_rounds=50 # Stop training if the validation score does not improve for 50 rounds
        )
        
        # Make predictions on the validation data
        y_pred = booster.predict(validation_dmatrix).round()
        
        # Calculate the evaluation scores
        accuracy = accuracy_score(y_val, y_pred)
        precision = precision_score(y_val, y_pred)
        recall = recall_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred)
        
        # Log the evaluation scores to MLFlow
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)

    return {'loss': 1 - f1, 'status': STATUS_OK} # Minimize the negative F1 score

# Define the search space for the hyperparameters
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 200, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'binary:logistic',
    'seed': 42
}


# Perform the hyperparameter optimization using the Tree Parzen Estimator algorithm
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

[0]	validation-logloss:0.66470                        
[1]	validation-logloss:0.67672                        
[2]	validation-logloss:0.66763                        
[3]	validation-logloss:0.67378                        
[4]	validation-logloss:0.69203                        
[5]	validation-logloss:0.70751                        
[6]	validation-logloss:0.72951                        
[7]	validation-logloss:0.72539                        
[8]	validation-logloss:0.73879                        
[9]	validation-logloss:0.73975                        
[10]	validation-logloss:0.74841                       
[11]	validation-logloss:0.74980                       
[12]	validation-logloss:0.74324                       
[13]	validation-logloss:0.74731                       
[14]	validation-logloss:0.76211                       
[15]	validation-logloss:0.76892                       
[16]	validation-logloss:0.77083                       
[17]	validation-logloss:0.77531                       
[18]	valid

In [28]:
best_result["max_depth"] = int(best_result["max_depth"])
best_result["objective"] = "binary:logistic"
best_result["seed"] = 42
best_result

{'learning_rate': 0.11689835591919019,
 'max_depth': 119,
 'min_child_weight': 9.139066258285764,
 'reg_alpha': 0.014911334344680532,
 'reg_lambda': 0.04417097771210607,
 'objective': 'binary:logistic',
 'seed': 42}

In [None]:
# Enable automatic logging to MLFlow
mlflow.xgboost.autolog()

with mlflow.start_run():
    booster = xgb.train(
        params=best_result, # Hyperparameters
        dtrain=train_dmatrix, # Training data
        num_boost_round=1000, # Train for 1000 rounds
        evals=[(validation_dmatrix, 'validation')], # Evaluate on the validation data at each iteration of training 
        early_stopping_rounds=50 # Stop training if the validation score does not improve for 50 rounds
    )

In [8]:
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import numpy as np


# Define the objective function for the hyperparameter optimization
def objective(params, X_train, y_train, k=5):

    X_train = X_train.values
    y_train = y_train.values

    # Convert the data to DMatrix objects
    train_dmatrix = xgb.DMatrix(X_train, label=y_train)

    with mlflow.start_run():
        # Set the model and the search space in the run metadata
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)

        # Create a KFold object for cross-validation
        kf = KFold(n_splits=k, shuffle=True, random_state=42)

        # Initialize the evaluation scores
        accuracy = []
        precision = []
        recall = []
        f1 = []

        # Iterate over the folds
        for train_index, val_index in kf.split(X_train):
            # Split the data into training and validation sets
            X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
            y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

            # Convert the data to DMatrix objects
            train_fold_dmatrix = xgb.DMatrix(X_train_fold, label=y_train_fold)
            val_fold_dmatrix = xgb.DMatrix(X_val_fold, label=y_val_fold)

            # Train the XGBoost model using the specified hyperparameters
            booster = xgb.train(
                params=params,
                dtrain=train_fold_dmatrix,
                num_boost_round=1000,
                evals=[(val_fold_dmatrix, 'validation')],
                early_stopping_rounds=50
            ) 
            
            # Make predictions on the validation data
            y_pred = booster.predict(val_fold_dmatrix).round()

            # Calculate the evaluation scores for the fold
            accuracy.append(accuracy_score(y_val_fold, y_pred))
            precision.append(precision_score(y_val_fold, y_pred))
            recall.append(recall_score(y_val_fold, y_pred))
            f1.append(f1_score(y_val_fold, y_pred))

        # Calculate the mean evaluation scores over all the folds
        mean_accuracy = np.mean(accuracy)
        mean_precision = np.mean(precision)
        mean_recall = np.mean(recall)
        mean_f1 = np.mean(f1)
        
        # Log the evaluation scores to MLFlow
        mlflow.log_metric("accuracy", mean_accuracy)
        mlflow.log_metric("precision", mean_precision)
        mlflow.log_metric("recall", mean_recall)
        mlflow.log_metric("f1_score", mean_f1)

    return {'loss': 1 - mean_f1, 'status': STATUS_OK}

In [9]:
# Define the search space for the hyperparameters
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'binary:logistic',
    'seed': 42
}

In [10]:
# Perform the hyperparameter optimization using the Tree Parzen Estimator algorithm
def objective_cv(params):
    return objective(k=5, X_train=X_train, y_train=y_train, params=params)

best_result = fmin(
    fn=objective_cv,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]