In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import mlflow

In [2]:
# import data
train = pd.read_csv('/Users/abdessamadbaahmed/Desktop/livrable_mp_data/data/nba_logreg_train.csv')

In [3]:
# define features and target
X_train = train.drop('TARGET_5Yrs', axis=1)
y_train = train['TARGET_5Yrs']

In [4]:
# setting the experiment 
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("nba-investment-experiment")

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1673098660203, experiment_id='1', last_update_time=1673098660203, lifecycle_stage='active', name='nba-investment-experiment', tags={}>

In [5]:
# initialize mlflow client
client = mlflow.tracking.MlflowClient()

# Search for experiments
experiments = client.search_experiments()

# Print the experiment ID and name for each experiment
for experiment in experiments:
    print(f"Experiment ID: {experiment.experiment_id}")
    print(f"Experiment name: {experiment.name}")

Experiment ID: 1
Experiment name: nba-investment-experiment
Experiment ID: 0
Experiment name: Default


In [6]:
# split the training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [31]:
train_dmatrix = xgb.DMatrix(X_train.values, label=y_train.values)
validation_dmatrix = xgb.DMatrix(X_val.values, label=y_val.values)

booster = xgb.train(
    params={'objective': 'binary:logistic', 'max_depth': 3, 'eta': 0.1, 'eval_metric': 'auc'},
    dtrain=train_dmatrix,
    num_boost_round=100,
    evals=[(validation_dmatrix, 'validation')],
    early_stopping_rounds=50
)

y_pred = booster.predict(validation_dmatrix)
y_pred

[0]	validation-auc:0.75891
[1]	validation-auc:0.76027
[2]	validation-auc:0.76095
[3]	validation-auc:0.76919
[4]	validation-auc:0.76516
[5]	validation-auc:0.77371
[6]	validation-auc:0.77186
[7]	validation-auc:0.77095
[8]	validation-auc:0.77335
[9]	validation-auc:0.77127
[10]	validation-auc:0.77181
[11]	validation-auc:0.77249
[12]	validation-auc:0.77127
[13]	validation-auc:0.77249
[14]	validation-auc:0.77439
[15]	validation-auc:0.77548
[16]	validation-auc:0.77538
[17]	validation-auc:0.77439
[18]	validation-auc:0.77489
[19]	validation-auc:0.77584
[20]	validation-auc:0.77520
[21]	validation-auc:0.77489
[22]	validation-auc:0.77330
[23]	validation-auc:0.77294
[24]	validation-auc:0.77308
[25]	validation-auc:0.77462
[26]	validation-auc:0.77439
[27]	validation-auc:0.77466
[28]	validation-auc:0.77430
[29]	validation-auc:0.77367
[30]	validation-auc:0.77303
[31]	validation-auc:0.77258
[32]	validation-auc:0.77186
[33]	validation-auc:0.77050
[34]	validation-auc:0.77095
[35]	validation-auc:0.77213
[3

array([0.6587403 , 0.88995385, 0.8771996 , 0.56598544, 0.8176856 ,
       0.36133036, 0.778214  , 0.8669575 , 0.90141743, 0.42818603,
       0.38190892, 0.752019  , 0.41791117, 0.65759164, 0.44974366,
       0.3379812 , 0.5875582 , 0.8243411 , 0.17500621, 0.5088726 ,
       0.7588978 , 0.3474537 , 0.49707028, 0.894729  , 0.5728275 ,
       0.6880671 , 0.2620154 , 0.55642146, 0.1722902 , 0.8397905 ,
       0.5068802 , 0.8218446 , 0.608461  , 0.51756614, 0.95262945,
       0.6138637 , 0.14825171, 0.89077336, 0.5138876 , 0.14094484,
       0.47181126, 0.5119167 , 0.4079249 , 0.6718387 , 0.50097567,
       0.33803588, 0.93373823, 0.7255758 , 0.87639725, 0.5692476 ,
       0.15768045, 0.7645403 , 0.19225061, 0.4712717 , 0.52931446,
       0.58459157, 0.4223724 , 0.8212817 , 0.45656016, 0.9427872 ,
       0.09107975, 0.33505857, 0.6812838 , 0.36603662, 0.8380742 ,
       0.84048355, 0.6812838 , 0.5806622 , 0.5852334 , 0.86039513,
       0.6334925 , 0.39794794, 0.1613596 , 0.8422426 , 0.57651

In [34]:
y_pred.round()

array([1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 1., 0., 1., 0., 0., 1.,
       1., 0., 1., 1., 0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1.,
       1., 1., 0., 1., 1., 0., 0., 1., 0., 1., 1., 0., 1., 1., 1., 1., 0.,
       1., 0., 0., 1., 1., 0., 1., 0., 1., 0., 0., 1., 0., 1., 1., 1., 1.,
       1., 1., 1., 0., 0., 1., 1., 1., 0., 1., 1., 1., 0., 0., 0., 1., 1.,
       0., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 0., 1.,
       0., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0.,
       1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 0., 0., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1., 1.,
       1., 1., 1., 1., 1., 1., 0., 1., 0., 0., 1., 1., 1., 1., 1., 1., 0.,
       1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1.], dtype=float32)

In [38]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

# Convert the data to DMatrix objects
train_dmatrix = xgb.DMatrix(X_train, label=y_train)
validation_dmatrix = xgb.DMatrix(X_val, label=y_val)

# Define the objective function for the hyperparameter optimization
def objective(params):
    with mlflow.start_run():
        # Set the model and the search space in the run metadata
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)

        # Train the XGBoost model using the specified hyperparameters
        booster = xgb.train(
            params=params,
            dtrain=train_dmatrix,
            num_boost_round=1000,
            evals=[(validation_dmatrix, 'validation')],
            early_stopping_rounds=50
        )
        
        # Make predictions on the validation data
        y_pred = booster.predict(validation_dmatrix).round()
        
        # Calculate the evaluation scores
        accuracy = accuracy_score(y_val, y_pred)
        precision = precision_score(y_val, y_pred)
        recall = recall_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred)
        
        # Log the evaluation scores to MLFlow
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)

    return {'loss': 1 - f1, 'status': STATUS_OK}

# Define the search space for the hyperparameters
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'binary:logistic',
    'seed': 42
}

# Perform the hyperparameter optimization using the Tree Parzen Estimator algorithm
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

[0]	validation-logloss:0.67433                        
[1]	validation-logloss:0.65866                        
[2]	validation-logloss:0.64560                        
[3]	validation-logloss:0.63334                        
[4]	validation-logloss:0.62238                        
[5]	validation-logloss:0.61274                        
[6]	validation-logloss:0.60535                        
[7]	validation-logloss:0.59812                        
[8]	validation-logloss:0.59143                        
[9]	validation-logloss:0.58661                        
[10]	validation-logloss:0.58158                       
[11]	validation-logloss:0.57665                       
[12]	validation-logloss:0.57201                       
[13]	validation-logloss:0.56830                       
[14]	validation-logloss:0.56600                       
[15]	validation-logloss:0.56277                       
[16]	validation-logloss:0.56104                       
[17]	validation-logloss:0.55890                       
[18]	valid

In [39]:
best_result

{'learning_rate': 0.12946154361164866,
 'max_depth': 32.0,
 'min_child_weight': 12.709284030901497,
 'reg_alpha': 0.007016483955270201,
 'reg_lambda': 0.014015214063549948}

In [None]:
# import k-fold cross validation
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)


for train_index, val_index in kf.split(X_train):
    # Split the data into training and validation sets
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

In [71]:
X_train[100]

KeyError: 100

In [90]:
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import numpy as np


# Define the objective function for the hyperparameter optimization
def objective(params, X_train, y_train, k=5):

    X_train = X_train.values
    y_train = y_train.values

    # Convert the data to DMatrix objects
    train_dmatrix = xgb.DMatrix(X_train, label=y_train)

    with mlflow.start_run():
        # Set the model and the search space in the run metadata
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)

        # Create a KFold object for cross-validation
        kf = KFold(n_splits=k, shuffle=True, random_state=42)

        # Initialize the evaluation scores
        accuracy = []
        precision = []
        recall = []
        f1 = []

        # Iterate over the folds
        for train_index, val_index in kf.split(X_train):
            # Split the data into training and validation sets
            X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
            y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

            # Convert the data to DMatrix objects
            train_fold_dmatrix = xgb.DMatrix(X_train_fold, label=y_train_fold)
            val_fold_dmatrix = xgb.DMatrix(X_val_fold, label=y_val_fold)

            # Train the XGBoost model using the specified hyperparameters
            booster = xgb.train(
                params=params,
                dtrain=train_fold_dmatrix,
                num_boost_round=1000,
                evals=[(val_fold_dmatrix, 'validation')],
                early_stopping_rounds=50
            )
            
            # Make predictions on the validation data
            y_pred = booster.predict(val_fold_dmatrix).round()

            # Calculate the evaluation scores for the fold
            accuracy.append(accuracy_score(y_val_fold, y_pred))
            precision.append(precision_score(y_val_fold, y_pred))
            recall.append(recall_score(y_val_fold, y_pred))
            f1.append(f1_score(y_val_fold, y_pred))

        # Calculate the mean evaluation scores over all the folds
        mean_accuracy = np.mean(accuracy)
        mean_precision = np.mean(precision)
        mean_recall = np.mean(recall)
        mean_f1 = np.mean(f1)
        
        # Log the evaluation scores to MLFlow
        mlflow.log_metric("accuracy", mean_accuracy)
        mlflow.log_metric("precision", mean_precision)
        mlflow.log_metric("recall", mean_recall)
        mlflow.log_metric("f1_score", mean_f1)

    return {'loss': 1 - mean_f1, 'status': STATUS_OK}

In [91]:
# Define the search space for the hyperparameters
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'binary:logistic',
    'seed': 42
}

In [92]:
# Perform the hyperparameter optimization using the Tree Parzen Estimator algorithm
def wrapper(params):
    return objective(k=5, X_train=X_train, y_train=y_train, params=params)

best_result = fmin(
    fn=wrapper,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

[0]	validation-logloss:0.65850                        
[1]	validation-logloss:0.64235                        
[2]	validation-logloss:0.63732                        
[3]	validation-logloss:0.62995                        
[4]	validation-logloss:0.62101                        
[5]	validation-logloss:0.62656                        
[6]	validation-logloss:0.63138                        
[7]	validation-logloss:0.63204                        
[8]	validation-logloss:0.63716                        
[9]	validation-logloss:0.64687                        
[10]	validation-logloss:0.65185                       
[11]	validation-logloss:0.66286                       
[12]	validation-logloss:0.67077                       
[13]	validation-logloss:0.67393                       
[14]	validation-logloss:0.68689                       
[15]	validation-logloss:0.68835                       
[16]	validation-logloss:0.69117                       
[17]	validation-logloss:0.69092                       
[18]	valid