In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import mlflow

In [2]:
# import data
train = pd.read_csv('/Users/abdessamadbaahmed/Desktop/livrable_mp_data/data/nba_logreg_train.csv')

In [3]:
# define features and target
X_train = train.drop('TARGET_5Yrs', axis=1)
y_train = train['TARGET_5Yrs']

In [4]:
# setting the experiment 
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("nba-investment-experiment")

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1673098660203, experiment_id='1', last_update_time=1673098660203, lifecycle_stage='active', name='nba-investment-experiment', tags={}>

In [5]:
# initialize mlflow client
client = mlflow.tracking.MlflowClient()

# Search for experiments
experiments = client.search_experiments()

# Print the experiment ID and name for each experiment
for experiment in experiments:
    print(f"Experiment ID: {experiment.experiment_id}")
    print(f"Experiment name: {experiment.name}")

Experiment ID: 1
Experiment name: nba-investment-experiment
Experiment ID: 0
Experiment name: Default


In [6]:
# split the training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [7]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

# Convert the data to DMatrix objects
train_dmatrix = xgb.DMatrix(X_train, label=y_train)
validation_dmatrix = xgb.DMatrix(X_val, label=y_val)

# Define the objective function for the hyperparameter optimization
def objective(params):
    with mlflow.start_run():
        # Set the model and the search space in the run metadata
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)

        # Train the XGBoost model using the specified hyperparameters
        booster = xgb.train(
            params=params,
            dtrain=train_dmatrix,
            num_boost_round=1000,
            evals=[(validation_dmatrix, 'validation')],
            early_stopping_rounds=50
        )
        
        # Make predictions on the validation data
        y_pred = booster.predict(validation_dmatrix).round()
        
        # Calculate the evaluation scores
        accuracy = accuracy_score(y_val, y_pred)
        precision = precision_score(y_val, y_pred)
        recall = recall_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred)
        
        # Log the evaluation scores to MLFlow
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)

    return {'loss': 1 - f1, 'status': STATUS_OK}

# Define the search space for the hyperparameters
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'binary:logistic',
    'seed': 42
}

# Perform the hyperparameter optimization using the Tree Parzen Estimator algorithm
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

[0]	validation-logloss:0.67926                        
[1]	validation-logloss:0.66583                        
[2]	validation-logloss:0.65627                        
[3]	validation-logloss:0.64923                        
[4]	validation-logloss:0.64094                        
[5]	validation-logloss:0.63297                        
[6]	validation-logloss:0.62784                        
[7]	validation-logloss:0.62269                        
[8]	validation-logloss:0.61857                        
[9]	validation-logloss:0.61235                        
[10]	validation-logloss:0.60580                       
[11]	validation-logloss:0.60094                       
[12]	validation-logloss:0.59769                       
[13]	validation-logloss:0.59401                       
[14]	validation-logloss:0.59075                       
[15]	validation-logloss:0.58865                       
[16]	validation-logloss:0.58633                       
[17]	validation-logloss:0.58631                       
[18]	valid

In [8]:
best_result

{'learning_rate': 0.12733162898151917,
 'max_depth': 47.0,
 'min_child_weight': 14.94503580200527,
 'reg_alpha': 0.07462833916308295,
 'reg_lambda': 0.014349514140204129}

In [None]:
# import k-fold cross validation
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)


for train_index, val_index in kf.split(X_train):
    # Split the data into training and validation sets
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

In [None]:
X_train[100]

In [None]:
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import numpy as np


# Define the objective function for the hyperparameter optimization
def objective(params, X_train, y_train, k=5):

    X_train = X_train.values
    y_train = y_train.values

    # Convert the data to DMatrix objects
    train_dmatrix = xgb.DMatrix(X_train, label=y_train)

    with mlflow.start_run():
        # Set the model and the search space in the run metadata
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)

        # Create a KFold object for cross-validation
        kf = KFold(n_splits=k, shuffle=True, random_state=42)

        # Initialize the evaluation scores
        accuracy = []
        precision = []
        recall = []
        f1 = []

        # Iterate over the folds
        for train_index, val_index in kf.split(X_train):
            # Split the data into training and validation sets
            X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
            y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

            # Convert the data to DMatrix objects
            train_fold_dmatrix = xgb.DMatrix(X_train_fold, label=y_train_fold)
            val_fold_dmatrix = xgb.DMatrix(X_val_fold, label=y_val_fold)

            # Train the XGBoost model using the specified hyperparameters
            booster = xgb.train(
                params=params,
                dtrain=train_fold_dmatrix,
                num_boost_round=1000,
                evals=[(val_fold_dmatrix, 'validation')],
                early_stopping_rounds=50
            )
            
            # Make predictions on the validation data
            y_pred = booster.predict(val_fold_dmatrix).round()

            # Calculate the evaluation scores for the fold
            accuracy.append(accuracy_score(y_val_fold, y_pred))
            precision.append(precision_score(y_val_fold, y_pred))
            recall.append(recall_score(y_val_fold, y_pred))
            f1.append(f1_score(y_val_fold, y_pred))

        # Calculate the mean evaluation scores over all the folds
        mean_accuracy = np.mean(accuracy)
        mean_precision = np.mean(precision)
        mean_recall = np.mean(recall)
        mean_f1 = np.mean(f1)
        
        # Log the evaluation scores to MLFlow
        mlflow.log_metric("accuracy", mean_accuracy)
        mlflow.log_metric("precision", mean_precision)
        mlflow.log_metric("recall", mean_recall)
        mlflow.log_metric("f1_score", mean_f1)

    return {'loss': 1 - mean_f1, 'status': STATUS_OK}

In [None]:
# Define the search space for the hyperparameters
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'binary:logistic',
    'seed': 42
}

In [None]:
# Perform the hyperparameter optimization using the Tree Parzen Estimator algorithm
def wrapper(params):
    return objective(k=5, X_train=X_train, y_train=y_train, params=params)

best_result = fmin(
    fn=wrapper,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)