In [2]:
import os
import sys
sys.path.append('../mlai_research/')
import log
import utils
import mlflow
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix
from skopt import BayesSearchCV
import plotly.express as px
import optuna
import joblib
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [3]:
# Notebook-wide logger, created by the `log` helper module imported above.
logger = log.get_logger(__name__)

In [4]:
def split_data(df, features, target):
    """
    Partition a dataframe into train and test subsets.

    The split is a fixed 80/20 hold-out with a constant seed (42), so
    calling this twice on the same dataframe yields the same partition.

    Parameters:
    df (pandas.DataFrame): Source data holding both inputs and target.
    features (list): Names of the columns used as model inputs.
    target (str): Name of the column to predict.

    Returns:
    tuple: X_train, X_test, y_train, y_test
    """
    inputs = df[features]
    labels = df[target]
    # train_test_split returns [X_train, X_test, y_train, y_test] in that order.
    parts = train_test_split(inputs, labels, test_size=0.2, random_state=42)
    return tuple(parts)

def train_model(model, X_train, y_train):
    """
    Fit an estimator on the training data and hand it back.

    The estimator is fitted in place; the object returned is the very same
    instance that was passed in (not whatever ``fit`` itself returns).

    Parameters:
    model (object): Estimator exposing a ``fit(X, y)`` method.
    X_train (pandas.DataFrame): Training features.
    y_train (pandas.Series): Training target.

    Returns:
    object: The fitted estimator.
    """
    estimator = model
    estimator.fit(X_train, y_train)
    return estimator

def evaluate_model(model, X_test, y_test):
    """
    Evaluate the machine learning model and log the results to MLflow.

    Logs the ROC AUC under the metric name "roc_auc" and a heat-map image of
    the confusion matrix as the artifact "confusion_matrix.png".

    Parameters:
    model (object): The trained machine learning model (needs ``predict``).
    X_test (pandas.DataFrame): Testing features.
    y_test (pandas.Series): Testing target.

    Returns:
    None
    """
    import tempfile  # local import: only needed for the scratch directory below

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate metrics
    # NOTE(review): roc_auc_score receives the hard class labels from
    # predict(), not decision scores/probabilities -- confirm this is intended.
    roc_auc = roc_auc_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # Log metrics to MLflow
    mlflow.log_metric("roc_auc", roc_auc)

    # Render the plot inside a private scratch directory instead of the
    # working directory: the file is cleaned up even when rendering or
    # uploading fails, and concurrent evaluations cannot overwrite each
    # other's image. The artifact keeps its file name.
    with tempfile.TemporaryDirectory() as scratch_dir:
        image_path = os.path.join(scratch_dir, "confusion_matrix.png")
        px.imshow(cm).write_image(image_path)
        mlflow.log_artifact(image_path)

def run_experiment(df, features, target, model, experiment_name):
    """
    Run a machine learning experiment and track it in MLflow.

    The data is split with ``split_data``, the model is fitted with
    ``train_model`` and scored with ``evaluate_model`` inside a single
    MLflow run that belongs to the experiment called ``experiment_name``.

    Parameters:
    df (pandas.DataFrame): The dataframe to use.
    features (list): The feature column names.
    target (str): The target column name.
    model (object): The machine learning model to use.
    experiment_name (str): The name of the experiment.

    Returns:
    object: Trained machine learning model.
    """
    X_train, X_test, y_train, y_test = split_data(df, features, target)

    # Select the experiment by NAME. The previous code passed the name as
    # `experiment_id`, which MLflow interprets as an ID, not a name.
    mlflow.set_experiment(experiment_name)

    # The context manager ends the run even if training or evaluation raises,
    # so a failure does not leave a dangling active run in the kernel.
    with mlflow.start_run():
        # Train the model
        trained_model = train_model(model, X_train, y_train)

        # Evaluate the model and log metrics to MLflow
        evaluate_model(trained_model, X_test, y_test)

    return trained_model

def hyperparameter_optimization(model, X_train, y_train, search_space, n_iter=32, random_state=0):
    """
    Perform hyperparameter optimization using Bayesian optimization.

    Parameters:
    model (object): The machine learning model to use.
    X_train (pandas.DataFrame): Training features.
    y_train (pandas.Series): Training target.
    search_space (dict): The search space for hyperparameters.
    n_iter (int): Number of parameter settings sampled by the search.
        Defaults to 32 (the value that used to be hard-coded); lower it when
        the search space is small.
    random_state (int): Seed passed to the search. Defaults to 0.

    Returns:
    dict: Best hyperparameters.
    """
    # Cross-validation and scoring are left at the BayesSearchCV defaults.
    bayes_search = BayesSearchCV(model, search_space, n_iter=n_iter, random_state=random_state)

    # Fit the search object to the data
    bayes_search.fit(X_train, y_train)

    return bayes_search.best_params_


def load_modelling_data(conf):
    """
    Load the training and validation arrays referenced by the configuration.

    Each file is read with ``np.load`` and must expose the keys ``'X'`` and
    ``'y'``. File locations are built by concatenating ``conf.data.path_mi``
    with ``conf.data.fn_train`` / ``conf.data.fn_val``.

    Parameters:
    conf (object): Configuration exposing ``data.path_mi``, ``data.fn_train``
        and ``data.fn_val``.

    Returns:
    tuple: X_train, y_train, X_val, y_val
    """
    # Open each archive in a `with` block so its file handle is released as
    # soon as the arrays have been read (np.load keeps the archive open
    # otherwise).
    with np.load(f"{conf.data.path_mi}{conf.data.fn_train}") as train_data:
        X_train = train_data['X']
        y_train = train_data['y']

    with np.load(f"{conf.data.path_mi}{conf.data.fn_val}") as val_data:
        X_val = val_data['X']
        y_val = val_data['y']

    return X_train, y_train, X_val, y_val

In [5]:
# Load the "base" configuration; `conf.data.*` supplies the data paths used by load_modelling_data.
conf = utils.load_config("base")

20-Dec-23 09:55:56 - INFO - Starting 'load_config'.
20-Dec-23 09:55:56 - INFO - Finished 'load_config' in 0.0228 secs.


In [6]:
# Define the objective function to optimize
def objective(trial, X_train, y_train):
    """
    Optuna objective: mean 3-fold cross-validation score of a random forest.

    Parameters:
    trial (optuna.trial.Trial): Trial used to sample the hyperparameters.
    X_train (array-like): Training features.
    y_train (array-like): Training target.

    Returns:
    float: Mean cross-validation score (estimator's default scorer) to maximise.
    """
    # Suggest hyperparameters
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 3, 20)

    # Seed the forest so that a given hyperparameter pair always yields the
    # same score; otherwise the search partly optimises random noise between
    # fits. 42 matches the seed used in split_data.
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)

    # Perform cross-validation
    return cross_val_score(model, X_train, y_train, cv=3).mean()

# Function to run the hyperparameter tuning
def run_optuna_tuning(X_train=None, y_train=None, storage='sqlite:///../data/07_model_output/example.db'):
    """
    Tune the random-forest hyperparameters with Optuna and persist the study.

    Runs 100 trials of ``objective`` on all available cores, stores the study
    in an RDB backend, pickles it to '../data/06_models/study.pkl' and prints
    the best trial.

    Parameters:
    X_train (array-like, optional): Training features. When either array is
        omitted, both are loaded with ``load_modelling_data(conf)``.
    y_train (array-like, optional): Training target.
    storage (str): Optuna storage URL. Defaults to a SQLite file under the
        project's data folder, relative to the notebook, instead of a
        user-specific path.

    Returns:
    None
    """
    if X_train is None or y_train is None:
        X_train, y_train, _, _ = load_modelling_data(conf)

    # Create a study object and specify the direction is 'maximize'.
    study = optuna.create_study(direction='maximize', study_name='rf_study', storage=storage, load_if_exists=True)

    # Optuna calls the objective with the trial only, so bind the data here;
    # passing `objective` directly raises TypeError for the missing arguments.
    def _bound_objective(trial):
        return objective(trial, X_train, y_train)

    # n_jobs=-1 will use all available CPU cores
    study.optimize(_bound_objective, n_trials=100, n_jobs=-1)

    # Save the study to a file
    joblib.dump(study, '../data/06_models/study.pkl')

    # Output the best trial
    best_trial = study.best_trial
    print('Best trial:')
    print(' Value: ', best_trial.value)
    print(' Params: ')
    for key, value in best_trial.params.items():
        print(f'  {key}: {value}')

In [7]:
# Load the train/validation arrays (X_train, y_train, X_val, y_val) from the files named in `conf`
X_train, y_train, X_val, y_val = load_modelling_data(conf)

In [8]:
# Sanity check: display the shapes of the loaded arrays
X_train.shape, y_train.shape, X_val.shape, y_val.shape

((48, 1581), (48,), (4, 1581), (4,))

In [7]:
# Run the tuning process (100 Optuna trials; the study is pickled to ../data/06_models/study.pkl)
run_optuna_tuning()

[I 2023-12-18 00:33:39,977] A new study created in RDB with name: rf_study
[I 2023-12-18 00:33:41,084] Trial 0 finished with value: 0.7916666666666666 and parameters: {'n_estimators': 71, 'max_depth': 19}. Best is trial 0 with value: 0.7916666666666666.
[I 2023-12-18 00:33:41,462] Trial 4 finished with value: 0.7916666666666666 and parameters: {'n_estimators': 88, 'max_depth': 16}. Best is trial 0 with value: 0.7916666666666666.
[I 2023-12-18 00:33:41,843] Trial 2 finished with value: 0.8333333333333334 and parameters: {'n_estimators': 132, 'max_depth': 9}. Best is trial 2 with value: 0.8333333333333334.
[I 2023-12-18 00:33:42,106] Trial 3 finished with value: 0.8333333333333334 and parameters: {'n_estimators': 155, 'max_depth': 3}. Best is trial 2 with value: 0.8333333333333334.
[I 2023-12-18 00:33:42,485] Trial 8 finished with value: 0.8125 and parameters: {'n_estimators': 86, 'max_depth': 6}. Best is trial 2 with value: 0.8333333333333334.
[I 2023-12-18 00:33:43,055] Trial 11 finish

Best trial:
 Value:  0.8541666666666666
 Params: 
  n_estimators: 297
  max_depth: 19


In [None]:
# To load the study later
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load study files that this project produced itself.
loaded_study = joblib.load('../data/06_models/study.pkl')

In [None]:
# Initialize the machine learning model: an SVC with default hyperparameters,
# which the search below then tunes
model = SVC()

In [None]:
# Define the parameter search space.
# The wider space (C, gamma, degree, kernel) is kept below for reference; only the kernel is searched for now.
# search_space = {"C": (1e-6, 1e+6, 'log-uniform'), "gamma": (1e-6, 1e+1, 'log-uniform'), "degree": (1, 8), "kernel": ['linear', 'poly', 'rbf']}
search_space = {"kernel": ['linear', 'poly', 'rbf']}

In [None]:
# Display the search space that will be passed to the optimizer
search_space

In [None]:
# Search for the best SVC hyperparameters on the training data (Bayesian search over `search_space`)
best_params = hyperparameter_optimization(model, X_train, y_train, search_space)

In [None]:

# run_experiment expects a dataframe plus column names, but the data in this
# notebook is loaded as NumPy arrays and `df`, `features`, `target` were never
# defined. Wrap the training arrays in a DataFrame with generated column names.
features = [f"feature_{i}" for i in range(X_train.shape[1])]
target = "target"
df = pd.DataFrame(X_train, columns=features)
df[target] = y_train

# Train and evaluate the SVC with the tuned hyperparameters in the "svc" MLflow experiment.
trained_model = run_experiment(df, features, target, model.set_params(**best_params), "svc")