In [2]:
# loading the libraries
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_recall_curve, auc
from sklearn.model_selection import GridSearchCV

import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient

import warnings
warnings.filterwarnings('ignore')

In [3]:
datasets = ['train', 'val', 'test']
loaded_data = {}
for dataset in datasets:
    loaded_data[f'{dataset}'] = pd.read_csv(f'E:/Coding/Applied Machine Learning/02-experimental-tracking/data/prepared/{dataset}.csv')

# Accessing the loaded datasets:
train_X = loaded_data['train'].text
train_y = loaded_data['train'].spam
val_X = loaded_data['val'].text
val_y = loaded_data['val'].spam
test_X = loaded_data['test'].text
test_y = loaded_data['test'].spam

In [4]:
def train_models(train_X, train_y):
    # XGBoost model
    lgb_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=10000)),
        ('clf', LGBMClassifier(random_state=1, force_row_wise=True))
    ])
    lgb_pipeline.fit(train_X, train_y)

    # Logistic Regression model
    lr_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=10000)),
        ('clf', LogisticRegression(random_state=1))
    ])
    lr_pipeline.fit(train_X, train_y)

    # Multinomial Naive Bayes model
    nb_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=10000)),
        ('clf', MultinomialNB())
    ])
    nb_pipeline.fit(train_X, train_y)

    return lgb_pipeline, lr_pipeline, nb_pipeline

In [5]:
# function for evaluating the models
def AUCPR_models(pipelines, test_X, test_y):
    results = {}
    for name, pipeline in pipelines.items():
        predictions = pipeline.predict(test_X)
        precision, recall, _ = precision_recall_curve(test_y, predictions)
        auc_precision_recall = auc(recall, precision)
        results[name] = auc_precision_recall
        print(f'{name} AUCPR: {auc_precision_recall:.4f}')
    return results

In [6]:
# Creating a new MLflow Experiment
mlflow.set_experiment("Model Version Control and Experiment Tracking Email for Spam Classification Problem")

2024/02/20 03:23:38 INFO mlflow.tracking.fluent: Experiment with name 'Model Version Control and Experiment Tracking Email for Spam Classification Problem' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///e:/Coding/Applied%20Machine%20Learning/02-experimental-tracking/src/mlruns/690074325115179996', creation_time=1708379618443, experiment_id='690074325115179996', last_update_time=1708379618443, lifecycle_stage='active', name=('Model Version Control and Experiment Tracking Email for Spam Classification '
 'Problem'), tags={}>

In [9]:
# Start MLflow run
with mlflow.start_run():

    # Define parameters
    mlflow.log_param("max_features", 10000)
    
    # Train models
    lgb_pipeline, lr_pipeline, nb_pipeline = train_models(train_X, train_y)

    # Log models in MLflow
    model_names = ["LGBMClassifier", "LogisticRegression", "MultinomialNB"]
    models = [lgb_pipeline, lr_pipeline, nb_pipeline]

    # Storing the pipelines in a dictionary for easy reference
    pipelines = {
    'Light GBM': lgb_pipeline,
    'Logistic Regression': lr_pipeline,
    'Multinomial Naive Bayes': nb_pipeline
    }

    # Evaluate models on test data
    evaluation_results = AUCPR_models(pipelines, test_X, test_y)

    # Log evaluation metrics
    for name, aucpr in evaluation_results.items():
        mlflow.log_metric(f'{name}_AUCPR', aucpr)

    # Initialize MLflow client
    client = MlflowClient()

    for model_name, model in zip(model_names, models):
        mlflow.sklearn.log_model(model, model_name)
        
        # Create or get the model name in the MLflow Model Registry
        try:
            client.create_registered_model(model_name)
        except Exception as e:
            print(f"Model {model_name} already exists in the registry.")

        # Create a new version of the model in the registry
        model_uri = f"runs:/{mlflow.active_run().info.run_id}/{model_name}"
        model_details = client.create_model_version(model_name, model_uri, mlflow.active_run().info.run_id)
        
        print(f"Model Version: {model_details.version} for model {model_name} has been logged to MLflow Registry.")

[LightGBM] [Info] Number of positive: 817, number of negative: 2619
[LightGBM] [Info] Total Bins 76811
[LightGBM] [Info] Number of data points in the train set: 3436, number of used features: 2508
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.237776 -> initscore=-1.164909
[LightGBM] [Info] Start training from score -1.164909
Light GBM AUCPR: 0.9608
Logistic Regression AUCPR: 0.9646
Multinomial Naive Bayes AUCPR: 0.9304
Model LGBMClassifier already exists in the registry.
Model Version: 3 for model LGBMClassifier has been logged to MLflow Registry.
Model LogisticRegression already exists in the registry.
Model Version: 3 for model LogisticRegression has been logged to MLflow Registry.
Model MultinomialNB already exists in the registry.
Model Version: 3 for model MultinomialNB has been logged to MLflow Registry.


In [10]:
! mlflow ui

^C


In [16]:
# Start of MLflow experiment tracking
mlflow.set_experiment("(GridSearchCV) Model Version Control and Experiment Tracking Email for Spam Classification Problem with Hyperparameter Tuning")

2024/02/20 03:39:02 INFO mlflow.tracking.fluent: Experiment with name '(GridSearchCV) Model Version Control and Experiment Tracking Email for Spam Classification Problem with Hyperparameter Tuning' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///e:/Coding/Applied%20Machine%20Learning/02-experimental-tracking/src/mlruns/400935689654729131', creation_time=1708380542167, experiment_id='400935689654729131', last_update_time=1708380542167, lifecycle_stage='active', name=('(GridSearchCV) Model Version Control and Experiment Tracking Email for Spam '
 'Classification Problem with Hyperparameter Tuning'), tags={}>

In [17]:
# Define the base models
models_gcv = {
    'lightgbm': Pipeline([
        ('tfidf', TfidfVectorizer(max_features=10000)),
        ('clf', LGBMClassifier(random_state=1, force_row_wise=True))
    ]),

    'logistic_regression': Pipeline([
        ('tfidf', TfidfVectorizer(max_features=10000)),
        ('clf', LogisticRegression(random_state=1))
    ]),

    'naive_bayes': Pipeline([
        ('tfidf', TfidfVectorizer(max_features=10000)),
        ('clf', MultinomialNB())
    ])
}

# Define the parameter grids for each model
param_grids = {
    'lightgbm': {
        'tfidf__max_df': [0.5, 0.75],
        'tfidf__ngram_range': [(1, 1), (1, 2)],
        'clf__learning_rate': [0.01, 0.1, 0.5],
        'clf__num_leaves': [15,31,63],
        'clf__max_depth': [6, 8]
    },

    'logistic_regression': {
        'tfidf__max_df': [0.5, 0.75],
        'tfidf__ngram_range': [(1, 1), (1, 2)],
        'clf__C': [0.1, 1, 10]
    },
    
    'naive_bayes': {
        'tfidf__max_df': [0.5, 0.75],
        'tfidf__ngram_range': [(1, 1), (1, 2)],
        'clf__alpha': [0.01, 0.1, 1]
    }
}

In [18]:
def fit_models_with_grid_search(models, param_grids, X_train, y_train):
    fitted_models = {}
    
    for name, model in models.items():
        print(f"Fitting {name}...")
        grid_search = GridSearchCV(model, param_grids[name], cv=5, n_jobs=-1)
        grid_search.fit(X_train, y_train)
        fitted_models[name] = grid_search
        
    return fitted_models

In [19]:
def AUCPR_models(fitted_models, test_X, test_y):
    results = {}
    for name, grid_search_cv in fitted_models.items():
        best_estimator = grid_search_cv.best_estimator_
        predictions = best_estimator.predict(test_X)
        precision, recall, _ = precision_recall_curve(test_y, predictions)
        auc_precision_recall = auc(recall, precision)
        results[name] = auc_precision_recall
        print(f'{name} AUCPR: {auc_precision_recall:.4f}')
    return results

In [25]:
with mlflow.start_run():
    mlflow.log_param("max_features", 10000)
    
    # Train models with GridSearchCV
    fitted_models = fit_models_with_grid_search(models_gcv, param_grids, train_X, train_y)

    # Initialize MLflow client
    client = MlflowClient()
    
    for name, grid_search_cv in fitted_models.items():
        best_estimator = grid_search_cv.best_estimator_
        best_params = grid_search_cv.best_params_

        # Log the best model from GridSearchCV with a specific naming convention
        model_name_with_suffix = f"{name}_grid_search_cv"  # Append _grid_search_cv to the model name
        model_path = f"{model_name_with_suffix}_best_model"
        mlflow.sklearn.log_model(best_estimator, model_path)
        
        # Log best parameters
        for param, value in best_params.items():
            mlflow.log_param(f"{model_name_with_suffix}_{param}", value)

        # Create or get the model name in the MLflow Model Registry with the suffix
        try:
            client.create_registered_model(model_name_with_suffix)
        except Exception as e:
            print(f"Model {model_name_with_suffix} already exists in the registry.")

        # Create a new version of the model in the registry
        model_uri = f"runs:/{mlflow.active_run().info.run_id}/{model_path}"
        model_details = client.create_model_version(model_name_with_suffix, model_uri, mlflow.active_run().info.run_id)
        
        print(f"Model Version: {model_details.version} for model {model_name_with_suffix} has been logged to MLflow Registry.")

    # Evaluate models on test data and log AUCPR
    AUCPR_results = AUCPR_models(fitted_models, test_X, test_y)
    for name, aucpr in AUCPR_results.items():
        mlflow.log_metric(f'{name}_grid_search_cv_AUCPR', aucpr)

Fitting lightgbm...
[LightGBM] [Info] Number of positive: 817, number of negative: 2619
[LightGBM] [Info] Total Bins 76546
[LightGBM] [Info] Number of data points in the train set: 3436, number of used features: 2507
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.237776 -> initscore=-1.164909
[LightGBM] [Info] Start training from score -1.164909
Fitting logistic_regression...
Fitting naive_bayes...
Model lightgbm_grid_search_cv already exists in the registry.
Model Version: 3 for model lightgbm_grid_search_cv has been logged to MLflow Registry.
Model logistic_regression_grid_search_cv already exists in the registry.
Model Version: 3 for model logistic_regression_grid_search_cv has been logged to MLflow Registry.
Model naive_bayes_grid_search_cv already exists in the registry.
Model Version: 3 for model naive_bayes_grid_search_cv has been logged to MLflow Registry.
lightgbm AUCPR: 0.9621
logistic_regression AUCPR: 0.9908
naive_bayes AUCPR: 0.9772


In [24]:
! mlflow ui

^C


---

**Utilizing the optimal version of the top-performing model to compute the "AUCPR" metric on the test dataset.**

In [22]:
model_name = "logistic_regression_grid_search_cv"
model_version = "1"

model_uri = f"models:/{model_name}/{model_version}"
model = mlflow.sklearn.load_model(model_uri)

In [23]:
pred_y = model.predict(test_X)
# Calculating metrics on the test dataset
precision, recall, _ = precision_recall_curve(test_y, pred_y)
auc_precision_recall = auc(recall, precision)

print(f"AUCPR on test dataset: {auc_precision_recall}")

AUCPR on test dataset: 0.9908318297961889
