In [10]:
import pandas as pd

# Read the three dataset splits from CSV files in the working directory.
train, val, test = (
    pd.read_csv(csv_path)
    for csv_path in ('train.csv', 'validation.csv', 'test.csv')
)

# Integer encoding applied to the `label` column: Spam -> 1, Ham -> 0.
# `Series.replace` leaves any value that is not a key of this mapping unchanged.
label_mapping = {'Spam': 1, 'Ham': 0}

# Split every frame into its `text` column (model input) and its encoded
# `label` column (target).
X_train, y_train = train['text'], train['label'].replace(label_mapping)
X_val, y_val = val['text'], val['label'].replace(label_mapping)
X_test, y_test = test['text'], test['label'].replace(label_mapping)


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, average_precision_score
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient

In [12]:
def _build_pipeline(model_type, hyperparams):
    """Build the unfitted TF-IDF + classifier pipeline for ``model_type``.

    ``hyperparams`` is forwarded to the classifier constructor. Where the
    function applies a default ``random_state`` of 42, the default is merged in
    first so that a caller-supplied ``random_state`` overrides it instead of
    colliding with it as a duplicate keyword argument.

    Raises:
        ValueError: if ``model_type`` is not one of 'Random_Forest',
            'Bernoulli_Naive_Bayes' or 'Support_Vector_Machine'.
    """
    if model_type == 'Random_Forest':
        classifier = RandomForestClassifier(**{'random_state': 42, **hyperparams})
    elif model_type == 'Bernoulli_Naive_Bayes':
        classifier = BernoulliNB(**hyperparams)
    elif model_type == 'Support_Vector_Machine':
        # probability=True stays fixed: evaluation below relies on predict_proba.
        classifier = SVC(probability=True, **{'random_state': 42, **hyperparams})
    else:
        raise ValueError("Model type not recognized.")

    return Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('clf', classifier),
    ])


def train_and_log_model(model_type, X_train, y_train, X_val, y_val, hyperparams=None, metadata=None):
    """Train one text-classification pipeline, log it to MLflow and register it.

    The pipeline (TF-IDF features followed by the classifier selected by
    ``model_type``) is fitted on the training split and scored on the
    validation split. Inside a single MLflow run named ``model_type`` the
    function logs the hyperparameters, the ``accuracy`` and ``aucpr`` metrics
    and the fitted pipeline, then creates a new version of the registered model
    ``model_type`` and tags that version.

    Args:
        model_type: 'Random_Forest', 'Bernoulli_Naive_Bayes' or
            'Support_Vector_Machine'. Also used as the run name and the
            registered-model name.
        X_train, y_train: training inputs and targets passed to ``fit``.
        X_val, y_val: validation inputs and targets used for the metrics.
        hyperparams: optional mapping of keyword arguments for the classifier
            constructor. Defaults to no extra arguments.
        metadata: optional mapping of tag name -> value to attach to the new
            model version. A 'Created by' tag is always added (overriding any
            value supplied under that key). The mapping passed in is not
            modified.

    Returns:
        None. Results are logged to MLflow and summarised on stdout.

    Raises:
        ValueError: if ``model_type`` is not recognised. This is raised before
            any MLflow run is started.
    """
    # Work on private copies: the tag mapping is extended below, and neither the
    # caller's dict nor a shared default object may be affected by that.
    hyperparams = dict(hyperparams or {})
    version_tags = dict(metadata or {})
    version_tags['Created by'] = 'Anjan'

    # Build the pipeline first so an unsupported model type (or an invalid
    # hyperparameter name) fails before a run is opened in the tracking store.
    model_pipeline = _build_pipeline(model_type, hyperparams)

    with mlflow.start_run(run_name=model_type) as run:
        # Train the model
        model_pipeline.fit(X_train, y_train)

        # Evaluate the model on the validation dataset
        y_pred_val = model_pipeline.predict(X_val)
        accuracy = accuracy_score(y_val, y_pred_val)
        # Column 1 of predict_proba is the score of the second class in
        # ``classes_`` (label 1 when the targets are encoded as 0/1).
        positive_scores = model_pipeline.predict_proba(X_val)[:, 1]
        aucpr = average_precision_score(y_val, positive_scores)

        # Log parameters, metrics, and model
        mlflow.log_params(hyperparams)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("aucpr", aucpr)
        mlflow.sklearn.log_model(model_pipeline, f"model_{model_type}")

        run_id = run.info.run_id

        # Ensure the registered model exists. Only the "already exists" error is
        # expected here; any other registry failure is propagated instead of
        # being reported as a duplicate.
        client = MlflowClient()
        try:
            client.create_registered_model(model_type)
        except mlflow.exceptions.MlflowException as exc:
            if getattr(exc, 'error_code', None) != 'RESOURCE_ALREADY_EXISTS':
                raise
            print(f"Model {model_type} already exists in the registry.")

        # Create a new version of the model in the registry
        model_uri = f"runs:/{run_id}/model_{model_type}"
        model_version_info = client.create_model_version(model_type, model_uri, run_id)

        # Add metadata tags to the model version
        for tag_key, tag_value in version_tags.items():
            client.set_model_version_tag(
                model_type,
                model_version_info.version,
                tag_key,
                tag_value
            )

        print(f"Model {model_type}, version {model_version_info.version} registered in the MLflow Model Registry with tags {version_tags}.")
        print(f"Model: {model_type}, Accuracy: {accuracy}, AUCPR: {aucpr}")


In [13]:
# Select the MLflow experiment by name; runs started after this call are
# recorded under it. As the last expression of the cell, the returned
# Experiment object is what the notebook displays.
mlflow.set_experiment("Email Spam-Ham Classification")

<Experiment: artifact_location='file:///G:/Coursework/AML/Assignment/A2/mlruns/607309106037128344', creation_time=1708934342659, experiment_id='607309106037128344', last_update_time=1708934342659, lifecycle_stage='active', name='Email Spam-Ham Classification', tags={}>

In [14]:
# Review/deployment status tags, presumably meant for the `metadata` argument of
# train_and_log_model.
# NOTE(review): `tags` is not passed to any call in the visible cells (the
# training loop omits `metadata`), so these values are never attached to a
# registered model version. Confirm whether `metadata=tags` was intended.
tags = {
    "Review": "Passed",
    "Ready for Deployment": "Yes"
}

In [15]:
# Training and logging models
# One MLflow run and one registered-model version is produced per entry. Each
# call uses the function's default hyperparameters and passes no `metadata`, so
# the only version tag set is the one the function adds itself.
# NOTE(review): `model_name` remains bound to the last entry after the loop and
# is reassigned by a later cell.
model_names = ['Random_Forest', 'Bernoulli_Naive_Bayes', 'Support_Vector_Machine']
for model_name in model_names:
    train_and_log_model(model_name, X_train, y_train, X_val, y_val)



Model Random_Forest, version 1 registered in the MLflow Model Registry with tags {'Created by': 'Anjan'}.
Model: Random_Forest, Accuracy: 0.9755529685681025, AUCPR: 0.9952920386979857
Model Bernoulli_Naive_Bayes, version 1 registered in the MLflow Model Registry with tags {'Created by': 'Anjan'}.
Model: Bernoulli_Naive_Bayes, Accuracy: 0.9650756693830035, AUCPR: 0.995387572517552
Model Support_Vector_Machine, version 1 registered in the MLflow Model Registry with tags {'Created by': 'Anjan'}.
Model: Support_Vector_Machine, Accuracy: 0.989522700814901, AUCPR: 0.9988626301220084


In [17]:
# Chosen model after comparing results stored in the user interface
# Both values are interpolated into a `models:/<name>/<version>` URI by the next cell.
model_name = "Support_Vector_Machine"  # registered model name in the MLflow Model Registry
model_version = "1"  # registry version to load, kept as a string

In [18]:
# Build the Model Registry URI for the chosen name/version and load the logged
# scikit-learn pipeline from it, rather than reusing an in-memory object from
# the training cells.
model_uri = f"models:/{model_name}/{model_version}"
model = mlflow.sklearn.load_model(model_uri)

In [19]:
# Score the test split with the registry-loaded model.
y_pred = model.predict(X_test)

# Accuracy uses the hard predictions; average precision (AUCPR) uses column 1
# of predict_proba as the positive-class score.
accuracy, aucpr = (
    accuracy_score(y_test, y_pred),
    average_precision_score(y_test, model.predict_proba(X_test)[:, 1]),
)

In [20]:
# Report both test-set metrics, one per line.
print(
    f"Accuracy on test dataset: {accuracy}",
    f"AUCPR on test dataset: {aucpr}",
    sep="\n",
)

Accuracy on test dataset: 0.9883720930232558
AUCPR on test dataset: 0.9983005790297361
