- Import Libraries

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import mlflow

- Load and Prepare Data

In [None]:
# Load data: three pre-split CSV files; the 'message' and 'label' columns are used below
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
valid = pd.read_csv('validation.csv')

# Map labels ('ham' -> 0, 'spam' -> 1)
LABEL_MAP = {'ham': 0, 'spam': 1}


def _messages_and_labels(frame, split_name):
    """Return ``(messages, labels)`` for one split without modifying ``frame``.

    Rows whose message is missing are dropped, and the labels are taken from
    the surviving rows only, so both Series share the same index.

    Args:
        frame: DataFrame with 'message' and 'label' columns.
        split_name: Human-readable split name, used only in the error message.

    Returns:
        Tuple of (message Series, integer label Series encoded via LABEL_MAP).

    Raises:
        ValueError: if a remaining row has a label that is missing or is not
            a key of LABEL_MAP.
    """
    # Remove missing values (dropna returns a new Series; nothing is mutated in place)
    messages = frame['message'].dropna()
    raw_labels = frame.loc[messages.index, 'label']
    labels = raw_labels.map(LABEL_MAP)

    # Series.map yields NaN for anything not in LABEL_MAP; fail here with a
    # clear message instead of letting NaN targets reach the classifiers.
    unmapped = labels.isna()
    if unmapped.any():
        bad_values = raw_labels[unmapped].unique().tolist()
        raise ValueError(
            f"{split_name} split: {int(unmapped.sum())} row(s) have labels "
            f"outside {sorted(LABEL_MAP)}: {bad_values}"
        )
    return messages, labels.astype(int)


X_train, y_train = _messages_and_labels(train, 'train')
X_valid, y_valid = _messages_and_labels(valid, 'validation')
X_test, y_test = _messages_and_labels(test, 'test')

- Text Vectorization (TF-IDF)

In [None]:
# TF-IDF vectorization: the vectorizer (capped at 5000 features) is fitted on
# the training messages only; validation and test messages are then projected
# with that same fitted vectorizer.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_valid_tfidf, X_test_tfidf = (
    vectorizer.transform(messages) for messages in (X_valid, X_test)
)

- Set Up MLflow Experiment

In [None]:
# Select the MLflow experiment under which the runs started later are recorded
mlflow.set_experiment(
    experiment_name="SMS Spam Classification with Hyperparameter Tuning"
)

- Define Hyperparameter Grids

In [None]:
# Search spaces handed to GridSearchCV, one per classifier family.

# Logistic regression: C values crossed with two solvers
param_grid_lr = dict(
    C=[0.1, 1, 10],
    solver=['liblinear', 'lbfgs'],
)

# Random forest: ensemble size, tree depth limit and split threshold
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 20],
    min_samples_split=[2, 5],
)

# Support vector machine: C values crossed with linear and RBF kernels
param_grid_svm = dict(
    C=[0.1, 1, 10],
    kernel=['linear', 'rbf'],
)

- Model Training and Logging Function

In [None]:
def train_and_log_model(model, param_grid, model_name):
    """Tune ``model`` by grid search, score it on the validation split, and log the run to MLflow.

    The function reads the notebook-level variables ``X_train_tfidf`` and
    ``y_train`` (used for the grid search) and ``X_valid_tfidf`` and
    ``y_valid`` (used for evaluation); those cells must have been run first.

    Args:
        model: Unfitted estimator passed to ``GridSearchCV``.
        param_grid: Parameter grid searched by ``GridSearchCV``.
        model_name: Used as the MLflow run name, in the confusion-matrix title
            and file name, and as the second argument of
            ``mlflow.sklearn.log_model``.

    Side effects:
        Writes ``"<model_name>_conf_matrix.png"`` to the current working
        directory, logs parameters, metrics, that image and the best estimator
        to the active MLflow run, shows the confusion matrix, and prints a
        completion message.
    """
    with mlflow.start_run(run_name=model_name):
        # Hyperparameter tuning with GridSearchCV (5-fold, selected on F1)
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1', n_jobs=-1)
        grid_search.fit(X_train_tfidf, y_train)

        # Get best model and parameters
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        y_pred = best_model.predict(X_valid_tfidf)

        # Evaluate performance on the validation split
        acc = accuracy_score(y_valid, y_pred)
        f1 = f1_score(y_valid, y_pred)

        # Log best parameters and metrics
        mlflow.log_params(best_params)
        mlflow.log_metric("best_cv_score", grid_search.best_score_)
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("f1_score", f1)

        # Confusion matrix visualization on an explicit Figure/Axes so the
        # figure can be released afterwards instead of piling up in pyplot's
        # global registry across calls.
        cm = confusion_matrix(y_valid, y_pred)
        fig, ax = plt.subplots(figsize=(5, 5))
        try:
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
            ax.set_title(f"{model_name} - Confusion Matrix")
            ax.set_xlabel('Predicted')
            ax.set_ylabel('Actual')

            image_path = f"{model_name}_conf_matrix.png"
            fig.savefig(image_path)

            # Log confusion matrix image
            mlflow.log_artifact(image_path)

            # Render now: the figure is closed below, so it would otherwise
            # not be displayed when the notebook cell finishes.
            plt.show()
        finally:
            plt.close(fig)

        # Log the trained model
        mlflow.sklearn.log_model(best_model, model_name)

        print(f"{model_name} training complete. Best params: {best_params}")

- Train and Log Models

In [None]:
# Train models: each entry is (estimator, search grid, display name) and is
# tuned, evaluated and logged in the order listed.
candidates = [
    (LogisticRegression(), param_grid_lr, "Logistic Regression"),
    (RandomForestClassifier(random_state=42), param_grid_rf, "Random Forest"),
    (SVC(), param_grid_svm, "Support Vector Machine"),
]
for estimator, grid, display_name in candidates:
    train_and_log_model(estimator, grid, display_name)

print("Hyperparameter tuning and training complete! 🚀")

- Launch MLflow UI (Optional)

In [None]:
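# To browse the logged runs, run `mlflow ui` in a terminal from this directory
# (it serves on http://127.0.0.1:5000 by default).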