In [152]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
import joblib
import os

In [128]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")

In [130]:
file_path = r"C:\Users\hp\Desktop\pfa\dataset\spam.csv"
data = pd.read_csv(file_path)

In [132]:
data['Category'] = data['Category'].map({'ham': 0, 'spam': 1})


In [134]:
X = data['Message']
y = data['Category']


In [136]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [138]:
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [140]:
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

In [142]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Naive Bayes': MultinomialNB(),
    'Support Vector Machine': SVC()
}

In [150]:
results = {}

# Iterate through each model in the models dictionary
for model_name, model in models.items():
    with mlflow.start_run():  # Start a new MLflow run for each model
        # Log model hyperparameters
        mlflow.log_param("model", model_name)
        
        # Train the model
        model.fit(X_train_tfidf, y_train)
        
        # Predict on the test set
        y_pred = model.predict(X_test_tfidf)
        
        # Evaluate the model
        metrics = classification_report(y_test, y_pred, output_dict=True)
        
        # Log the model and metrics to MLflow
        mlflow.log_metric("accuracy", metrics["accuracy"])
        mlflow.log_metric("precision_ham", metrics["0"]["precision"])
        mlflow.log_metric("precision_spam", metrics["1"]["precision"])
        mlflow.log_metric("recall_ham", metrics["0"]["recall"])
        mlflow.log_metric("recall_spam", metrics["1"]["recall"])
        mlflow.log_metric("f1_ham", metrics["0"]["f1-score"])
        mlflow.log_metric("f1_spam", metrics["1"]["f1-score"])
        
        # Define input example (use a sample from your data)
        input_example = vectorizer.transform([X_test.iloc[0]])  # Transform using the same vectorizer
        
        # Infer the signature of the model using a sample input/output
        signature = infer_signature(X_train_tfidf, model.predict(X_train_tfidf))
        
        # Log the model itself with signature and input example
        mlflow.sklearn.log_model(model, model_name, signature=signature, input_example=input_example)
        
        # Manually register the model (explicit model registration)
        model_uri = f"runs:/{mlflow.active_run().info.run_id}/{model_name}"
        mlflow.register_model(model_uri, model_name)  # Register the model with the model_name
        
        # Store the evaluation metrics and model name for later use
        results[model_name] = {
            "accuracy": metrics["accuracy"],
            "precision_ham": metrics["0"]["precision"],
            "precision_spam": metrics["1"]["precision"],
            "recall_ham": metrics["0"]["recall"],
            "recall_spam": metrics["1"]["recall"],
            "f1_ham": metrics["0"]["f1-score"],
            "f1_spam": metrics["1"]["f1-score"]
        }

        joblib.dump(vectorizer, 'vectorizer.pkl')  # Save vectorizer one last time after training
        

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'Logistic Regression' already exists. Creating a new version of this model...
2025/04/24 13:41:07 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Logistic Regression, version 5
Created version '5' of model 'Logistic Regression'.


🏃 View run glamorous-midge-278 at: http://127.0.0.1:5000/#/experiments/0/runs/b08968bde56a4e7b825037f6e54c4e5f
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'Naive Bayes' already exists. Creating a new version of this model...
2025/04/24 13:41:11 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Naive Bayes, version 4
Created version '4' of model 'Naive Bayes'.


🏃 View run rare-shrew-40 at: http://127.0.0.1:5000/#/experiments/0/runs/4c6838c3c82346149a934958fed99769
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'Support Vector Machine' already exists. Creating a new version of this model...
2025/04/24 13:41:17 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Support Vector Machine, version 4


🏃 View run intrigued-jay-886 at: http://127.0.0.1:5000/#/experiments/0/runs/89f58c4541094d24ad56ca5bb93934d0
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0


Created version '4' of model 'Support Vector Machine'.


In [146]:
for model_name, metrics in results.items():
    print(f"Model: {model_name}")
    print(f"Accuracy: {metrics['accuracy'] * 100:.2f}%")
    print(f"Precision (Ham): {metrics['0']['precision']:.2f}")
    print(f"Precision (Spam): {metrics['1']['precision']:.2f}")
    print(f"Recall (Ham): {metrics['0']['recall']:.2f}")
    print(f"Recall (Spam): {metrics['1']['recall']:.2f}")
    print(f"F1-Score (Ham): {metrics['0']['f1-score']:.2f}")
    print(f"F1-Score (Spam): {metrics['1']['f1-score']:.2f}")
    print("="*50)

Model: Logistic Regression
Accuracy: 96.77%


KeyError: '0'