In [1]:
pip install mlflow

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import mlflow
import mlflow.sklearn

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import average_precision_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [3]:
# Read the three pre-split CSV files (train / validation / test).
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("Data/train.csv", "Data/validation.csv", "Data/test.csv")
)

# Text / label columns of each frame exactly as loaded.
# NOTE(review): these are taken before any missing-value filtering.
X_train = train_df["text"]
y_train = train_df["label"]

X_val = val_df["text"]
y_val = val_df["label"]

X_test = test_df["text"]
y_test = test_df["label"]


In [4]:
# Per-column count of missing values in the training frame.
train_df.isnull().sum()

label    0
text     2
dtype: int64

In [5]:
def clean_nan_rows(df, text_col="text"):
    """Return a copy of ``df`` without rows whose text is missing or blank.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column named by ``text_col``.
    text_col : str, default "text"
        Column to inspect for missing / whitespace-only values.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, with their original index labels. ``df`` itself
        is not modified.
    """
    kept = df.dropna(subset=[text_col])
    # Cast to str for the blank check only: the .str accessor raises on
    # non-object columns (e.g. an all-NaN column that was read as float64).
    has_content = kept[text_col].astype(str).str.strip() != ""
    return kept[has_content]

In [6]:
# Drop missing / blank messages from every split.
train_df, val_df, test_df = map(clean_nan_rows, (train_df, val_df, test_df))

# X_test / y_test were sliced from the unfiltered test frame when the data was
# loaded; refresh them so they only cover the rows that survived cleaning.
X_test, y_test = test_df["text"], test_df["label"]

In [7]:
def vectorize_text(train_texts, val_texts=None, test_texts=None,
                   max_features=5000, stop_words="english"):
    """Fit a TF-IDF vectorizer on the training texts and transform each split.

    Parameters
    ----------
    train_texts : iterable of str
        Documents used to fit the vectorizer.
    val_texts, test_texts : iterable of str, optional
        Additional splits, transformed with the vocabulary fitted on
        ``train_texts``. A split that is ``None`` is skipped.
    max_features : int, default 5000
        Passed through to ``TfidfVectorizer(max_features=...)``.
    stop_words : default "english"
        Passed through to ``TfidfVectorizer(stop_words=...)``.

    Returns
    -------
    tuple
        ``(vectorizer, X_train, X_val, X_test)``; ``X_val`` / ``X_test`` are
        ``None`` when the corresponding texts were not supplied.
    """
    vectorizer = TfidfVectorizer(
        stop_words=stop_words,
        max_features=max_features,
    )

    # Fit on the training split only, so the other splits are transformed
    # with the training vocabulary rather than their own.
    X_train = vectorizer.fit_transform(train_texts)
    X_val = None if val_texts is None else vectorizer.transform(val_texts)
    X_test = None if test_texts is None else vectorizer.transform(test_texts)

    return vectorizer, X_train, X_val, X_test


In [8]:

# NOTE: despite the "_vec" suffix, these two names hold the message text
# columns; the TF-IDF matrices end up in X_train / X_val below.
X_train_vec, y_train = train_df["text"], train_df["label"]
X_val_vec, y_val = val_df["text"], val_df["label"]

# No test texts are passed, so the fourth return value is discarded.
vectorizer, X_train, X_val, _ = vectorize_text(
    train_texts=X_train_vec,
    val_texts=X_val_vec,
)

In [9]:
# Candidate classifiers, keyed by the name used for the MLflow run and the
# results table.
models = {
    "NaiveBayes": MultinomialNB(),
    "LogisticRegression": LogisticRegression(max_iter=1000),
    # LinearSVC's dual solver shuffles samples with an RNG; pin the seed so
    # the model ranking is reproducible from run to run.
    "LinearSVM": LinearSVC(random_state=42),
}

In [10]:
# Select the MLflow experiment that the runs below are recorded under.
mlflow.set_experiment(experiment_name="sms_spam_classification")

# Validation AUCPR per model name; filled in by the training loop.
results = dict()


In [16]:
# Train each candidate in its own MLflow run and score it on the validation split.
for name, model in models.items():
    with mlflow.start_run(run_name=name):
        model.fit(X_train, y_train)

        # Prefer raw decision scores when the estimator exposes them;
        # otherwise fall back to the second predict_proba column.
        if hasattr(model, "decision_function"):
            val_scores = model.decision_function(X_val)
        else:
            val_scores = model.predict_proba(X_val)[:, 1]

        # Average precision is used as the area under the precision-recall curve.
        aucpr = average_precision_score(y_val, val_scores)

        mlflow.log_param("model", name)
        mlflow.log_metric("AUCPR", aucpr)

        # Store the fitted estimator together with a small sample of training inputs.
        mlflow.sklearn.log_model(model, name="model", input_example=X_train[:5])

        # Only recorded once everything above has succeeded.
        results[name] = aucpr


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

In [17]:
# Name of the model with the highest validation AUCPR (first entry wins ties).
best_model = max(results.items(), key=lambda item: item[1])[0]
print("Best model:", best_model)

# Record the winner's name in a dedicated MLflow run.
with mlflow.start_run(run_name="best_model"):
    mlflow.log_param("best_model", best_model)


Best model: LinearSVM


In [18]:
# One line per model: its name followed by its validation AUCPR.
for model_name, score in results.items():
    print(model_name, "AUCPR:", score)

NaiveBayes AUCPR: 0.960256916066198
LogisticRegression AUCPR: 0.9389795687631639
LinearSVM AUCPR: 0.9645746701525639
