The objective of this notebook is to experiment with oversampling techniques to balance the dataset.


## Datasets

In [1]:

import pandas as pd
from mlflow.models import infer_signature
import warnings
warnings.filterwarnings('ignore')


# Load the three gold-layer splits from disk.
train = pd.read_csv("../../data/gold/train.csv")
val = pd.read_csv("../../data/gold/validation.csv")
test = pd.read_csv("../../data/gold/test.csv")

# Separate model input ('features') from the label ('target') for the
# splits that are used for fitting and evaluation.
X_train, y_train = train['features'], train['target']
X_test, y_test = test['features'], test['target']

# Input/output schema inferred from the training data; it is passed to
# MLflow when the fitted pipeline is logged.
signature = infer_signature(X_train, y_train)


In [2]:
# Class balance of the training split (target == 1 is SPAM, target == 0 is HAM).
# Rows are counted with a boolean mask on 'target' instead of
# DataFrame.value_counts().sum(): value_counts() groups identical rows and, by
# default, drops rows containing NaN, so it can silently under-count a class.
spam_rows = (train['target'] == 1).sum()
print(f"The SPAM rows of our dataset for modelling is: {spam_rows}")
ham_rows = (train['target'] == 0).sum()
print(f"The HAM rows of our dataset for modelling is: {ham_rows}")

print(f"The total rows of our dataset for modelling is: {len(train)}")
# Number of extra SPAM rows needed for both classes to have the same size.
rows_to_create = ham_rows - spam_rows
print(f"We have to create {rows_to_create} rows to balance the dataset")



The SPAM rows of our dataset for modelling is: 517
The HAM rows of our dataset for modelling is: 3606
The total rows of our dataset for modelling is: 4123
We have to create 3089 rows to balance the dataset


517

## Augmentation 


The strategy is to repeat rows until the target size is reached (3606 rows, the number of HAM rows), and then apply text transformations to those repeated rows.

In [4]:
from experiments_utils import TextAugmentation
import diskcache
import time

# On-disk cache so the slow augmentation step only has to run once.
cache = diskcache.Cache('my_cache')

if 'augmented_emails' not in cache:
    text_aug = TextAugmentation(alpha_sr=0.2, alpha_ri=0.2, alpha_rs=0.2, p_rd=0.1)
    # NOTE(review): only the first 50 training rows are augmented, regardless
    # of their class - confirm this is the intended sample.
    ref = train.iloc[0:50].copy()
    batch_size = 10  # Use a small batch size
    augmented_batches = []

    for i in range(0, len(ref), batch_size):
        batch = ref.iloc[i:i + batch_size].copy()
        df_batch_augmented = text_aug.eda(batch)
        augmented_batches.append(df_batch_augmented)

        # Pause 5 seconds between batches to ease the load
        time.sleep(5)

    # Combine all batches into a single DataFrame
    df_augmented = pd.concat(augmented_batches, ignore_index=True)

    cache['augmented_emails'] = df_augmented
    print("augmented_emails created.")
else:
    # Read back the same key that is checked and written above, and bind it to
    # df_augmented so the cell works on a cache hit as well as on a cache miss.
    df_augmented = cache['augmented_emails']
    print("augmented_emails loaded.")

df_augmented
    

: 

## Pipeline

In [None]:
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    fbeta_score,
    roc_curve,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    precision_recall_curve,
    auc,
    
    balanced_accuracy_score,
)


# Bag-of-words counts fed into a multinomial Naive Bayes classifier,
# fitted on the training split (Pipeline.fit returns the fitted pipeline).
pipeline = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer(ngram_range=(1, 1), max_features=None)),
        ('classifier', MultinomialNB()),
    ]
).fit(X_train, y_train)

# Hard class predictions for the train and test splits, in that order.
y_train_pred, y_test_pred = (
    pipeline.predict(split_features) for split_features in (X_train, X_test)
)


### Metrics

In [None]:
# Per-class precision / recall / F1 for both splits, reported to 3 decimals.
class_names = ['ham', 'spam']
train_report = classification_report(
    y_train, y_train_pred, target_names=class_names, digits=3
)
test_report = classification_report(
    y_test, y_test_pred, target_names=class_names, digits=3
)

# Header line followed by the report text, one pair per split.
print("Classification Report (Train Data):", train_report, sep="\n")
print("Classification Report (Test Data):", test_report, sep="\n")

In [None]:
# Probability of the positive class (column 1 of predict_proba) per split.
y_train_prob = pipeline.predict_proba(X_train)[:, 1]
y_test_prob = pipeline.predict_proba(X_test)[:, 1]

# ROC points for each split; the thresholds returned by roc_curve are not used.
fpr_train, tpr_train, _ = roc_curve(y_train, y_train_prob)
fpr_test, tpr_test, _ = roc_curve(y_test, y_test_prob)
auc_train, auc_test = auc(fpr_train, tpr_train), auc(fpr_test, tpr_test)

# Draw both curves on one axes, plus the diagonal of a random classifier.
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr_train, tpr_train, label=f'Train ROC (AUC = {auc_train:.3f})', color='blue', linewidth=2)
roc_ax.plot(fpr_test, tpr_test, label=f'Test ROC (AUC = {auc_test:.3f})', color='red', linestyle='--', linewidth=2)
roc_ax.plot([0, 1], [0, 1], color='gray', linestyle='--', linewidth=1, label='Random classifier')
roc_ax.set_title('ROC Curve', fontsize=16)
roc_ax.set_xlabel('False Positive Rate (FPR)', fontsize=12)
roc_ax.set_ylabel('True Positive Rate (TPR)', fontsize=12)
roc_ax.legend(loc='lower right', fontsize=12)
roc_ax.grid(alpha=0.3)
roc_fig.tight_layout()
plt.show()

Notes: 

- Precision for SPAM drops, bad news...
- Recall for SPAM gets a bit better.
- F1-score obviously drops because of precision
- Macro avg doesn't get better either...

This model is not fitting the training data well: its metrics are clearly worse than those of the other models at classifying SPAM messages.



Notes: 



In [None]:
# F-beta on the test split with beta=0.5, which weights precision more
# heavily than recall.
f0_5_score = fbeta_score(y_true=y_test, y_pred=y_test_pred, beta=0.5)
print(f"F0.5-Score: {f0_5_score:.3f}")

Notes:

- f0.5-score: not that important since we have our dataset balanced now.
- Precision: dropped to 52%, i.e. only 52% of the predicted positives are true positives.
- Recall: 98% of the actual positives are correctly identified.
- F1-score: the model is doing a decent job on the label 0 but not on the label 1.

In [None]:
# Confusion matrix of the test predictions, drawn on a dedicated figure.
# matrix_fig is kept so the figure can be logged to MLflow later on.
matrix_fig, ax = plt.subplots(figsize=(8, 6))
test_conf_matrix = confusion_matrix(y_test, y_test_pred)
cm_display = ConfusionMatrixDisplay(test_conf_matrix, display_labels=['Ham', 'Spam'])
cm_display.plot(ax=ax, cmap='Blues')
ax.set_title("Confusion Matrix")
plt.show()

Notes:

The confusion matrix looks very good.
- 57 e-mails were predicted as SPAM but they were HAM. (these are the ones I will try to minimize)
- 1 e-mail was predicted as HAM but it was SPAM.

In [None]:
# Positive-class probabilities on the test split (column 1 of predict_proba).
y_test_pred_prob = pipeline.predict_proba(X_test)[:, 1]

# Precision/recall pairs over all thresholds; the thresholds are not needed.
precision, recall = precision_recall_curve(y_test, y_test_pred_prob)[:2]
pr_auc = auc(recall, precision)

# pr_fig is kept so the figure can be logged to MLflow later on.
pr_fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall, precision, color='b', label=f'PR AUC = {pr_auc:.2f}')
ax.set(xlabel='Recall', ylabel='Precision', title='Precision-Recall Curve')
ax.grid(True)
ax.legend(loc="best")
plt.show()


Notes: 

The area under the curve is 0.98 so the model strikes a strong balance between precision and recall across thresholds. The model identifies spam effectively without producing excessive false positives.

In [None]:
import numpy as np
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

In [None]:
# ROC AUC for both splits, followed by the test ROC curve.
# Both scores are computed from positive-class probabilities: ROC AUC is a
# ranking metric, so feeding it hard 0/1 predictions for one split and
# probabilities for the other makes the two numbers not comparable.
y_train_pred_prob = pipeline.predict_proba(X_train)[:, 1]
roc_auc_train = roc_auc_score(y_train, y_train_pred_prob)

roc_auc_test = roc_auc_score(y_test, y_test_pred_prob)

print(f"ROC AUC en entrenamiento: {roc_auc_train:.2f}")
print(f"ROC AUC en prueba: {roc_auc_test:.2f}")

fpr, tpr, thresholds = roc_curve(y_test, y_test_pred_prob)

plt.figure(figsize=(8, 6))
# The legend shows the test AUC computed above (the curve is the test curve).
plt.plot(fpr, tpr, label=f"ROC curve (AUC = {roc_auc_test:.2f})")
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # diagonal = random classifier
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend(loc="lower right")
plt.grid()
plt.show()

Notes: 

## MLFlow configuration

In [None]:
from experiments_utils import experiment_status
import mlflow

In [None]:
from experiments_utils import experiment_status
import mlflow

In [None]:
# Run on the console before executing this cell, so the tracking server is up:
#   mlflow server --backend-store-uri sqlite:///backend.db --default-artifact-root ./artifacts
# NOTE(review): this cell is repeated verbatim many times in this notebook; a single copy is enough.


mlflow.set_tracking_uri("http://127.0.0.1:5000")
# mlflow.search_experiments()

In [None]:
# Call the project helper imported from experiments_utils; as the last expression
# of the cell, whatever it returns is shown as the cell output.
# NOTE(review): this call is repeated verbatim in many cells of this notebook.
experiment_status()

In [None]:
# from mlflow.tracking import MlflowClient
# client = MlflowClient()
# client.delete_run("81b0d40ddc814076a95bc6fd9d4fff34")

In [None]:
from experiments_utils import experiment_status
import mlflow

In [None]:
# Run on the console before executing this cell, so the tracking server is up:
#   mlflow server --backend-store-uri sqlite:///backend.db --default-artifact-root ./artifacts
# NOTE(review): this cell is repeated verbatim many times in this notebook; a single copy is enough.


mlflow.set_tracking_uri("http://127.0.0.1:5000")
# mlflow.search_experiments()

In [None]:
# Call the project helper imported from experiments_utils; as the last expression
# of the cell, whatever it returns is shown as the cell output.
# NOTE(review): this call is repeated verbatim in many cells of this notebook.
experiment_status()

In [None]:
# from mlflow.tracking import MlflowClient
# client = MlflowClient()
# client.delete_run("81b0d40ddc814076a95bc6fd9d4fff34")

In [None]:
from experiments_utils import experiment_status
import mlflow

In [None]:
# Run on the console before executing this cell, so the tracking server is up:
#   mlflow server --backend-store-uri sqlite:///backend.db --default-artifact-root ./artifacts
# NOTE(review): this cell is repeated verbatim many times in this notebook; a single copy is enough.


mlflow.set_tracking_uri("http://127.0.0.1:5000")
# mlflow.search_experiments()

In [None]:
# Call the project helper imported from experiments_utils; as the last expression
# of the cell, whatever it returns is shown as the cell output.
# NOTE(review): this call is repeated verbatim in many cells of this notebook.
experiment_status()

In [None]:
# from mlflow.tracking import MlflowClient
# client = MlflowClient()
# client.delete_run("81b0d40ddc814076a95bc6fd9d4fff34")

In [None]:
from experiments_utils import experiment_status
import mlflow

In [None]:
# Run on the console before executing this cell, so the tracking server is up:
#   mlflow server --backend-store-uri sqlite:///backend.db --default-artifact-root ./artifacts
# NOTE(review): this cell is repeated verbatim many times in this notebook; a single copy is enough.


mlflow.set_tracking_uri("http://127.0.0.1:5000")
# mlflow.search_experiments()

In [None]:
# Call the project helper imported from experiments_utils; as the last expression
# of the cell, whatever it returns is shown as the cell output.
# NOTE(review): this call is repeated verbatim in many cells of this notebook.
experiment_status()

In [None]:
# from mlflow.tracking import MlflowClient
# client = MlflowClient()
# client.delete_run("81b0d40ddc814076a95bc6fd9d4fff34")

In [None]:
from experiments_utils import experiment_status
import mlflow

In [None]:
# Run on the console before executing this cell, so the tracking server is up:
#   mlflow server --backend-store-uri sqlite:///backend.db --default-artifact-root ./artifacts
# NOTE(review): this cell is repeated verbatim many times in this notebook; a single copy is enough.


mlflow.set_tracking_uri("http://127.0.0.1:5000")
# mlflow.search_experiments()

In [None]:
# Call the project helper imported from experiments_utils; as the last expression
# of the cell, whatever it returns is shown as the cell output.
# NOTE(review): this call is repeated verbatim in many cells of this notebook.
experiment_status()

In [None]:
# from mlflow.tracking import MlflowClient
# client = MlflowClient()
# client.delete_run("81b0d40ddc814076a95bc6fd9d4fff34")

In [None]:
from experiments_utils import experiment_status
import mlflow

In [None]:
# Run on the console before executing this cell, so the tracking server is up:
#   mlflow server --backend-store-uri sqlite:///backend.db --default-artifact-root ./artifacts
# NOTE(review): this cell is repeated verbatim many times in this notebook; a single copy is enough.


mlflow.set_tracking_uri("http://127.0.0.1:5000")
# mlflow.search_experiments()

In [None]:
# Call the project helper imported from experiments_utils; as the last expression
# of the cell, whatever it returns is shown as the cell output.
# NOTE(review): this call is repeated verbatim in many cells of this notebook.
experiment_status()

In [None]:
# from mlflow.tracking import MlflowClient
# client = MlflowClient()
# client.delete_run("81b0d40ddc814076a95bc6fd9d4fff34")

In [None]:
from experiments_utils import experiment_status
import mlflow

In [None]:
# Run on the console before executing this cell, so the tracking server is up:
#   mlflow server --backend-store-uri sqlite:///backend.db --default-artifact-root ./artifacts
# NOTE(review): this cell is repeated verbatim many times in this notebook; a single copy is enough.


mlflow.set_tracking_uri("http://127.0.0.1:5000")
# mlflow.search_experiments()

In [None]:
# Call the project helper imported from experiments_utils; as the last expression
# of the cell, whatever it returns is shown as the cell output.
# NOTE(review): this call is repeated verbatim in many cells of this notebook.
experiment_status()

In [None]:
# from mlflow.tracking import MlflowClient
# client = MlflowClient()
# client.delete_run("81b0d40ddc814076a95bc6fd9d4fff34")

In [None]:
from experiments_utils import experiment_status
import mlflow

In [None]:
# Run on the console before executing this cell, so the tracking server is up:
#   mlflow server --backend-store-uri sqlite:///backend.db --default-artifact-root ./artifacts
# NOTE(review): this cell is repeated verbatim many times in this notebook; a single copy is enough.


mlflow.set_tracking_uri("http://127.0.0.1:5000")
# mlflow.search_experiments()

In [None]:
# Call the project helper imported from experiments_utils; as the last expression
# of the cell, whatever it returns is shown as the cell output.
# NOTE(review): this call is repeated verbatim in many cells of this notebook.
experiment_status()

In [None]:
# from mlflow.tracking import MlflowClient
# client = MlflowClient()
# client.delete_run("81b0d40ddc814076a95bc6fd9d4fff34")

In [None]:
from experiments_utils import experiment_status
import mlflow

In [None]:
# Run on the console before executing this cell, so the tracking server is up:
#   mlflow server --backend-store-uri sqlite:///backend.db --default-artifact-root ./artifacts
# NOTE(review): this cell is repeated verbatim many times in this notebook; a single copy is enough.


mlflow.set_tracking_uri("http://127.0.0.1:5000")
# mlflow.search_experiments()

In [None]:
# Call the project helper imported from experiments_utils; as the last expression
# of the cell, whatever it returns is shown as the cell output.
# NOTE(review): this call is repeated verbatim in many cells of this notebook.
experiment_status()

In [None]:
# from mlflow.tracking import MlflowClient
# client = MlflowClient()
# client.delete_run("81b0d40ddc814076a95bc6fd9d4fff34")

In [None]:
from experiments_utils import experiment_status
import mlflow

In [None]:
# Run on the console before executing this cell, so the tracking server is up:
#   mlflow server --backend-store-uri sqlite:///backend.db --default-artifact-root ./artifacts
# NOTE(review): this cell is repeated verbatim many times in this notebook; a single copy is enough.


mlflow.set_tracking_uri("http://127.0.0.1:5000")
# mlflow.search_experiments()

In [None]:
# Call the project helper imported from experiments_utils; as the last expression
# of the cell, whatever it returns is shown as the cell output.
# NOTE(review): this call is repeated verbatim in many cells of this notebook.
experiment_status()

In [None]:
# from mlflow.tracking import MlflowClient
# client = MlflowClient()
# client.delete_run("81b0d40ddc814076a95bc6fd9d4fff34")

In [None]:
from experiments_utils import experiment_status
import mlflow

In [None]:
# Run on the console before executing this cell, so the tracking server is up:
#   mlflow server --backend-store-uri sqlite:///backend.db --default-artifact-root ./artifacts
# NOTE(review): this cell is repeated verbatim many times in this notebook; a single copy is enough.


mlflow.set_tracking_uri("http://127.0.0.1:5000")
# mlflow.search_experiments()

In [None]:
# Call the project helper imported from experiments_utils; as the last expression
# of the cell, whatever it returns is shown as the cell output.
# NOTE(review): this call is repeated verbatim in many cells of this notebook.
experiment_status()

In [None]:
# from mlflow.tracking import MlflowClient
# client = MlflowClient()
# client.delete_run("81b0d40ddc814076a95bc6fd9d4fff34")

In [None]:
from experiments_utils import experiment_status
import mlflow

In [None]:
# Run on the console before executing this cell, so the tracking server is up:
#   mlflow server --backend-store-uri sqlite:///backend.db --default-artifact-root ./artifacts
# NOTE(review): this cell is repeated verbatim many times in this notebook; a single copy is enough.


mlflow.set_tracking_uri("http://127.0.0.1:5000")
# mlflow.search_experiments()

In [None]:
# Call the project helper imported from experiments_utils; as the last expression
# of the cell, whatever it returns is shown as the cell output.
# NOTE(review): this call is repeated verbatim in many cells of this notebook.
experiment_status()

In [None]:
# from mlflow.tracking import MlflowClient
# client = MlflowClient()
# client.delete_run("81b0d40ddc814076a95bc6fd9d4fff34")

In [None]:
from experiments_utils import experiment_status
import mlflow

In [None]:
# Run on the console before executing this cell, so the tracking server is up:
#   mlflow server --backend-store-uri sqlite:///backend.db --default-artifact-root ./artifacts
# NOTE(review): this cell is repeated verbatim many times in this notebook; a single copy is enough.


mlflow.set_tracking_uri("http://127.0.0.1:5000")
# mlflow.search_experiments()

In [None]:
# Call the project helper imported from experiments_utils; as the last expression
# of the cell, whatever it returns is shown as the cell output.
# NOTE(review): this call is repeated verbatim in many cells of this notebook.
experiment_status()

In [None]:
# from mlflow.tracking import MlflowClient
# client = MlflowClient()
# client.delete_run("81b0d40ddc814076a95bc6fd9d4fff34")

In [None]:
from experiments_utils import experiment_status
import mlflow

In [None]:
# Run on the console before executing this cell, so the tracking server is up:
#   mlflow server --backend-store-uri sqlite:///backend.db --default-artifact-root ./artifacts
# NOTE(review): this cell is repeated verbatim many times in this notebook; a single copy is enough.


mlflow.set_tracking_uri("http://127.0.0.1:5000")
# mlflow.search_experiments()

In [None]:
# Call the project helper imported from experiments_utils; as the last expression
# of the cell, whatever it returns is shown as the cell output.
# NOTE(review): this call is repeated verbatim in many cells of this notebook.
experiment_status()

In [None]:
# from mlflow.tracking import MlflowClient
# client = MlflowClient()
# client.delete_run("81b0d40ddc814076a95bc6fd9d4fff34")

In [None]:
from experiments_utils import experiment_status
import mlflow

In [None]:
# Run on the console before executing this cell, so the tracking server is up:
#   mlflow server --backend-store-uri sqlite:///backend.db --default-artifact-root ./artifacts
# NOTE(review): this cell is repeated verbatim many times in this notebook; a single copy is enough.


mlflow.set_tracking_uri("http://127.0.0.1:5000")
# mlflow.search_experiments()

In [None]:
# Call the project helper imported from experiments_utils; as the last expression
# of the cell, whatever it returns is shown as the cell output.
# NOTE(review): this call is repeated verbatim in many cells of this notebook.
experiment_status()

In [None]:
# from mlflow.tracking import MlflowClient
# client = MlflowClient()
# client.delete_run("81b0d40ddc814076a95bc6fd9d4fff34")

In [None]:
# Run on the console before executing this cell, so the tracking server is up:
#   mlflow server --backend-store-uri sqlite:///backend.db --default-artifact-root ./artifacts
# NOTE(review): this cell is repeated verbatim many times in this notebook; a single copy is enough.


mlflow.set_tracking_uri("http://127.0.0.1:5000")
# mlflow.search_experiments()

In [None]:
# Call the project helper imported from experiments_utils; as the last expression
# of the cell, whatever it returns is shown as the cell output.
# NOTE(review): this call is repeated verbatim in many cells of this notebook.
experiment_status()

In [None]:
# from mlflow.tracking import MlflowClient
# client = MlflowClient()
# client.delete_run("81b0d40ddc814076a95bc6fd9d4fff34")

In [None]:
from experiments_utils import experiment_status
import mlflow

In [None]:
# Run on the console before executing this cell, so the tracking server is up:
#   mlflow server --backend-store-uri sqlite:///backend.db --default-artifact-root ./artifacts
# NOTE(review): this cell is repeated verbatim many times in this notebook; a single copy is enough.


mlflow.set_tracking_uri("http://127.0.0.1:5000")
# mlflow.search_experiments()

In [None]:
# Call the project helper imported from experiments_utils; as the last expression
# of the cell, whatever it returns is shown as the cell output.
# NOTE(review): this call is repeated verbatim in many cells of this notebook.
experiment_status()

In [None]:
# from mlflow.tracking import MlflowClient
# client = MlflowClient()
# client.delete_run("81b0d40ddc814076a95bc6fd9d4fff34")

## MLFlow config


In [None]:
from experiments_utils import experiment_status
import mlflow

import os
# Point the MLflow client at the tracking server on localhost port 5000,
# then call the project helper from experiments_utils (its result is the cell output).
mlflow.set_tracking_uri("http://127.0.0.1:5000")
experiment_status()

In [None]:
# on the console
# mlflow server --backend-store-uri sqlite:///backend.db --default-artifact-root ./artifacts
# mlflow.search_experiments()

# from mlflow.tracking import MlflowClient
# client = MlflowClient()
# client.delete_run("81b0d40ddc814076a95bc6fd9d4fff34")

In [None]:
# experiment_status() is unpacked into exactly three values; only the first one,
# used as the experiment name, is needed here.
experiment_name, _, _ = experiment_status()
# Select that experiment for the runs started further down.
mlflow.set_experiment(experiment_name)

## MlFlow tracking

In [None]:
   
# Record the experiment in MLflow: tags, input datasets, parameters, test
# metrics, figures, the fitted pipeline and (when available) this notebook.
with mlflow.start_run(run_name="baseline-model", log_system_metrics=True) as run:

    # Tags
    mlflow.set_tag("dataset", "Spam detection")
    mlflow.set_tag("task", "classification")
    mlflow.set_tag("vectorizer", "CountVectorizer")
    mlflow.set_tag("algorithm", "Multinomial Naive Bayes")
    mlflow.set_tag("framework", "Scikit-learn")
    mlflow.set_tag("language", "Python")
    mlflow.set_tag("environment", "Local")
    mlflow.set_tag("dataset_version", "1.0.0")
    mlflow.set_tag("preprocessing_version", "1.0.0")
    mlflow.set_tag("model_version", "0.0.1")
    mlflow.set_tag("developer", "Mldu")
    mlflow.set_tag("project_stage", "testing")

    # Datasets: logged both as MLflow dataset inputs and as raw CSV artifacts
    mlflow.log_input(mlflow.data.from_pandas(train, name="train dataset", targets="target"))
    mlflow.log_input(mlflow.data.from_pandas(test, name="test dataset", targets="target"))
    mlflow.log_input(mlflow.data.from_pandas(val, name="validation dataset", targets="target"))
    mlflow.log_artifact("../../data/gold/train.csv")
    mlflow.log_artifact("../../data/gold/test.csv")
    mlflow.log_artifact("../../data/gold/validation.csv")

    # BOW and model
    mlflow.log_param("vectorizer_type", "CountVectorizer")
    mlflow.log_param("model_type", "MultinomialNB")

    # Metrics
    # No earlier cell assigns test_accuracy or balanced_accuracy, so they are
    # computed here from the test predictions instead of raising NameError.
    cr = classification_report(y_test, y_test_pred, output_dict=True)
    test_accuracy = cr.pop("accuracy")  # popped so only per-class/avg dicts remain in cr
    balanced_accuracy = balanced_accuracy_score(y_test, y_test_pred)
    mlflow.log_metric("test_accuracy", test_accuracy)
    mlflow.log_metric("balanced_accuracy", balanced_accuracy)
    mlflow.log_metric("f0_5_score", f0_5_score)
    mlflow.log_metric("accuracy", test_accuracy)
    # Remaining entries are dicts of metrics keyed by class label or average name.
    for class_or_avg, metrics_dict in cr.items():
        for metric, value in metrics_dict.items():
            mlflow.log_metric(class_or_avg + '_' + metric, value)

    # Figures
    mlflow.log_figure(matrix_fig, "confusion-matrix.png")
    mlflow.log_figure(pr_fig, "precision-recall-curve.png")

    # CountVectorizer and MNB
    mlflow.sklearn.log_model(pipeline, "pipeline", signature=signature)

    # Notebook's name as tag and save as artifact.
    # '__vsc_ipynb_file__' is only present in some environments (the name
    # suggests VS Code), so its absence is handled instead of raising KeyError.
    notebook_path = globals().get('__vsc_ipynb_file__')
    if notebook_path:
        notebook_name = os.path.basename(notebook_path)
        mlflow.set_tag("source_notebook", f"{notebook_name}")
        # Log via the full path so this does not depend on the working directory.
        mlflow.log_artifact(notebook_path, artifact_path="notebooks")
    else:
        print("Notebook path not available; the notebook was not logged as an artifact.")

    print(f"Pipeline logged to MLflow under run ID {run.info.run_id}")

Conclusions:

- Clear overfitting 
- The model is not predicting at random, since its accuracy is higher than the share of the majority class (the accuracy obtained by always predicting HAM).