The objective of this notebook is to try different oversampling strategies to prepare the data before vectorizing with BOW and modelling with Multinomial Naive Bayes 

# MLFlow configuration

In [9]:
import mlflow
from mlflow.exceptions import RestException

In [10]:
# on the console
# mlflow server --backend-store-uri sqlite:///backend.db --default-artifact-root ./artifacts


mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.search_experiments()

[<Experiment: artifact_location='/home/maldu/dscience/projects/spam_detector/notebooks/experimentation/artifacts/1', creation_time=1733326029460, experiment_id='1', last_update_time=1733326029460, lifecycle_stage='active', name='spam-classifier', tags={'mlflow.note.content': 'This experiment contains mlruns for different '
                         'approaches in the ml lifecycle of an e-mail spam '
                         'detector classifier.',
  'project_name': 'spam-classifier',
  'project_quarter': 'Q4-2024',
  'project_stage': 'testing',
  'team': 'ml-team'}>,
 <Experiment: artifact_location='/home/maldu/dscience/projects/spam_detector/notebooks/experimentation/artifacts/0', creation_time=1733326012211, experiment_id='0', last_update_time=1733326012211, lifecycle_stage='active', name='Default', tags={}>]

# Datasets

In [11]:
import pandas as pd
from mlflow.models import infer_signature


train = pd.read_csv("../../data/gold/train.csv")
test = pd.read_csv("../../data/gold/test.csv")

X_train = train['features']
y_train = train['target']
X_test = test['features']
y_test = test['target']
signature = infer_signature(X_train, y_train)



# Synonym augmentation

In [12]:
import numpy as np
import os
import nltk
import re

from nltk import word_tokenize
from nltk.corpus import stopwords

stoplist = stopwords.words('english')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from tqdm import tqdm
tqdm.pandas()

import warnings
warnings.filterwarnings("ignore")

In [13]:
import requests
import json

url = "http://paraphrase.org/api/en/search/"


def get_synonyms(word):
    results = []
    querystring = {"batchNumber":"0","needsPOSList":"true","q":word}
    headers = {
        'cache-control': "no-cache",
        'postman-token': "2d3d31e7-b571-f4ae-d69b-8116ff64d752"
    }

    response = requests.request("GET", url, headers=headers, params=querystring)
    response_js = response.json()
    
    res_count = response_js['hits']['found']
    if res_count > 0:
        res_count = min(3, res_count )
        hits = response_js['hits']['hit'][:res_count]
        results = [ hit['target'] for hit in hits]
    return results

In [14]:
get_synonyms('so sick')  

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import (
    fbeta_score,
    accuracy_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    precision_recall_curve,
    auc,
    balanced_accuracy_score,
)


pipeline = Pipeline([
        ('vectorizer', CountVectorizer(ngram_range=(1, 1), max_features=2000)),
        ('classifier', MultinomialNB())
    ])
    
pipeline.fit(X_train, y_train)
y_test_pred = pipeline.predict(X_test)

# RamdomOverSampler

In [None]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# Crear instancia de RandomOverSampler
ros = RandomOverSampler(sampling_strategy='minority', random_state=42)

# Aplicar oversampling a los datos de entrenamiento
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Ver la nueva distribución de clases
print(f"Distribución después del RandomOverSampler: {Counter(y_resampled)}")

# ADASYN

In [None]:
from imblearn.over_sampling import ADASYN
from collections import Counter

# Crear instancia de ADASYN
adasyn = ADASYN(sampling_strategy='minority', random_state=42)

# Aplicar oversampling a los datos de entrenamiento
X_resampled, y_resampled = adasyn.fit_resample(X_train, y_train)

# Ver la nueva distribución de clases
print(f"Distribución después de ADASYN: {Counter(y_resampled)}")


4. SMOTE-ENN (SMOTE + Edited Nearest Neighbors)

In [None]:
from imblearn.combine import SMOTEENN
from collections import Counter

# Crear instancia de SMOTEENN
smoteenn = SMOTEENN(sampling_strategy='minority', random_state=42)

# Aplicar oversampling a los datos de entrenamiento
X_resampled, y_resampled = smoteenn.fit_resample(X_train, y_train)

# Ver la nueva distribución de clases
print(f"Distribución después de SMOTEENN: {Counter(y_resampled)}")
