The objective of this notebook is to perform oversampling using different techniques to balance our dataset

## Datasets

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import sys
from mlflow.models import infer_signature
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', stream=sys.stdout)
import warnings
warnings.filterwarnings('ignore')

train = pd.read_csv("../../data/gold/train.csv")
test = pd.read_csv("../../data/gold/test.csv")
val = pd.read_csv("../../data/gold/validation.csv")

X_train = train['features']
y_train = train['target']
X_test = test['features']
y_test = test['target']


signature = infer_signature(X_train, y_train)

## Oversampling 

In [3]:
y_train.value_counts()


target
0    3606
1     517
Name: count, dtype: int64

In [4]:
y_train.value_counts(normalize=True)

target
0    0.874606
1    0.125394
Name: proportion, dtype: float64

In [5]:
oversample_factors = [1.25, 1.5, 2]
oversample_sizes = [int(y_train[1] * factor) for factor in oversample_factors]

for factor, size in zip(oversample_factors, oversample_sizes):
    print(f"Size of oversampling with {factor}x: {size}")


Size of oversampling with 1.25x: 0
Size of oversampling with 1.5x: 0
Size of oversampling with 2x: 0


## nlpAug techniques

In [8]:
import sys  
import os
sys.path.append(os.path.abspath("../utils"))

from textaug_techniques import TextAugmentation

In [10]:
OVERSAMPLE_FACTORS = [1.25, 1.5, 2.0] 

In [11]:


augmenter = TextAugmentation(qw=1, aa=0, cwea=0, sa=0, bta=0, wea=0)
all_datasets = {}


for factor in OVERSAMPLE_FACTORS:
    print(f"Running augmentation for factor {factor}x...")
    datasets = augmenter.augment(X_train, y_train, factor)
    if datasets:
        all_datasets.update(datasets)  # Combinar resultados

# Mostrar los resultados
for label, (aug_X, aug_y) in all_datasets.items():
    print(f"Dataset '{label}':")
    print(aug_X)
    print(aug_y)


Running augmentation for factor 1.25x...
2024-12-29 19:11:31,055 - INFO - Dataset '1.25x' created with 4252 samples.
Running augmentation for factor 1.5x...
2024-12-29 19:11:31,056 - INFO - Dataset '1.5x' created with 4381 samples.
Running augmentation for factor 2.0x...
2024-12-29 19:11:31,058 - INFO - Dataset '2.0x' created with 4640 samples.
Dataset '1.25x':
0      hey next sun number there basic yoga course bu...
1                    dhoni luck win big titleso winemoji
2                                     really hows master
3                               see cup coffee animation
4                           pain couldnt come worse time
                             ...                        
124    qw_augmented_dear voucher holder number claim ...
125               qw_augmented_filthy story girl waiting
126    qw_augmented_freemsg today day ready im horny ...
127    qw_augmented_please call number immediately ur...
128    qw_augmented_urgent please call number landlin...
Name: fea

## MLFlow config

In [9]:
# on the console
# mlflow server --backend-store-uri sqlite:///backend.db --default-artifact-root ./artifacts
# mlflow.search_experiments()

# from mlflow.tracking import MlflowClient
# client = MlflowClient()
# client.delete_run("81b0d40ddc814076a95bc6fd9d4fff34")

## Pipeline

In [10]:
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    fbeta_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    precision_recall_curve,
    auc,
    balanced_accuracy_score,
    roc_curve
)


pipeline = Pipeline([
        ('vectorizer', CountVectorizer(ngram_range=(1, 1), max_features=None)),
        ('classifier', MultinomialNB())
    ])
    
pipeline.fit(X_train, y_train)
y_train_pred = pipeline.predict(X_train)
y_test_pred = pipeline.predict(X_test)




## Metrics

In [11]:
train_report = classification_report(y_train, y_train_pred, target_names = ['ham', 'spam'], digits=3)
print("Classification Report (Train Data):")
print(train_report)

test_report = classification_report(y_test, y_test_pred, target_names = ['ham', 'spam'], digits=3)
print("Classification Report (Test Data):")
print(test_report)

Classification Report (Train Data):
              precision    recall  f1-score   support

         ham      0.995     0.995     0.995      3606
        spam      0.967     0.967     0.967       517

    accuracy                          0.992      4123
   macro avg      0.981     0.981     0.981      4123
weighted avg      0.992     0.992     0.992      4123

Classification Report (Test Data):
              precision    recall  f1-score   support

         ham      0.993     0.982     0.988       453
        spam      0.882     0.952     0.916        63

    accuracy                          0.979       516
   macro avg      0.938     0.967     0.952       516
weighted avg      0.980     0.979     0.979       516



In [12]:
# def aug_KeyboardAug(X_train, y_train, oversample_factor):
#     augmenter = nac.KeyboardAug()  
    
#     positive_samples = X_train[y_train == 1]

#     augmented_texts = []

#     # Calcular cuántos ejemplos de spam quieres generar
#     num_augmentations = int(len(positive_samples) * oversample_factor) - len(positive_samples)

#     # Generar los ejemplos aumentados
#     while len(augmented_texts) < num_augmentations:
#         for message in positive_samples:
#             augmented_text = augmenter.augment(message)
#             augmented_texts.append(augmented_text)
#             if len(augmented_texts) >= num_augmentations:
#                 break

#     augmented_series = pd.Series(augmented_texts, name=X_train.name)

#     # Crear una Series con las etiquetas correspondientes
#     augmented_labels = pd.Series([1] * len(augmented_series), index=augmented_series.index)

#     # Combinar los datos augmentados con el conjunto original
#     X_train_aug = pd.concat([X_train, augmented_series])
#     y_train_aug = pd.concat([y_train, augmented_labels])

#     return X_train_aug, y_train_aug


# # Lista de factores de oversampling que deseas probar
# oversample_factors = [1.25, 1.5, 1.75, 2]

# # Suponiendo que X_train y y_train ya están definidos en tu código

# # Crear un diccionario para almacenar los resultados de oversampling para cada factor
# oversampled_data = {}

# # Ejecutar la función aug_KeyboardAug para cada factor
# for factor in oversample_factors:
#     print(f"Ejecutando oversampling con factor {factor}x...")
#     X_train_aug, y_train_aug = aug_KeyboardAug(X_train, y_train, factor)
    
#     # Almacenar los datos aumentados en el diccionario
#     oversampled_data[factor] = (X_train_aug, y_train_aug)
    
#     # Imprimir el tamaño de las clases después del oversampling
#     print(f"Tamaño de clase ham después del oversampling: {sum(y_train_aug == 0)}")
#     print(f"Tamaño de clase spam después del oversampling: {sum(y_train_aug == 1)}")
#     print("---")

In [13]:
aug = naw.SynonymAug(aug_src='wordnet',aug_max=20)

NameError: name 'naw' is not defined