The objective of this notebook is to perform oversampling using different techniques to balance our dataset

In [1]:
import pandas as pd 
import warnings
warnings.filterwarnings('ignore')


In [2]:
df = pd.read_csv("../data/silver/df_cleantext_v0.csv")
df

Unnamed: 0,Category,Message
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in number a wkly comp to win fa cup...
3,0,u dun say so early hor u c already then say
4,0,nah i dont think he goes to usf he lives aroun...
...,...,...
5152,1,this is the numbernd time we have tried number...
5153,0,will you b going to esplanade fr home
5154,0,pity was in mood for that soany other suggestions
5155,0,the guy did some bitching but i acted like id ...


In [3]:
from sklearn.model_selection import train_test_split

X = df['Message']
y = df['Category']  

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

X

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in number a wkly comp to win fa cup...
3             u dun say so early hor u c already then say
4       nah i dont think he goes to usf he lives aroun...
                              ...                        
5152    this is the numbernd time we have tried number...
5153                will you b going to esplanade fr home
5154    pity was in mood for that soany other suggestions
5155    the guy did some bitching but i acted like id ...
5156                            rofl its true to its name
Name: Message, Length: 5157, dtype: object

## Tokenization

In [4]:
# from nltk.tokenize import word_tokenize

# df_tokenized = df.copy()
# df_tokenized['Message'] = df_tokenized['Message'].apply(word_tokenize)
# df_tokenized['Message']

## Oversampling 

In [5]:
rows_count = y_train.value_counts()
rows_count

Category
0    3620
1     505
Name: count, dtype: int64

In [6]:
oversample_factors = [1.25, 1.5, 2]
oversample_sizes = [int(rows_count[1] * factor) for factor in oversample_factors]

for factor, size in zip(oversample_factors, oversample_sizes):
    print(f"Size of oversampling with {factor}x: {size}")


Size of oversampling with 1.25x: 631
Size of oversampling with 1.5x: 757
Size of oversampling with 2x: 1010


## nlpAug techniques

In [16]:
import nlpaug.augmenter.char as nac
from textaug_techniques import TextAugmenter


In [17]:
augmenter = TextAugmenter(qw=1, aa=1, cwea=1)
datasets = augmenter.augment(X_train, y_train)

for label, (X, y) in datasets.items():
    print(f"Dataset '{label}' - X shape: {X.shape}, y shape: {y.shape}")



2024-12-13 16:20:40,201 - INFO - Dataset '1.5x' created with 4377 samples.
2024-12-13 16:20:40,203 - INFO - Dataset '2.0x' created with 4630 samples.
2024-12-13 16:20:40,205 - INFO - Dataset '3.0x' created with 5135 samples.


Dataset '1.5x' - X shape: (4377,), y shape: (4377,)
Dataset '2.0x' - X shape: (4630,), y shape: (4630,)
Dataset '3.0x' - X shape: (5135,), y shape: (5135,)


## MLFlow config

In [18]:
# on the console
# mlflow server --backend-store-uri sqlite:///backend.db --default-artifact-root ./artifacts
# mlflow.search_experiments()

# from mlflow.tracking import MlflowClient
# client = MlflowClient()
# client.delete_run("81b0d40ddc814076a95bc6fd9d4fff34")

NameError: name 'y_train_aug' is not defined

## Pipeline

In [None]:
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    fbeta_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    precision_recall_curve,
    auc,
    balanced_accuracy_score,
    roc_curve
)


pipeline = Pipeline([
        ('vectorizer', CountVectorizer(ngram_range=(1, 1), max_features=None)),
        ('classifier', MultinomialNB())
    ])
    
pipeline.fit(X_train, y_train)
y_train_pred = pipeline.predict(X_train)
y_test_pred = pipeline.predict(X_test)




## Metrics

In [None]:
train_report = classification_report(y_train, y_train_pred, target_names = ['ham', 'spam'], digits=3)
print("Classification Report (Train Data):")
print(train_report)

test_report = classification_report(y_test, y_test_pred, target_names = ['ham', 'spam'], digits=3)
print("Classification Report (Test Data):")
print(test_report)

In [None]:
# def aug_KeyboardAug(X_train, y_train, oversample_factor):
#     augmenter = nac.KeyboardAug()  
    
#     positive_samples = X_train[y_train == 1]

#     augmented_texts = []

#     # Calcular cuántos ejemplos de spam quieres generar
#     num_augmentations = int(len(positive_samples) * oversample_factor) - len(positive_samples)

#     # Generar los ejemplos aumentados
#     while len(augmented_texts) < num_augmentations:
#         for message in positive_samples:
#             augmented_text = augmenter.augment(message)
#             augmented_texts.append(augmented_text)
#             if len(augmented_texts) >= num_augmentations:
#                 break

#     augmented_series = pd.Series(augmented_texts, name=X_train.name)

#     # Crear una Series con las etiquetas correspondientes
#     augmented_labels = pd.Series([1] * len(augmented_series), index=augmented_series.index)

#     # Combinar los datos augmentados con el conjunto original
#     X_train_aug = pd.concat([X_train, augmented_series])
#     y_train_aug = pd.concat([y_train, augmented_labels])

#     return X_train_aug, y_train_aug


# # Lista de factores de oversampling que deseas probar
# oversample_factors = [1.25, 1.5, 1.75, 2]

# # Suponiendo que X_train y y_train ya están definidos en tu código

# # Crear un diccionario para almacenar los resultados de oversampling para cada factor
# oversampled_data = {}

# # Ejecutar la función aug_KeyboardAug para cada factor
# for factor in oversample_factors:
#     print(f"Ejecutando oversampling con factor {factor}x...")
#     X_train_aug, y_train_aug = aug_KeyboardAug(X_train, y_train, factor)
    
#     # Almacenar los datos aumentados en el diccionario
#     oversampled_data[factor] = (X_train_aug, y_train_aug)
    
#     # Imprimir el tamaño de las clases después del oversampling
#     print(f"Tamaño de clase ham después del oversampling: {sum(y_train_aug == 0)}")
#     print(f"Tamaño de clase spam después del oversampling: {sum(y_train_aug == 1)}")
#     print("---")

In [None]:
aug = naw.SynonymAug(aug_src='wordnet',aug_max=20)