In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import LSTM, Dense, Embedding, GlobalMaxPooling1D, Input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from .utils._logger import logger
from .utils._validation import config_args

In [None]:
# Read the dataset from the configured CSV path.
from pandas import DataFrame

try:
    df: DataFrame = pd.read_csv(
        filepath_or_buffer=config_args.data_path,
        encoding="latin-1",
    )
except Exception as err:
    # Log a more specific message for a missing file, then let the original
    # exception propagate in either case so the notebook stops here.
    if isinstance(err, FileNotFoundError):
        logger.error(f"File not found: {config_args.data_path}")
    else:
        logger.error(f"Error reading CSV: {err}")
    raise

In [None]:
# Data preprocess: encode "ham"/"spam" as 0/1 in a new "Labels" column, drop
# rows containing missing values, then remove the original "Category" column.
label_codes = {"ham": 0, "spam": 1}
df = (
    df.assign(Labels=df["Category"].map(label_codes))
    .dropna()
    .drop(columns="Category")
)

In [None]:
# Split the data into training and testing sets
from typing import Any

from pandas import Series

# The annotations are quoted on purpose: module-level annotations are evaluated
# when the cell runs, and pandas' Series is only generic in the type stubs, so
# an unquoted Series[str] can raise TypeError at runtime. Quoting keeps the
# information for type checkers without evaluating the subscript.
X: "Series[str]" = df["Message"].astype(str)  # message text, forced to str
y: "Series[Any]" = df["Labels"]  # labels produced by the preprocessing cell

# Hold out 20% of the rows for testing; the fixed random_state makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

logger.info(
    f"Data split into training set: {len(X_train)} and testing set: {len(X_test)}"
)

In [None]:
# Sequence length used for padding and for the model's input shape: the token
# count of the longest training message. Deliberately NOT X_test.shape[0] --
# that is the number of test rows, which ties the input length to the size of
# the split instead of to the text and inflates the number of LSTM timesteps.
# Whitespace splitting only approximates the tokenizer's own splitting;
# pad_sequences truncates any sequence that ends up longer than this.
max_length = int(X_train.str.split().str.len().max())

In [None]:
# Set up the tokenizer with the configured vocabulary limit and an explicit
# "<OOV>" token for out-of-vocabulary words.
tokenizer_options = {"num_words": config_args.max_voc, "oov_token": "<OOV>"}
tokenizer = Tokenizer(**tokenizer_options)

# Build the word index from the training messages only.
tokenizer.fit_on_texts(X_train)

In [None]:
# Replace the raw texts of both splits with their integer token sequences.
X_train, X_test = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [None]:
# Bring every sequence in both splits to the same length (max_length).
X_train, X_test = (
    pad_sequences(sequences, maxlen=max_length) for sequences in (X_train, X_test)
)

In [None]:
# Grab the word -> index mapping learned by the tokenizer and record how many
# entries it holds; num_words is used later to size the embedding layer.
word_index = tokenizer.word_index
num_words: int = len(word_index)
vocabulary_message = f"Vocabulary size: {num_words}"
logger.info(vocabulary_message)

In [None]:
# Build the LSTM model: embedding -> LSTM -> max-pool over time -> sigmoid unit
i = Input(shape=(max_length,))

# Embedding input size is num_words + 1: the extra slot is for the padding token.
embedding_layer = Embedding(num_words + 1, config_args.embedding_dim)
# return_sequences=True keeps the output of every timestep so it can be pooled.
lstm_layer = LSTM(config_args.units, return_sequences=True)
pooling_layer = GlobalMaxPooling1D()
output_layer = Dense(1, activation="sigmoid")

x = output_layer(pooling_layer(lstm_layer(embedding_layer(i))))

model = Model(i, x)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
logger.info("Model compiled.")

In [None]:
# Fit the model on the training split, evaluating on the held-out split after
# each epoch; the returned history object is kept in `r` for plotting.
fit_options = {
    "epochs": config_args.epochs,
    "batch_size": config_args.batch_size,
    "validation_data": (X_test, y_test),
    "verbose": 1,
}
r = model.fit(X_train, y_train, **fit_options)
logger.info("Model training completed.")

In [None]:
# Plot the per-epoch accuracy and loss curves (training and validation) on a
# single figure.
history_curves = (
    ("accuracy", "Accuracy"),
    ("val_accuracy", "Validation Accuracy"),
    ("loss", "Loss"),
    ("val_loss", "Validation Loss"),
)

plt.figure(figsize=(12, 6))
for history_key, curve_label in history_curves:
    plt.plot(r.history[history_key], label=curve_label)
plt.xlabel("Epoch")
plt.ylabel("Value")
plt.title("Training and Validation History")
plt.legend()
plt.show()