In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Dense, GlobalMaxPooling1D, Conv1D, MaxPooling1D, LSTM, Embedding
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [None]:
import yaml
import logging
from datetime import datetime

# YAML config
# The path is relative to the notebook's working directory; a bare relative
# name resolves identically on Windows, Linux and macOS (the previous
# ".\config.yaml" form only worked on Windows).
with open("config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# safe_load returns None for an empty file (or a scalar/list for unexpected
# content); fail here with a clear message instead of a confusing TypeError
# on the first config[...] lookup further down.
if not isinstance(config, dict):
    raise ValueError(
        f"config.yaml must contain a top-level mapping, got {type(config).__name__}"
    )

# Logger
# Every run writes to a fresh, timestamped log file inside config["log_dir"].
log_dir = config["log_dir"]
if log_dir:
    # Create the directory up front; otherwise basicConfig fails when opening the file.
    os.makedirs(log_dir, exist_ok=True)

# Build the timestamp outside the f-string: nesting double quotes inside a
# double-quoted f-string is a SyntaxError on Python versions before 3.12.
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(name)s - %(funcName)s - %(message)s",
    # os.path.join works whether or not log_dir ends with a path separator.
    filename=os.path.join(log_dir, f"{timestamp}.log"),
    filemode="w",
    # Without force=True, basicConfig does nothing if the root logger already
    # has handlers, e.g. when this cell is re-run in the same kernel.
    force=True,
)
logger = logging.getLogger(__name__)

logger.info("Config file and logger setup completed.")

In [None]:
# Load the data
# Reads the CSV named by config["data_path"] into `df`. Failures are logged
# and then re-raised so the notebook stops at the cell that actually failed.
try:
    # Bind the path to a local name so the error handler below can report it
    # (the handler previously referenced an undefined `data_path` and raised
    # NameError instead of logging the missing file).
    data_path = config["data_path"]
    df = pd.read_csv(data_path, encoding="latin-1")
except FileNotFoundError:
    logger.error(f"File not found: {data_path}")
    raise
except Exception as e:
    logger.error(f"Error reading CSV: {e}")
    raise

In [None]:
# Data preprocess
# Encode the text category as a numeric label, discard every row that has a
# missing value in any column (this includes rows whose category is neither
# "ham" nor "spam", since the mapping yields NaN for them), and finally drop
# the original text category column.
label_codes = {"ham": 0, "spam": 1}
df = (
    df.assign(Labels=df["Category"].map(label_codes))
    .dropna()
    .drop(columns="Category")
)

In [None]:
# Split the data into training and testing sets
# Features are the message texts (coerced to str); targets are the numeric labels.
X, y = df["Message"].astype(str), df["Labels"]

# 80/20 split with a fixed random_state so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

split_summary = f"Data split into training set: {len(X_train)} and testing set: {len(X_test)}"
logger.info(split_summary)

In [None]:
# Padded sequence length = token count of the longest TRAINING message.
# The previous value, X_test.shape[0], was the number of rows in the test
# split: unrelated to message length, and derived from held-out data.
# text_to_word_sequence applies the same default lowercasing, filtering and
# splitting as the Tokenizer created below; because that Tokenizer has an OOV
# token, every word yields exactly one index, so these counts equal the
# lengths of the tokenized sequences. Longer test messages are truncated by
# pad_sequences.
train_token_counts = [
    len(tf.keras.preprocessing.text.text_to_word_sequence(text)) for text in X_train
]
# Guard against an empty split or all-empty messages: keep the length >= 1.
max_length = max(1, max(train_token_counts, default=0))

In [None]:
# Initialize Tokenizer
# Vocabulary size is capped by config["max_voc"]; words outside the kept
# vocabulary are mapped to the "<OOV>" token.
tokenizer_options = {"num_words": config["max_voc"], "oov_token": "<OOV>"}
tokenizer = Tokenizer(**tokenizer_options)

# The vocabulary is learned from the training texts only.
tokenizer.fit_on_texts(X_train)

In [None]:
# Convert texts to sequences
# Both splits are encoded with the tokenizer fitted on the training texts;
# the results replace the raw-text variables of the same names.
X_train, X_test = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [None]:
# Add padding
# Bring every sequence in both splits to the same length, max_length.
X_train, X_test = (
    pad_sequences(sequences, maxlen=max_length) for sequences in (X_train, X_test)
)

In [None]:
# Create vocabulary mapping
# num_words counts every entry in the fitted tokenizer's word index.
num_words = len(tokenizer.word_index)
word_index = tokenizer.word_index

vocabulary_message = f"Vocabulary size: {num_words}"
logger.info(vocabulary_message)

In [None]:
# Build the LSTM model
# Pipeline: token ids -> embeddings -> LSTM (one output per timestep)
# -> max over time -> single sigmoid unit for the binary label.
inputs = Input(shape=(max_length,))

# Adding 1 for the padding token
embedded = Embedding(
    input_dim=num_words + 1,
    output_dim=config["embedding_dim"],
)(inputs)

# return_sequences=True keeps the per-timestep outputs for the pooling layer.
sequence_features = LSTM(units=config["units"], return_sequences=True)(embedded)
pooled = GlobalMaxPooling1D()(sequence_features)
outputs = Dense(units=1, activation="sigmoid")(pooled)

model = Model(inputs=inputs, outputs=outputs)
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
logger.info("Model compiled.")

In [None]:
# Train the model
# Epoch count and batch size come from the config; the test split is passed
# as validation data. The returned history object is kept in `r`.
fit_options = {
    "epochs": config["epochs"],
    "batch_size": config["batch_size"],
    "validation_data": (X_test, y_test),
    "verbose": 1,
}
r = model.fit(X_train, y_train, **fit_options)
logger.info("Model training completed.")

In [None]:
# Visualize training history
# (history key, legend label) for each curve, drawn on one shared axis.
history_curves = (
    ("accuracy", "Accuracy"),
    ("val_accuracy", "Validation Accuracy"),
    ("loss", "Loss"),
    ("val_loss", "Validation Loss"),
)

fig, ax = plt.subplots(figsize=(12, 6))
for metric_key, legend_label in history_curves:
    ax.plot(r.history[metric_key], label=legend_label)

ax.set_xlabel("Epoch")
ax.set_ylabel("Value")
ax.set_title("Training and Validation History")
ax.legend()
plt.show()