# In [None]:  (notebook cell marker left over from the export; not Python code)
# =========================
#  SPAM DETECTION SYSTEM
#  Using Statistical (TF-IDF + Naive Bayes)
#  and Embedding-Based (Pretrained Word2Vec + Neural Network) Models
# =========================

# =========================================
# STEP 1: SETUP AND PREPARATION
# =========================================

# ----- 1.1: Import Libraries -----
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import gensim
import gensim.downloader as api
import tensorflow as tf

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (
    confusion_matrix, 
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score
)

# For reproducibility: seed the NumPy and TensorFlow global random generators.
np.random.seed(42)
tf.random.set_seed(42)

# ----- 1.2: NLTK Downloads (if not already done) -----
# 'punkt_tab' is the tokenizer resource that word_tokenize() looks up on
# recent NLTK releases (3.8.2+); 'punkt' is kept for older releases.
# 'omw-1.4' is requested as well because some NLTK releases need it when the
# WordNet corpus is first loaded by the lemmatizer.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

# ----- 1.3: Load the SMS Spam Collection Dataset -----
# Make sure 'spam.csv' is in the working directory or update the path accordingly.
df = pd.read_csv('spam.csv', encoding='latin-1')

# Fail early, with a clear message, if the raw columns are not the expected ones.
expected_cols = {'v1', 'v2'}
if not expected_cols.issubset(df.columns):
    raise ValueError(
        "Expected columns 'v1' and 'v2' not found in the CSV. "
        "Please verify the file structure or rename accordingly."
    )

# Keep only the relevant columns under clearer names. The explicit .copy()
# makes df an independent frame, so the label assignment below does not
# trigger pandas' SettingWithCopyWarning.
df = df[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'text'}).copy()

# Convert labels (ham/spam) to binary (0, 1). Series.map() turns any value
# missing from the mapping into NaN, so unknown labels are rejected here
# instead of silently leaking NaN targets into the split and the models.
label_map = {'ham': 0, 'spam': 1}
mapped_labels = df['label'].map(label_map)
unmapped_mask = mapped_labels.isna()
if unmapped_mask.any():
    unknown_labels = sorted(df.loc[unmapped_mask, 'label'].astype(str).unique())
    raise ValueError(
        f"Unexpected label value(s) in column 'v1': {unknown_labels}. "
        "Expected only 'ham' or 'spam'."
    )
df['label'] = mapped_labels.astype('int64')

# Quick checks
print("Dataset shape:", df.shape)
print("Class distribution:\n", df['label'].value_counts())

# =========================================
# STEP 2: DATA PREPROCESSING
# =========================================

# ----- 2.1: Preprocessing Tools -----
# Module-level lemmatizer instance and English stop-word set (a set, so that
# membership tests are cheap); both are used by clean_text().
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

# Compiled once: matches every character that is neither a-z nor whitespace.
_NON_ALPHA_RE = re.compile(r'[^a-z\s]')


def clean_text(text: str) -> list:
    """
    Cleans the input text by:
      1. Lowercasing
      2. Replacing non-alphabetical characters with a space
      3. Tokenizing
      4. Removing stop words
      5. Lemmatizing words

    Args:
      text: Raw message text. Anything that is not a ``str`` (for example a
        NaN produced by a missing CSV cell) is treated as an empty message.

    Returns:
      A list of cleaned tokens (empty when nothing survives the cleaning).
    """
    # Guard against missing/non-text cells instead of crashing on .lower().
    if not isinstance(text, str):
        return []

    # Substitute a space (not the empty string) so that words separated only
    # by punctuation or digits, e.g. "urgent!call", are not fused together.
    # If you want to keep digits or certain punctuations, adjust the regex above.
    text = _NON_ALPHA_RE.sub(' ', text.lower())

    tokens = word_tokenize(text)
    return [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

# Tokenize/clean every message; each row of 'clean_tokens' holds a token list.
df['clean_tokens'] = df['text'].map(clean_text)

# ----- 2.2: Train-Test Split -----
# Features are the token lists, targets the binary labels.
X, y = df['clean_tokens'], df['label']

# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print("\nTraining set size:", len(X_train))
print("Test set size:", len(X_test))

# =========================================
# STEP 3: BASELINE MODEL (TF-IDF + NAIVE BAYES)
# =========================================

# ----- 3.1: TF-IDF Vectorization -----
# Glue each token list back into one space-separated string for the vectorizer.
X_train_text = X_train.apply(' '.join)
X_test_text = X_test.apply(' '.join)

# Adjust TF-IDF params as desired (e.g., max_features, ngram_range, etc.)
# The vocabulary/IDF weights are learned on the training text only and then
# re-used, unchanged, to transform the test text.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=3000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_text)
X_test_tfidf = tfidf_vectorizer.transform(X_test_text)

# ----- 3.2: Train and Evaluate Naive Bayes -----
# alpha=1.0 is the default smoothing; it is spelled out to make it explicit.
nb_model = MultinomialNB(alpha=1.0)
y_pred_nb = nb_model.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

# Score the test-set predictions with each metric, in reporting order.
acc_nb, prec_nb, rec_nb, f1_nb = (
    score_fn(y_test, y_pred_nb)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\n=== Baseline Model (Naive Bayes + TF-IDF) ===")
print(f"Accuracy : {acc_nb:.4f}")
print(f"Precision: {prec_nb:.4f}")
print(f"Recall   : {rec_nb:.4f}")
print(f"F1 Score : {f1_nb:.4f}")

cm_nb = confusion_matrix(y_test, y_pred_nb)

# Annotated heatmap of the confusion matrix.
nb_fig, nb_ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm_nb, annot=True, fmt='d', cmap='Blues', ax=nb_ax)
nb_ax.set_title("Naive Bayes (TF-IDF) Confusion Matrix")
nb_ax.set_xlabel("Predicted")
nb_ax.set_ylabel("Actual")
plt.show()

# =========================================
# STEP 4: EMBEDDING-BASED MODEL (Pretrained Word2Vec + Neural Network)
# =========================================

# ----- 4.1: Load Pretrained Word2Vec -----
# This can be very large (~1.5GB). Ensure enough RAM/disk space.
print("\nLoading pretrained 'word2vec-google-news-300' model...")
pretrained_w2v = api.load('word2vec-google-news-300')
print("Pretrained model loaded successfully.")

# Embedding dimensionality: read it straight off the loaded object when it
# exposes 'vector_size' (a KeyedVectors instance in Gensim 4+); otherwise go
# through its '.wv' attribute as a fallback for certain versions.
if hasattr(pretrained_w2v, 'vector_size'):
    vector_dim = pretrained_w2v.vector_size
else:
    vector_dim = pretrained_w2v.wv.vector_size

def embed_text(tokens: list, model: "gensim.models.KeyedVectors") -> np.ndarray:
    """
    Return the average embedding vector of a tokenized text.

    Args:
      tokens: List of word tokens.
      model: Pretrained KeyedVectors-style object; it must provide
        ``key_to_index`` (Gensim 4.0+), ``vector_size`` and ``model[word]``
        lookup. The annotation is quoted so it is not evaluated when the
        function is defined.

    Returns:
      A 1-D array of length ``model.vector_size``: the mean of the vectors of
      all tokens found in the model's vocabulary, or a zero vector when no
      token is known (including an empty token list).
    """
    # Out-of-vocabulary tokens are skipped rather than raising KeyError.
    known_vectors = [model[word] for word in tokens if word in model.key_to_index]

    if not known_vectors:
        # Size the fallback from the model that was passed in (not from a
        # module-level global). float32 is gensim's default storage dtype for
        # word vectors; matching it avoids upcasting the whole stacked matrix
        # to float64 because of a few empty rows.
        return np.zeros(model.vector_size, dtype=np.float32)

    return np.mean(known_vectors, axis=0)

# Embed every document of the train split, then of the test split; each
# result is an array with one averaged-embedding row per document.
X_train_embed, X_test_embed = (
    np.array([embed_text(doc_tokens, pretrained_w2v) for doc_tokens in split])
    for split in (X_train, X_test)
)

print("\nTrain Embeddings shape:", X_train_embed.shape)
print("Test Embeddings shape :", X_test_embed.shape)

# ----- 4.2: Build, Compile, and Train Neural Network -----
# Feed-forward classifier over the averaged embedding: two ReLU hidden layers,
# each followed by dropout, ending in a single sigmoid output unit.
keras_layers = tf.keras.layers
nn_stack = [
    keras_layers.Input(shape=(vector_dim,)),
    keras_layers.Dense(128, activation='relu'),
    keras_layers.Dropout(0.5),  # Higher dropout for limited data
    keras_layers.Dense(64, activation='relu'),
    keras_layers.Dropout(0.3),
    keras_layers.Dense(1, activation='sigmoid'),
]
model_nn = tf.keras.Sequential(nn_stack)

model_nn.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model_nn.summary()

# Stop once validation loss has gone 3 epochs without improving, and restore
# the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    patience=3,
    monitor='val_loss',
)

# validation_split=0.2 holds out a fifth of the training rows for validation.
fit_options = {
    'validation_split': 0.2,
    'epochs': 20,
    'batch_size': 32,
    'verbose': 1,
    'callbacks': [early_stopping],
}
history = model_nn.fit(X_train_embed, y_train, **fit_options)

# ----- 4.3: Evaluate Neural Network -----
# Flatten the model output to 1-D, then label as spam (1) every message whose
# predicted probability is strictly above 0.5.
y_pred_proba_nn = model_nn.predict(X_test_embed).reshape(-1)
y_pred_nn = np.where(y_pred_proba_nn > 0.5, 1, 0)

# Score the test-set predictions with each metric, in reporting order.
acc_nn, prec_nn, rec_nn, f1_nn = (
    score_fn(y_test, y_pred_nn)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\n=== Embedding-Based Model (Pretrained Word2Vec + NN) ===")
print(f"Accuracy : {acc_nn:.4f}")
print(f"Precision: {prec_nn:.4f}")
print(f"Recall   : {rec_nn:.4f}")
print(f"F1 Score : {f1_nn:.4f}")

cm_nn = confusion_matrix(y_test, y_pred_nn)

# Annotated heatmap of the confusion matrix.
nn_fig, nn_ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm_nn, annot=True, fmt='d', cmap='Greens', ax=nn_ax)
nn_ax.set_title("Neural Network (Pretrained Word2Vec) Confusion Matrix")
nn_ax.set_xlabel("Predicted")
nn_ax.set_ylabel("Actual")
plt.show()

# =========================================
# STEP 5: RESULTS AND ANALYSIS
# =========================================

# Side-by-side accuracy comparison of the two models.
labels = ['Naive Bayes (TF-IDF)', 'NN (Pretrained W2V)']
accuracies = [acc_nb, acc_nn]

# Keep the zoomed-in 0.8 floor when both models clear it, but lower the floor
# when a model scores below that, so its bar is not cut off by the axis.
y_floor = min(0.8, max(0.0, min(accuracies) - 0.05))

plt.figure(figsize=(6,4))
plt.bar(labels, accuracies, color=['blue', 'green'])
plt.title("Model Comparison - Accuracy")
plt.ylabel("Accuracy")
# Headroom above 1.0 so the value labels of near-perfect scores stay inside
# the axes instead of running into the title.
plt.ylim(y_floor, 1.05)
for i, v in enumerate(accuracies):
    # Four decimals (as in the printed metrics) so close scores stay distinguishable.
    plt.text(i, v + 0.01, f"{v:.4f}", ha='center', fontweight='bold')
plt.show()

# Plot training history for the neural network:
# loss curves on the left panel, accuracy curves on the right.
history_fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(10, 4))
epoch_log = history.history  # per-epoch metric values recorded by fit()

loss_ax.plot(epoch_log['loss'], label='Train Loss')
loss_ax.plot(epoch_log['val_loss'], label='Val Loss')
loss_ax.set_title("NN Loss Curve")
loss_ax.set_xlabel("Epoch")
loss_ax.set_ylabel("Loss")
loss_ax.legend()

acc_ax.plot(epoch_log['accuracy'], label='Train Accuracy')
acc_ax.plot(epoch_log['val_accuracy'], label='Val Accuracy')
acc_ax.set_title("NN Accuracy Curve")
acc_ax.set_xlabel("Epoch")
acc_ax.set_ylabel("Accuracy")
acc_ax.legend()

history_fig.tight_layout()
plt.show()

print("\n=== Final Results Summary ===")

# One row per model: heading followed by accuracy, precision, recall and F1.
summary_rows = (
    ("\nNaive Bayes (TF-IDF):", acc_nb, prec_nb, rec_nb, f1_nb),
    ("\nNeural Network (Pretrained Word2Vec):", acc_nn, prec_nn, rec_nn, f1_nn),
)
for heading, acc_val, prec_val, rec_val, f1_val in summary_rows:
    print(heading)
    print(f"  Accuracy : {acc_val:.4f}")
    print(f"  Precision: {prec_val:.4f}")
    print(f"  Recall   : {rec_val:.4f}")
    print(f"  F1 Score : {f1_val:.4f}")

# =========================================
# OPTIONAL: Threshold Tuning Example
# (COMMENTED OUT BY DEFAULT)
# =========================================
# from sklearn.metrics import precision_recall_curve
# import numpy as np
#
# thresholds = np.linspace(0,1,50)
# prec_vals, rec_vals = [], []
# for t in thresholds:
#     y_thresh = (y_pred_proba_nn > t).astype(int)
#     p = precision_score(y_test, y_thresh)
#     r = recall_score(y_test, y_thresh)
#     prec_vals.append(p)
#     rec_vals.append(r)
#
# plt.figure(figsize=(6,4))
# plt.plot(thresholds, prec_vals, label='Precision')
# plt.plot(thresholds, rec_vals, label='Recall')
# plt.title("NN Precision/Recall vs. Threshold")
# plt.xlabel("Threshold")
# plt.ylabel("Score")
# plt.legend()
# plt.show()
#
# # You can pick an optimal threshold based on the trade-off 
# # or compute the F1 at different thresholds to choose the best.


