In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_fscore_support, roc_curve, auc

# Deep learning and text processing libraries
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Seed NumPy and TensorFlow with one shared value for reproducibility
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
# Part 1: Building and Optimizing a CNN for Binary Classification
print("Part 1: Building and Optimizing a CNN for Spam Classification")
print("\nStep 1: Loading and preprocessing the data...")

# Numeric encoding of the two classes (ham=0, spam=1)
label_codes = {'ham': 0, 'spam': 1}

# Read the CSV, give the two needed columns readable names, drop every other
# column, and replace the text labels with their numeric codes
df = (
    pd.read_csv('spam.csv', encoding='cp1252')
    .rename(columns={'v1': 'label', 'v2': 'message'})
    .loc[:, ['label', 'message']]
    .assign(label=lambda frame: frame['label'].map(label_codes))
)

# Dataset overview: dimensions and per-class counts
print(f"Dataset shape: {df.shape}")
print("\nClass distribution:")
print(df['label'].value_counts())

# With 0/1 labels the mean is the fraction of spam rows
spam_percentage = 100 * df['label'].mean()
print(f"Spam percentage: {spam_percentage:.2f}%")

# Preview the first five rows, truncating each message to 80 characters
print("\nDisplay a few examples:")
for i in range(5):
    is_spam = df.iloc[i]['label'] == 1
    label = "SPAM" if is_spam else "HAM"
    print(f"{label}: {df.iloc[i]['message'][:80]}...")

In [None]:
# Text preprocessing function
def clean_text(text):
    """Normalize a message for tokenization.

    The text is lower-cased, every character listed in ``string.punctuation``
    is removed, runs of digits are deleted, and each run of whitespace is
    collapsed to a single space with leading/trailing whitespace stripped.

    Args:
        text: The raw message as a ``str``.

    Returns:
        The normalized message as a ``str``.
    """
    # Deleting via a translation table removes the same characters as
    # filtering one character at a time against string.punctuation.
    without_punctuation = text.lower().translate(
        str.maketrans('', '', string.punctuation)
    )
    without_digits = re.sub(r'\d+', '', without_punctuation)
    return re.sub(r'\s+', ' ', without_digits).strip()

# Store a normalized version of every message next to the raw text
df['clean_message'] = df['message'].apply(clean_text)

# Hold out 20% of the rows for testing, stratifying on the label column
labels = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_message'],
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

print(f"\nTraining set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

In [None]:
# Tokenize the text
max_features = 5000  # Maximum number of words to keep
max_len = 100        # Maximum sequence length

# Build the vocabulary from the training split only
tokenizer = Tokenizer(num_words=max_features, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

# Vocabulary size is the number of indexed words plus one, capped at
# max_features (the +1 presumably accounts for the reserved padding index 0)
indexed_words = len(tokenizer.word_index) + 1
vocab_size = min(max_features, indexed_words)
print(f"Vocabulary size: {vocab_size}")

# Convert each split to lists of integer token ids
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)

# Pad or truncate every sequence at the end so all rows have length max_len
pad_options = dict(maxlen=max_len, padding='post', truncating='post')
X_train_pad = pad_sequences(X_train_seq, **pad_options)
X_test_pad = pad_sequences(X_test_seq, **pad_options)

print(f"Padded training data shape: {X_train_pad.shape}")
print(f"Padded testing data shape: {X_test_pad.shape}")

In [None]:
print("\nStep 2: Building the CNN model...")

# CNN model architecture
def create_cnn_model(vocab_size, embedding_dim=100, max_len=100, filters=128, kernel_size=5, dropout_rate=0.3):
    """Build and compile the 1D-CNN binary text classifier.

    Layer stack: Embedding -> Conv1D (ReLU) -> GlobalMaxPooling1D ->
    Dense(64, ReLU) -> Dropout -> Dense(32, ReLU) -> Dropout ->
    Dense(1, sigmoid).

    Args:
        vocab_size: ``input_dim`` of the embedding layer.
        embedding_dim: ``output_dim`` of the embedding layer.
        max_len: ``input_length`` of the embedding layer.
        filters: Number of convolution filters.
        kernel_size: Width of the convolution window.
        dropout_rate: Rate used by both dropout layers.

    Returns:
        The model, compiled with the Adam optimizer, binary cross-entropy
        loss and the accuracy metric.
    """
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len))
    model.add(Conv1D(filters=filters, kernel_size=kernel_size, activation='relu'))
    model.add(GlobalMaxPooling1D())

    # Two fully connected stages, each followed by dropout
    for units in (64, 32):
        model.add(Dense(units, activation='relu'))
        model.add(Dropout(dropout_rate))

    # A single sigmoid unit produces the classifier's output
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Create the model
embedding_dim = 100
cnn_model = create_cnn_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    max_len=max_len,
)

# Display model summary
cnn_model.summary()

In [None]:
print("\nStep 3: Training the model...")

# Both callbacks watch the validation loss
monitored_metric = 'val_loss'

# Stop after 3 epochs without improvement, with restore_best_weights enabled
early_stopping = EarlyStopping(monitor=monitored_metric, patience=3, restore_best_weights=True)

# Save the model to disk, keeping only the best one (save_best_only)
model_checkpoint = ModelCheckpoint('best_spam_cnn_model.h5', monitor=monitored_metric, save_best_only=True)

# Train the model, holding out 20% of the training data for validation
fit_options = dict(
    epochs=15,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stopping, model_checkpoint],
    verbose=1,
)
history = cnn_model.fit(X_train_pad, y_train, **fit_options)

In [None]:
print("\nStep 4: Evaluating the model...")

# Loss and accuracy on the held-out test set
test_loss, test_accuracy = cnn_model.evaluate(X_test_pad, y_test, verbose=1)
print(f"Test accuracy: {test_accuracy:.4f}")

# Predicted probabilities, then hard 0/1 labels using a 0.5 threshold
y_pred_proba = cnn_model.predict(X_test_pad)
above_threshold = y_pred_proba > 0.5
y_pred = above_threshold.astype(int).flatten()

# Precision, recall and F1 for the positive class, plus the confusion matrix
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
cm = confusion_matrix(y_test, y_pred)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(cm)

In [None]:
# Part 2: Debugging Model Failures
print("\nPart 2: Debugging Model Issues")

# Plot training history: accuracy on the left panel, loss on the right
history_fig, (accuracy_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 5))

accuracy_ax.plot(history.history['accuracy'], label='Training Accuracy')
accuracy_ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
accuracy_ax.set_title('Model Accuracy')
accuracy_ax.set_xlabel('Epoch')
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.legend()

loss_ax.plot(history.history['loss'], label='Training Loss')
loss_ax.plot(history.history['val_loss'], label='Validation Loss')
loss_ax.set_title('Model Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

history_fig.tight_layout()
history_fig.savefig('training_history.png')
print("Training history plot saved as 'training_history.png'")

In [None]:
# Check for overfitting
# Compare the accuracies recorded for the last training epoch; a gap larger
# than 0.05 between training and validation accuracy is treated as overfitting.
train_acc = history.history['accuracy'][-1]
val_acc = history.history['val_accuracy'][-1]
accuracy_diff = train_acc - val_acc

print("\nChecking for overfitting:")
print(f"Final training accuracy: {train_acc:.4f}")
print(f"Final validation accuracy: {val_acc:.4f}")
print(f"Difference: {accuracy_diff:.4f}")

if accuracy_diff > 0.05:
    print("Potential overfitting detected")

    print("\nApplying solutions to overfitting:")
    from tensorflow.keras.regularizers import l2

    def create_regularized_model(vocab_size, embedding_dim=100, max_len=100, dropout_rate=0.5):
        """Build and compile a CNN with L2 weight penalties.

        Layer stack: Embedding -> Conv1D (64 filters, kernel size 5, ReLU,
        L2 0.001) -> GlobalMaxPooling1D -> Dense(32, ReLU, L2 0.001) ->
        Dropout -> Dense(1, sigmoid).

        Args:
            vocab_size: ``input_dim`` of the embedding layer.
            embedding_dim: ``output_dim`` of the embedding layer.
            max_len: ``input_length`` of the embedding layer.
            dropout_rate: Rate of the single dropout layer.

        Returns:
            The model, compiled with the Adam optimizer, binary cross-entropy
            loss and the accuracy metric.
        """
        model = Sequential([
            Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len),
            Conv1D(filters=64, kernel_size=5, activation='relu', kernel_regularizer=l2(0.001)),
            GlobalMaxPooling1D(),
            Dense(32, activation='relu', kernel_regularizer=l2(0.001)),
            Dropout(dropout_rate),
            Dense(1, activation='sigmoid')
        ])

        model.compile(
            optimizer='adam',
            loss='binary_crossentropy',
            metrics=['accuracy']
        )
        return model

    # Create and train regularized model with the same fit settings as the
    # original model (only the early-stopping callback is passed here)
    regularized_model = create_regularized_model(vocab_size, embedding_dim, max_len)
    regularized_history = regularized_model.fit(
        X_train_pad, y_train,
        epochs=15,
        batch_size=64,
        validation_split=0.2,
        callbacks=[early_stopping],
        verbose=1
    )

    # Evaluate regularized model
    reg_test_loss, reg_test_accuracy = regularized_model.evaluate(X_test_pad, y_test, verbose=1)
    print(f"\nRegularized model test accuracy: {reg_test_accuracy:.4f}")

    y_pred_proba_reg = regularized_model.predict(X_test_pad)
    y_pred_reg = (y_pred_proba_reg > 0.5).astype(int).flatten()

    print("\nRegularized Model Classification Report:")
    print(classification_report(y_test, y_pred_reg))

    # Model comparison
    print("\nModel comparison:")
    print(f"Original model accuracy: {test_accuracy:.4f}")
    print(f"Regularized model accuracy: {reg_test_accuracy:.4f}")

    if reg_test_accuracy > test_accuracy:
        print("Using the regularized model for further analysis")
        best_model = regularized_model
        y_pred = y_pred_reg
        y_pred_proba = y_pred_proba_reg
        # Recompute the confusion matrix so it describes the selected model
        # rather than the original one; later cells plot `cm` directly.
        cm = confusion_matrix(y_test, y_pred)
    else:
        # y_pred, y_pred_proba and cm already belong to the original model
        print("Using the original model for further analysis")
        best_model = cnn_model
else:
    print("No significant overfitting detected")
    best_model = cnn_model

In [None]:
# Part 3: Evaluating Model Effectiveness
print("\nPart 3: Evaluating Model Effectiveness")

# Plot confusion matrix as an annotated heatmap (rows: actual, columns: predicted)
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Ham', 'Spam'],
    yticklabels=['Ham', 'Spam'],
    ax=cm_ax,
)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
cm_ax.set_title('Confusion Matrix')
cm_fig.savefig('confusion_matrix.png')
print("Confusion matrix plot saved as 'confusion_matrix.png'")

In [None]:
# Plot ROC curve
# Curve points and area under the curve from the predicted probabilities
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Dashed diagonal from (0, 0) to (1, 1) for reference
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve')
roc_ax.legend(loc='lower right')
roc_fig.savefig('roc_curve.png')
print("ROC curve plot saved as 'roc_curve.png'")

In [None]:
# Analyze misclassified examples
# Predict once with the selected model and reuse these probabilities for both
# the hard labels and the printed values, so the two can never come from
# different models (e.g. after an earlier cell has been re-run).
best_pred_proba = best_model.predict(X_test_pad)
y_pred_binary = best_pred_proba > 0.5
y_true = np.asarray(y_test)
misclassified_indices = np.where(y_pred_binary.flatten() != y_true)[0]

print(f"\nNumber of misclassified examples: {len(misclassified_indices)}")

if len(misclassified_indices) > 0:
    print("\nAnalyzing misclassified examples:")
    X_test_array = np.array(X_test)

    def classify_error(true_label, pred_proba):
        """Name the kind of error for an example known to be misclassified.

        Args:
            true_label: Ground-truth label (1 = spam, 0 = ham).
            pred_proba: Predicted spam probability for the example.

        Returns:
            "False Negative (Spam missed)" when a spam message was not
            predicted as spam, otherwise
            "False Positive (Ham marked as spam)".
        """
        # Use the same `> 0.5` rule that produced the hard labels so that a
        # probability of exactly 0.5 counts as a "ham" prediction here too.
        predicted_spam = pred_proba > 0.5
        if true_label == 1 and not predicted_spam:
            return "False Negative (Spam missed)"
        else:
            return "False Positive (Ham marked as spam)"

    # Show at most five misclassified messages (truncated to 100 characters)
    num_to_show = min(5, len(misclassified_indices))
    for i in range(num_to_show):
        idx = misclassified_indices[i]
        true_label = y_true[idx]
        pred_proba = best_pred_proba[idx][0]
        error_type = classify_error(true_label, pred_proba)

        print(f"\nExample {i+1} - {error_type}:")
        print(f"Message: {X_test_array[idx][:100]}...")
        print(f"True label: {'Spam' if true_label == 1 else 'Ham'}")
        print(f"Predicted probability: {pred_proba:.4f}")

In [None]:
# Prediction function
def predict_spam(message, model, tokenizer, max_len=100):
    """Classify a single message as spam or ham.

    The message is normalized with ``clean_text``, converted to a token-id
    sequence by ``tokenizer``, padded/truncated at the end to ``max_len``,
    and scored by ``model``.

    Args:
        message: Raw message text.
        model: Object whose ``predict`` returns a 2-D array of scores.
        tokenizer: Object providing ``texts_to_sequences``.
        max_len: Sequence length passed to ``pad_sequences``.

    Returns:
        A ``(label, score)`` tuple where ``label`` is ``'SPAM'`` when the
        score is greater than 0.5 and ``'HAM'`` otherwise.
    """
    token_ids = tokenizer.texts_to_sequences([clean_text(message)])
    model_input = pad_sequences(token_ids, maxlen=max_len, padding='post', truncating='post')

    # predict returns one row per input; take the single score of the only row
    prediction = model.predict(model_input)[0][0]

    if prediction > 0.5:
        verdict = 'SPAM'
    else:
        verdict = 'HAM'
    return (verdict, prediction)

# Test with example messages
example_messages = [
    "Congratulations! You've won a free iPhone. Click here to claim now!",
    "Hey, what time are we meeting for dinner tonight?",
    "URGENT: Your bank account has been compromised. Reply with your details immediately.",
    "Don't forget to pick up milk on your way home.",
]

# Score every sample with the selected model and report label + probability
print("\nTesting the spam prediction function:")
for message in example_messages:
    label, probability = predict_spam(
        message=message,
        model=best_model,
        tokenizer=tokenizer,
    )
    print(f"\nMessage: {message}")
    print(f"Prediction: {label} (Probability: {probability:.4f})")

print("\nSpam classification project complete!")