In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve
import re
import string
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


In [None]:
# Download the NLTK 'stopwords' corpus; stopwords.words('english') is read
# from it later in the notebook when the stopword set is built.
nltk.download('stopwords')

In [None]:
# Load Dataset
# Read the raw spam/ham email CSV (path relative to the working directory)
# into `df`, then display it as the cell output.
df = pd.read_csv("spam_ham_dataset.csv")
df

In [None]:
# Remove the unnecessary 'Unnamed: 0' column.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError once the column is gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df

In [None]:
# Replace Windows line breaks with a single space.
# Replacing them with '' would glue the last word of one line to the first word
# of the next (e.g. "report\r\nplease" -> "reportplease"), producing bogus tokens
# for the stemmer and the TF-IDF vocabulary.
df['text'] = df['text'].str.replace("\r\n", " ", regex=False)
df

In [None]:
# Report the (rows, columns) shape of the cleaned DataFrame.
print("Dataset shape:", df.shape)

In [None]:
# List the column names that remain in the DataFrame.
print("Column names:", df.columns.tolist())

In [None]:
# Preview the first rows of the DataFrame.
df.head()

In [None]:
# Preview the last rows of the DataFrame.
df.tail()

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() emitted a stray "Data types: None" after the summary.
print("Data types:")
df.info()

In [None]:
# Check the unique values in the numeric target column.
# (Attribute access df.label_num is equivalent to df['label_num'].)
print(df.label_num.unique())

In [None]:
# Check the target class (label) distribution as absolute counts.
print(df['label'].value_counts())

In [None]:
# Label distribution as percentages: normalize=True yields fractions,
# which are scaled by 100 here.
print(df.label.value_counts(normalize=True)*100)

In [None]:
# Visualising the target class distribution
labels = ['Ham', 'Spam']
custom_palette = ['#1f77b4', 'green']

# Counts sorted by label value so they line up with `labels` and `custom_palette`.
label_counts = df['label_num'].value_counts().sort_index()

# Plot from a copy whose 'label_num' is categorical (needed for a clean hue
# mapping). Converting the column on `df` itself would permanently change the
# dataset's dtype as a side effect of drawing a chart.
plot_df = df.assign(label_num=df['label_num'].astype('category'))

# Create subplots
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# --- Bar Chart (LEFT) ---
sns.countplot(
    x='label_num',
    hue='label_num',  # Explicitly set hue so the palette maps one colour per class
    data=plot_df,
    palette=custom_palette,
    ax=axes[0],
    legend=False
)
axes[0].set_xlabel("Target")
axes[0].set_ylabel("Count")
axes[0].set_title("Spam Email Class Distribution")
# The automatic legend is disabled above; build one manually with readable names.
handles = [plt.Rectangle((0, 0), 1, 1, color=color) for color in custom_palette]
axes[0].legend(handles, labels)

# --- Pie Chart (RIGHT) ---
axes[1].pie(label_counts, labels=labels, autopct='%1.1f%%', colors=custom_palette, startangle=90)
axes[1].set_title("Class Proportion")

plt.tight_layout()
plt.show()

In [None]:
# Inspect the cleaned text of the second row (positional index 1).
df.text.iloc[1]

In [None]:
# Shared preprocessing helpers: a Porter stemmer and the English stopword list
# materialised as a set (used for membership tests in the preprocessing loop).
stemmer = PorterStemmer()
stopwords_set = set(stopwords.words('english'))

In [None]:
# Preprocessing the text
# Each message is lowercased, stripped of punctuation, split on whitespace,
# filtered against the stopword set, stemmed, and re-joined with single spaces.
# NOTE(review): punctuation is removed before the stopword check, so a stopword
# that contains an apostrophe would no longer match the set -- confirm intended.
punct_table = str.maketrans("", "", string.punctuation)

corpus = []
for message in df["text"]:
    tokens = message.lower().translate(punct_table).split()
    stems = [stemmer.stem(token) for token in tokens if token not in stopwords_set]
    corpus.append(" ".join(stems))

In [None]:
# Check the text in the first row of `df`. This shows the DataFrame column,
# not the preprocessed `corpus` list built in the previous cell.
df.text.iloc[0]


In [None]:
# Encode Label
# Binary target: ham -> 0, spam -> 1.
label_to_int = {'ham': 0, 'spam': 1}
y = df['label'].map(label_to_int)
y


In [None]:
# Step 1: Split text before vectorisation
# 80/20 split, stratified on the label, with a fixed seed.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
corpus_train, corpus_test, y_train, y_test = train_test_split(corpus, y, **split_options)

In [None]:
# Number of messages in the training split.
len(corpus_train)

In [None]:
# Number of messages in the test split.
len(corpus_test)

In [None]:
# Step 2: Initialize and fit TfidfVectorizer on training text only
# The vectorizer runs with default settings. Options that are currently disabled:
#   max_df=0.9          -> ignore very frequent words
#   min_df=5            -> ignore rare words appearing in <5 docs
#   ngram_range=(1, 2)  -> use unigrams and bigrams
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(corpus_train)  # Fit + transform on training
X_test = vectorizer.transform(corpus_test)        # Transform only on test (reuses the training vocabulary)


In [None]:
# Training matrix shape: rows = messages, columns = vocabulary terms.
X_train.shape


In [None]:
# Test matrix shape; produced by the same fitted vectorizer as X_train.
X_test.shape

In [None]:
# Optional: Convert to dense arrays (if needed by your model)
# X_train = X_train.toarray()
# X_test = X_test.toarray()


# Baseline Model

In [None]:
# Most Frequent
# Baseline that always predicts the majority class seen in training.
from sklearn.dummy import DummyClassifier

baseline_mf = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)
y_pred_mf = baseline_mf.predict(X_test)
acc_mf = accuracy_score(y_test, y_pred_mf)
report_mf = classification_report(y_test, y_pred_mf, zero_division=0)

print("Baseline Model - Most Frequent")
print("Accuracy:", acc_mf)
print("Classification Report:\n", report_mf)

In [None]:
# Stratified
# Baseline using DummyClassifier's "stratified" strategy, seeded for repeatability.
baseline_strat = DummyClassifier(strategy="stratified", random_state=42).fit(X_train, y_train)
y_pred_strat = baseline_strat.predict(X_test)
acc_strat = accuracy_score(y_test, y_pred_strat)
report_strat = classification_report(y_test, y_pred_strat, zero_division=0)

print("Baseline Model - Stratified")
print("Accuracy:", acc_strat)
print("Classification Report:\n", report_strat)

In [None]:
# Uniform
# Baseline using DummyClassifier's "uniform" strategy, seeded for repeatability.
baseline_uniform = DummyClassifier(strategy="uniform", random_state=42).fit(X_train, y_train)
y_pred_uniform = baseline_uniform.predict(X_test)
acc_uniform = accuracy_score(y_test, y_pred_uniform)
report_uniform = classification_report(y_test, y_pred_uniform, zero_division=0)

print("Baseline Model - Uniform")
print("Accuracy:", acc_uniform)
print("Classification Report:\n", report_uniform)

# Build and Train Multiple Models


In [None]:
# Candidate classifiers, keyed by display name.
# Every estimator with a stochastic component is seeded (random_state=42, the
# same seed as the split and the baselines) so that accuracy / AUC figures are
# reproducible across kernel restarts. class_weight='balanced' is applied only
# where it is written out explicitly below.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight='balanced'),
    "Random Forest": RandomForestClassifier(class_weight='balanced', random_state=42),
    "Decision Tree": DecisionTreeClassifier(class_weight='balanced', random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    # probability=True enables predict_proba (needed for ROC AUC below).
    "Support Vector Machine": SVC(probability=True, random_state=42),
    "Naive Bayes": MultinomialNB(),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
}

# Rows of (model name, accuracy, ROC AUC). Baselines go first and carry no AUC.
results = [
    ("Baseline (Most Frequent)", acc_mf, None),
    ("Baseline (Stratified)", acc_strat, None),
    ("Baseline (Uniform)", acc_uniform, None),
]

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Only score AUC for estimators that expose class probabilities. Checking the
    # attribute up front avoids masking an unrelated AttributeError raised while
    # computing the score.
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]  # probability of class 1 (spam)
        auc = roc_auc_score(y_test, y_proba)
    else:
        y_proba = None
        auc = None

    acc = accuracy_score(y_test, y_pred)
    print(f"\n{name}")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Accuracy:", acc)
    print("ROC AUC Score:", auc)
    results.append((name, acc, auc))


In [None]:
# Tabulate the (model, accuracy, ROC AUC) rows collected in `results` and print
# them ranked by ROC AUC, highest first. Baseline rows were recorded with
# ROC AUC = None.
results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "ROC AUC"])
print("\nModel Comparison:")
print(results_df.sort_values(by="ROC AUC", ascending=False))


In [None]:
# Cross Validation
# 5-fold stratified CV (shuffled, seeded) on the training matrix; prints the
# mean and standard deviation of fold accuracy for every candidate model.
from sklearn.model_selection import cross_val_score, StratifiedKFold

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_options = {"cv": cv, "scoring": 'accuracy'}

print("Cross-Validation Results:")
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, **cv_options)
    print(f"{name}: {scores.mean():.4f} ± {scores.std():.4f}")

In [None]:
# Display the cross validation result in a tabular format
# Dedicated names (cv_results / cv_results_df) are used so this cell does not
# overwrite the `results` list and `results_df` table of the model-comparison
# step, which have a different schema; clobbering them made re-running the
# comparison cell silently produce a table full of NaN.
cv_results = []
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
    cv_results.append({
        'Model': name,
        'Mean Accuracy': scores.mean(),
        'Std Dev': scores.std()
    })

# Build the table, best mean accuracy first, rounded to 4 decimals for display.
cv_results_df = (
    pd.DataFrame(cv_results)
    .sort_values(by='Mean Accuracy', ascending=False)
    .round(4)
)

print("Cross-Validation Results:")
display(cv_results_df)

In [None]:
# Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV

# `gamma` is only used by the RBF kernel, so the grid is split per kernel.
# A single flat grid would fit every linear-kernel candidate once per gamma
# value, repeating identical fits (16 candidates instead of 12).
param_grid = [
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10, 100],        # Regularization parameter
    },
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10, 100],        # Regularization parameter
        'gamma': ['scale', 'auto'],    # RBF kernel coefficient
    },
]

# probability=True is kept so the refit best estimator exposes predict_proba;
# random_state fixes the internal shuffling that setting relies on.
svm = SVC(probability=True, random_state=42)

grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy', verbose=2)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)


In [None]:
# Evaluate best svm on test set
# best_estimator_ is the estimator GridSearchCV selected; here it is scored on
# the held-out test matrix.
best_svm = grid_search.best_estimator_
y_svm_pred = best_svm.predict(X_test)
print("Test Accuracy (Best svm):", accuracy_score(y_test, y_svm_pred))
print("Classification Report (Best svm):\n", classification_report(y_test, y_svm_pred))

In [None]:
# Save the best model
import pickle

# Persist the tuned SVM chosen by the grid search. models["Support Vector
# Machine"] is the untuned default-parameter SVC, so pickling it would discard
# the hyperparameter search results.
best_model = grid_search.best_estimator_
with open("spam_model_svm.pkl", "wb") as f:
    pickle.dump(best_model, f)

# Save the fitted TfidfVectorizer used during training; the same vocabulary and
# IDF weights are needed to transform new messages before prediction.
with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)