# Spam Email Detection: Comparative Analysis of Machine Learning Models and Feature Extraction Techniques

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer,WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import gensim
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from transformers import BertTokenizer, BertModel
import pickle
import os

In [17]:
df=pd.read_csv("data/Spam_Email_Data.csv")

In [18]:
print("Sample of the data: \n", df.head(1))
print(df.info())
print("Missing values in each column: \n", df.isna().sum())
print("Distribution of target classes: \n", df['target'].value_counts())

Sample of the data: 
                                                 text  target
0  From ilug-admin@linux.ie Mon Jul 29 11:28:02 2...       0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5796 entries, 0 to 5795
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5796 non-null   object
 1   target  5796 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 90.7+ KB
None
Missing values in each column: 
 text      0
target    0
dtype: int64
Distribution of target classes: 
 target
0    3900
1    1896
Name: count, dtype: int64


# Data Preprocessing
Here, we clean our text data using various NLP techniques like stemming and removing stop words.

In [19]:
def text_preprocessing(text):
    text = re.sub(r'\S+@\S+', '', text)  # Remove email addresses
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    tokens = word_tokenize(text.lower())  # Tokenize and convert to lowercase
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]  # Remove stopwords
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

In [20]:
df['text'] = df['text'].apply(text_preprocessing)

# Feature Extraction
## Non-Neural Network-Based Embeddings
We split the data into training and testing sets and apply feature extraction techniques that  based on Non-Neural Network such as Count Vectorizer and TF-IDF Vectorizer.

In [21]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['target'], test_size=0.4, random_state=135)

In [22]:
bow_vectorizer = CountVectorizer()
X_train_counts = bow_vectorizer.fit_transform(X_train)
X_test_counts = bow_vectorizer.transform(X_test)

In [23]:
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

## Neural Network-Based Embeddings
Prepares the data for embeddings using Word2Vec, Doc2Vec, and BERT models.

In [24]:
# Tokenize data for Word2Vec and Doc2Vec
sentences = [row.split() for row in df['text']]
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4, sg=1)  # sg=1 for skip-gram model

In [25]:
def get_word2vec_embedding(text):
    words = text.split()
    embedding = np.mean([word2vec_model.wv[word] for word in words if word in word2vec_model.wv], axis=0)
    return embedding

In [26]:
X_train_w2v = np.array([get_word2vec_embedding(text) for text in X_train])
X_test_w2v = np.array([get_word2vec_embedding(text) for text in X_test])

In [27]:
# Prepare Doc2Vec embeddings
tagged_data = [TaggedDocument(words=_d.split(), tags=[str(i)]) for i, _d in enumerate(df['text'])]
doc2vec_model = Doc2Vec(tagged_data, vector_size=100, window=5, min_count=1, workers=4, epochs=20)

In [40]:
X_train_d2v = np.array([doc2vec_model.infer_vector(text.split()) for text in X_train])
X_test_d2v = np.array([doc2vec_model.infer_vector(text.split()) for text in X_test])

In [28]:
# BERT embeddings setup
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [29]:
def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    outputs = model(**inputs)
    return outputs.last_hidden_state[:,0,:].detach().numpy().flatten()

In [30]:
X_train_bert = np.array([get_bert_embedding(text) for text in X_train])
X_test_bert = np.array([get_bert_embedding(text) for text in X_test])

# Training and Evaluation
This section trains Logistic Regression and Random Forest models and save them


In [34]:
def train_model(classifier, X_train, y_train):
    model = classifier()
    model.fit(X_train, y_train)
    return model

In [35]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=1)
    recall = recall_score(y_test, y_pred, zero_division=1)
    f1 = f1_score(y_test, y_pred, zero_division=1)
    return {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    }

In [36]:
def save_model(model, filename):
    # Create directory if it doesn't exist
    if not os.path.exists('saved_models'):
        os.makedirs('saved_models')
    
    # Define the path to save the model
    path = os.path.join('saved_models', filename)
    
    # Save the model
    with open(path, 'wb') as file:
        pickle.dump(model, file)
    print(f"Model saved as {path}")


In [37]:
def train_and_evaluate(feature_type, X_train, y_train, X_test, y_test):
    results = []

    # Logistic Regression
    lr_model = train_model(LogisticRegression, X_train, y_train)
    lr_results = evaluate_model(lr_model, X_test, y_test)
    lr_results['Model'] = f'Logistic Regression ({feature_type})'
    results.append(lr_results)
    save_model(lr_model, f'LR_{feature_type}.pkl')

    # Random Forest
    rf_model = train_model(RandomForestClassifier, X_train, y_train)
    rf_results = evaluate_model(rf_model, X_test, y_test)
    rf_results['Model'] = f'Random Forest ({feature_type})'
    results.append(rf_results)
    save_model(rf_model, f'RF_{feature_type}.pkl')

    return results

In [41]:
# Assuming X_train_counts, X_test_counts, X_train_tfidf, X_test_tfidf, etc. are already defined
results_counts = train_and_evaluate('count_vectorizer', X_train_counts, y_train, X_test_counts, y_test)
results_tfidf = train_and_evaluate('tfidf_vectorizer', X_train_tfidf, y_train, X_test_tfidf, y_test)
results_w2v = train_and_evaluate('word2vec', X_train_w2v, y_train, X_test_w2v, y_test)
results_d2v = train_and_evaluate('doc2vec', X_train_d2v, y_train, X_test_d2v, y_test)
results_bert = train_and_evaluate('bert', X_train_bert, y_train, X_test_bert, y_test)

# Combine and display all results
all_results = results_counts + results_tfidf + results_w2v + results_d2v + results_bert
results_df = pd.DataFrame(all_results)
print(results_df)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model saved as LR_count_vectorizer.pkl
Model saved as RF_count_vectorizer.pkl
Model saved as LR_tfidf_vectorizer.pkl
Model saved as RF_tfidf_vectorizer.pkl
Model saved as LR_word2vec.pkl
Model saved as RF_word2vec.pkl
Model saved as LR_doc2vec.pkl
Model saved as RF_doc2vec.pkl


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model saved as LR_bert.pkl
Model saved as RF_bert.pkl
   Accuracy  Precision    Recall  F1 Score  \
0  0.993100   0.990371  0.987654  0.989011   
1  0.985770   0.984680  0.969822  0.977194   
2  0.976283   0.992690  0.931413  0.961076   
3  0.986201   0.990155  0.965706  0.977778   
4  0.986201   0.992928  0.962963  0.977716   
5  0.983614   0.985935  0.961591  0.973611   
6  0.965502   0.958982  0.930041  0.944290   
7  0.938767   0.956454  0.843621  0.896501   
8  0.987063   0.983402  0.975309  0.979339   
9  0.969383   0.991045  0.910837  0.949249   

                                    Model  
0  Logistic Regression (count_vectorizer)  
1        Random Forest (count_vectorizer)  
2  Logistic Regression (tfidf_vectorizer)  
3        Random Forest (tfidf_vectorizer)  
4          Logistic Regression (word2vec)  
5                Random Forest (word2vec)  
6           Logistic Regression (doc2vec)  
7                 Random Forest (doc2vec)  
8              Logistic Regression (bert)  

# Model Evaluation Results

The table below summarizes the performance metrics of the different models trained using various feature extraction techniques. Each row corresponds to a specific model and its associated performance metrics such as accuracy, precision, recall, and F1 score.

| Model                                 | Accuracy | Precision | Recall  | F1 Score |
|---------------------------------------|----------|-----------|---------|----------|
| Logistic Regression (count_vectorizer)| 0.9931   | 0.9904    | 0.9877  | 0.9890   |
| Random Forest (count_vectorizer)      | 0.9858   | 0.9847    | 0.9698  | 0.9772   |
| Logistic Regression (tfidf_vectorizer)| 0.9763   | 0.9927    | 0.9314  | 0.9611   |
| Random Forest (tfidf_vectorizer)      | 0.9862   | 0.9902    | 0.9657  | 0.9778   |
| Logistic Regression (word2vec)        | 0.9862   | 0.9929    | 0.9630  | 0.9777   |
| Random Forest (word2vec)              | 0.9836   | 0.9859    | 0.9616  | 0.9736   |
| Logistic Regression (doc2vec)         | 0.9655   | 0.9590    | 0.9300  | 0.9443   |
| Random Forest (doc2vec)               | 0.9388   | 0.9565    | 0.8436  | 0.8965   |
| Logistic Regression (bert)            | 0.9871   | 0.9834    | 0.9753  | 0.9793   |
| Random Forest (bert)                  | 0.9694   | 0.9910    | 0.9108  | 0.9492   |

## Observations and Analysis

- **Count Vectorizer Models**: Both Logistic Regression and Random Forest models trained with count vectorizer features show high accuracy, precision, and F1 scores. This suggests good generalization on the test data.
- **TF-IDF Vectorizer Models**: TF-IDF models also perform well, especially in terms of precision, indicating fewer false positives.
- **Word2Vec Models**: These models have demonstrated strong performance, particularly the Logistic Regression model, which exhibits high scores across all metrics.
- **Doc2Vec Models**: While still performing respectably, the Doc2Vec models show a slight dip in metrics compared to other models, particularly in recall and F1 score.
- **BERT Models**: The BERT-based models achieve excellent accuracy and precision, with the Logistic Regression variant showing slightly better recall and F1 scores compared to the Random Forest variant.

These results will guide further refinement of models and choice of feature extraction techniques for the spam email classification task.
