## Importing Libraries

In [3]:
# General libraries
import numpy as np
import pandas as pd
# NLP libraries
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re as regex
# Text embedding libraries
import gensim as gsm
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# ML libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score

## Loading the dataset

In [4]:
# Read the raw e-mail dataset, then work on a copy so `data` itself is never modified.
DATASET_PATH = 'Spam_Email_Data.csv'
data = pd.read_csv(DATASET_PATH)
cleaned_data = data.copy(deep=True)

## Data Preprocessing

In [5]:
# The stop-word set and the stemmer are built on first use and then shared by
# every call, instead of being rebuilt for each row of the dataset.
_PREPROCESSING_CACHE = {}


def _preprocessing_resources():
    """Return the cached ``(stop_words, stemmer)`` pair, building it on first use."""
    if not _PREPROCESSING_CACHE:
        _PREPROCESSING_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESSING_CACHE['stemmer'] = PorterStemmer()
    return _PREPROCESSING_CACHE['stop_words'], _PREPROCESSING_CACHE['stemmer']


def data_preprocessing(text):
    """Normalise one raw e-mail into a space-separated string of stemmed tokens.

    Steps, in order:
      1. drop every whitespace-delimited word that contains '@';
      2. keep only alphabetic and whitespace characters;
      3. lower-case the text and tokenize it with ``word_tokenize``;
      4. remove English stop words and Porter-stem the remaining tokens.

    Parameters
    ----------
    text : str
        Raw e-mail text. Any non-string value (for example the NaN that pandas
        uses for a missing cell) is treated as an empty document.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces; '' when nothing survives.
    """
    if not isinstance(text, str):
        # Missing cells arrive as float NaN and have no .split(); treat as empty.
        return ''

    stop_words, stemmer = _preprocessing_resources()

    without_addresses = ' '.join(word for word in text.split() if '@' not in word)
    # Rejected characters are deleted, not replaced by a space, so "return-path"
    # becomes "returnpath". This also removes '<' and '>', so no separate pass
    # for those two characters is needed.
    letters_only = ''.join(
        char for char in without_addresses if char.isalpha() or char.isspace()
    )
    tokens = word_tokenize(letters_only.lower())
    return ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)

# Clean every e-mail in place on the working copy (element-wise over the 'text' column).
cleaned_data['text'] = cleaned_data['text'].map(data_preprocessing)

In [6]:
# Preview the first five cleaned rows.
cleaned_data.head(n=5)

Unnamed: 0,text,target
0,mon jul returnpath deliveredto receiv localhos...,0
1,mon jun returnpath deliveryd tue jun receiv ma...,1
2,mon jul returnpath deliveredto receiv localhos...,1
3,mon jun returnpath deliveryd mon jun receiv ma...,1
4,mon aug returnpath deliveredto receiv localhos...,0


## Data Splitting

In [7]:
# Hold out half of the cleaned e-mails for testing; the fixed seed keeps the split repeatable.
TEST_SIZE = 0.5
RANDOM_STATE = 42
x_train, x_test, y_train, y_test = train_test_split(
    cleaned_data['text'],
    cleaned_data['target'],
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

## Text embedding techniques

### With neural networks (Word2Vec, Doc2Vec)

In [8]:
# Whitespace-tokenise each training e-mail; both models are trained on the training split only.
words = [email.split() for email in x_train]
# Doc2Vec expects tagged documents; each document's position in `words` is used as its tag.
documents = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(words)
]
# Both models are trained with the library's default hyper-parameters.
word2vec = Word2Vec(sentences=words)
dec2vec = Doc2Vec(documents=documents)
def word2vec_embedding(text):
    """Embed a document as the mean of the Word2Vec vectors of its known words.

    Parameters
    ----------
    text : str
        Pre-processed document; it is split on whitespace.

    Returns
    -------
    numpy.ndarray
        1-D vector of length ``word2vec.wv.vector_size``. When the document is
        empty or none of its words are in the model vocabulary, a zero vector
        is returned. Averaging an empty list would instead give a scalar NaN,
        which breaks stacking the per-document vectors into a 2-D feature matrix.
    """
    vectors = [word2vec.wv[word] for word in text.split() if word in word2vec.wv]
    if not vectors:
        return np.zeros(word2vec.wv.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)
def doc2vec_embedding(text):
    """Infer a Doc2Vec vector for one document.

    The text is split on whitespace and the resulting token list is passed to
    the trained ``dec2vec`` model's ``infer_vector``; its result is returned
    unchanged.
    """
    tokens = text.split()
    vector = dec2vec.infer_vector(tokens)
    return vector

### Without neural networks (TF-IDF, Bag-of-Words)

In [9]:
# Count-based text vectorizers with default settings; created unfitted here.
bow = CountVectorizer()    # raw term counts (bag of words)
tfid = TfidfVectorizer()   # TF-IDF weighted terms

## Model Training

In [10]:
# One fresh, unfitted estimator per (classifier, embedding) pair. The embedding
# name inside each key is what the evaluation loop uses to pick the features.
LOGREG_MAX_ITER = 1000
models = dict([
    ("Logistic Regression with Word2Vec", LogisticRegression(max_iter=LOGREG_MAX_ITER)),
    ("SVM with Word2Vec", SVC()),
    ("Logistic Regression with Doc2Vec", LogisticRegression(max_iter=LOGREG_MAX_ITER)),
    ("SVM with Doc2Vec", SVC()),
    ("Logistic Regression with TFID", LogisticRegression(max_iter=LOGREG_MAX_ITER)),
    ("SVM with TFID", SVC()),
    ("Logistic Regression with BOW", LogisticRegression(max_iter=LOGREG_MAX_ITER)),
    ("SVM with BOW", SVC()),
])

### With neural networks

In [11]:
# Word2Vec features: one averaged word vector per e-mail, stacked into an array.
x_train_word2vec = np.array(list(map(word2vec_embedding, x_train)))
x_test_word2vec = np.array(list(map(word2vec_embedding, x_test)))
# Doc2Vec features: one inferred document vector per e-mail, stacked into an array.
x_train_doc2vec = np.array(list(map(doc2vec_embedding, x_train)))
x_test_doc2vec = np.array(list(map(doc2vec_embedding, x_test)))

### Without neural networks

In [12]:
# TF-IDF: fit the vectorizer on the training split only, then reuse the fitted
# vectorizer to transform the test split (the order of these two lines matters).
x_train_tfid = tfid.fit_transform(x_train)
x_test_tfid = tfid.transform(x_test)
# Bag of words: same fit-on-train / transform-on-test pattern.
x_train_bow = bow.fit_transform(x_train)
x_test_bow = bow.transform(x_test)

### Model Evaluation

In [13]:
# One metrics dict per evaluated model is collected here.
results = []


def get_model_evaluation(model, y_test, y_pred):
    """Score one set of predictions and append the metrics to ``results``.

    Parameters
    ----------
    model : str
        Label stored under the 'Model' key of the appended record.
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    None
        The record (recall, precision, accuracy, F1) is appended to the
        module-level ``results`` list rather than returned.
    """
    scores = {
        'Model': model,
        'Recall': recall_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Accuracy': accuracy_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred),
    }
    results.append(scores)


# (tag, (train features, test features)) pairs, checked in this order against
# each model name; a name matching none of the tags uses the bag-of-words features.
feature_sets = (
    ('Word2Vec', (x_train_word2vec, x_test_word2vec)),
    ('Doc2Vec', (x_train_doc2vec, x_test_doc2vec)),
    ('TFID', (x_train_tfid, x_test_tfid)),
)
fallback_features = (x_train_bow, x_test_bow)

for model, estimator in models.items():
    train_features, test_features = next(
        (pair for tag, pair in feature_sets if tag in model),
        fallback_features,
    )
    # Fit on the training features, predict the test split, record the metrics.
    estimator.fit(train_features, y_train)
    y_pred = estimator.predict(test_features)
    get_model_evaluation(model, y_test, y_pred)

In [14]:
# Tabulate the collected metrics, one row per model, and print the table.
results_table = pd.DataFrame(results)
print(results_table)

                               Model    Recall  Precision  Accuracy  F1-Score
0  Logistic Regression with Word2Vec  0.956522   0.988235  0.981712  0.972120
1                  SVM with Word2Vec  0.950311   0.992432  0.981021  0.970915
2   Logistic Regression with Doc2Vec  0.897516   0.961197  0.953761  0.928266
3                   SVM with Doc2Vec  0.890269   0.975057  0.955832  0.930736
4      Logistic Regression with TFID  0.922360   0.997760  0.973430  0.958580
5                      SVM with TFID  0.967909   0.998932  0.988958  0.983176
6       Logistic Regression with BOW  0.987578   0.996865  0.994824  0.992200
7                       SVM with BOW  0.964803   0.988335  0.984472  0.976427
