In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score , f1_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import CountVectorizer

# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')



# Preprocessing

In [2]:
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    text = re.sub(r'\s{2,}', ' ', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    # Join tokens back into text
    preprocessed_text = ' '.join(stemmed_tokens)
    return preprocessed_text

def tfidf_features(text_data):
    vectorizer = TfidfVectorizer(max_features=1000)  # Limiting to top 1000 features
    X = vectorizer.fit_transform(text_data)
    return X.toarray(), vectorizer

In [3]:
# Load the data
data = pd.read_csv('Spam_Email_Data.csv')

# Display the first few rows of the data
print(data.head())

# Display the number of rows and columns in the data
print(data.shape)

# Display the number of spam and non-spam emails in the data
print(data['target'].value_counts())

# Display the percentage of spam and non-spam emails in the data
print(data['target'].value_counts(normalize=True) * 100)

                                                text  target
0  From ilug-admin@linux.ie Mon Jul 29 11:28:02 2...       0
1  From gort44@excite.com Mon Jun 24 17:54:21 200...       1
2  From fork-admin@xent.com Mon Jul 29 11:39:57 2...       1
3  From dcm123@btamail.net.cn Mon Jun 24 17:49:23...       1
4  From ilug-admin@linux.ie Mon Aug 19 11:02:47 2...       0
(5796, 2)
target
0    3900
1    1896
Name: count, dtype: int64
target
0    67.287785
1    32.712215
Name: proportion, dtype: float64


In [4]:
# Applying preprocessing to the data
data['clean_text'] = data['text'].apply(preprocess_text)

# Tfidf Vectorizer

In [5]:
# Applying tfidf Feature Extraction to the data
X_tfidf, vectorizer = tfidf_features(data['clean_text'])

In [6]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, data['target'], test_size=0.2, random_state=42)

# Convert y_train and y_test to integers
y_train = y_train.astype('int')
y_test = y_test.astype('int')

# Tfidf - Logistic Regression

In [7]:
# Intialize and Train the logistic regression model
logistic_model = LogisticRegression()
logistic_model.fit(X_train , y_train)

In [8]:
# Predict the test data
test_predicition = logistic_model.predict(X_test)

# Evaluate the model
test_f1 = f1_score(y_test , test_predicition)
test_accuracy = accuracy_score(y_test , test_predicition)

# Print the results
print(f"accuracy: {test_accuracy}")
print(f"f1 score: {test_f1}")

accuracy: 0.9870689655172413
f1 score: 0.9799732977303071


# Tfidf - KNN

In [9]:
# Intialize and Train the KNN model
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train , y_train)

In [10]:
# Predict the test data
test_predicition_knn = knn.predict(X_test)

# Evaluate the model
test_accuracy_knn = accuracy_score(y_test , test_predicition_knn)
test_f1 = f1_score(y_test , test_predicition_knn)

# Print the results
print(f"accuracy: {test_accuracy_knn}")
print(f"f1 score: {test_f1}")

accuracy: 0.9879310344827587
f1 score: 0.9815303430079155


# bag of words - Logistic Regression

In [11]:
# Applying bag of words Feature Extraction to the data
def bow_features(text_data):
    vectorizer = CountVectorizer(max_features=1000)
    X = vectorizer.fit_transform(text_data)
    return X.toarray(), vectorizer

X_bow, bow_vectorizer = bow_features(data['clean_text'])

# Split the data into training and testing sets
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(X_bow, data['target'], test_size=0.2, random_state=42)

# Convert y_train_bow and y_test_bow to integers
y_train_bow = y_train_bow.astype('int')
y_test_bow = y_test_bow.astype('int')

In [12]:
# Intialize and Train the logistic regression model
logistic_model_bow = LogisticRegression()
logistic_model_bow.fit(X_train_bow , y_train_bow)

# Predict the test data
test_predicition_bow = logistic_model_bow.predict(X_test_bow)

# Evaluate the model
test_accuracy_bow = accuracy_score(y_test_bow , test_predicition_bow)
test_f1 = f1_score(y_test_bow , test_predicition_bow)

# Print the results
print(f"accuracy : {test_accuracy_bow}")
print(f"f1 score : {test_f1}")

accuracy : 0.996551724137931
f1 score : 0.994750656167979


# bag of words - KNN

In [13]:
# Intialize and Train the KNN model
knn_bow = KNeighborsClassifier(n_neighbors=5)
knn_bow.fit(X_train_bow , y_train_bow)

# Predict the test data
test_predicition_knn_bow = knn_bow.predict(X_test_bow)
# Evaluate the model
test_accuracy_knn_bow = accuracy_score(y_test_bow , test_predicition_knn_bow)
test_f1 = f1_score(y_test_bow , test_predicition_knn_bow)

# Print the results
print(f"accuracy : {test_accuracy_knn_bow}")
print(f"f1 score : {test_f1}")

accuracy : 0.9870689655172413
f1 score : 0.9802371541501976


# word2vec - Logistic Regression

In [14]:
# Tokenize the text data
tokenized_text = data['clean_text'].apply(lambda x: x.split())

# Train the Word2Vec model
word2vec_model = Word2Vec(tokenized_text, vector_size=100, window=5, min_count=1, sg=1)

# Get the word vectors
word_vectors = word2vec_model.wv

# Get the average word vector for each document
def get_average_word_vectors(tokens_list, vector, generate_missing=False, k=100):
    if len(tokens_list) < 1:
        return np.zeros(k)
    if generate_missing:
        vectorized = [vector[word] if word in vector else np.random.rand(k) for word in tokens_list]
    else:
        vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged

# Get the average word vector for each document
wordvec_arrays = np.zeros((len(tokenized_text), 100))

# Tokenize the text data
for i in range(len(tokenized_text)):
    wordvec_arrays[i,:] = get_average_word_vectors(tokenized_text[i], word_vectors, generate_missing=True, k=100)

# Split the data into training and testing sets
X_train_wordvec, X_test_wordvec, y_train_wordvec, y_test_wordvec = train_test_split(wordvec_arrays, data['target'], test_size=0.2, random_state=42)

In [15]:
# Intialize and Train the logistic regression model
logistic_model_wordvec = LogisticRegression()
logistic_model_wordvec.fit(X_train_wordvec, y_train_wordvec)

# Predict the test data
test_prediction_wordvec = logistic_model_wordvec.predict(X_test_wordvec)

# Evaluate the model
test_accuracy_wordvec = accuracy_score(y_test_wordvec, test_prediction_wordvec)
test_f1 = f1_score(y_test_wordvec, test_prediction_wordvec)

# Print the results
print(f"accuracy : {test_accuracy_wordvec}")
print(f"f1 score : {test_f1}")

accuracy : 0.9905172413793103
f1 score : 0.9853528628495339


# word2vec - KNN

In [16]:
# Intialize and Train the KNN model
knn_wordvec = KNeighborsClassifier(n_neighbors=11)
knn_wordvec.fit(X_train_wordvec, y_train_wordvec)

# Predict the test data
test_prediction_knn_wordvec = knn_wordvec.predict(X_test_wordvec)

# Evaluate the model
test_accuracy_knn_wordvec = accuracy_score(y_test_wordvec, test_prediction_knn_wordvec)
test_f1 = f1_score(y_test_wordvec, test_prediction_knn_wordvec)

# Print the results
print(f"accuracy : {test_accuracy_knn_wordvec}")
print(f"f1 score : {test_f1}")

accuracy : 0.9887931034482759
f1 score : 0.9828269484808454


# doc2vec - Logistic Regression

In [17]:

# Apply the Doc2Vec Feature Extraction
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data['clean_text'])]

max_epochs = 100
vec_size = 20
alpha = 0.025

# Build the model
model = Doc2Vec(vector_size=vec_size, alpha=alpha, min_alpha=0.00025, min_count=1, dm=1)
model.build_vocab(tagged_data)

# Train the model
for epoch in range(max_epochs):
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    model.alpha -= 0.0002
    model.min_alpha = model.alpha

# Get the document vectors
docvec_arrays = np.zeros((len(tokenized_text), vec_size))

# Get the document vectors
for i in range(len(data)):
    docvec_arrays[i,:] = model.docvecs[i].reshape((1, vec_size))

  docvec_arrays[i,:] = model.docvecs[i].reshape((1, vec_size))


In [18]:
# Split the data into training and testing sets
X_train_docvec, X_test_docvec, y_train_docvec, y_test_docvec = train_test_split(docvec_arrays, data['target'], test_size=0.2, random_state=42)

In [19]:
# Intialize and Train the logistic regression model
logistic_model_docvec = LogisticRegression()
logistic_model_docvec.fit(X_train_docvec, y_train_docvec)

# Predict the test data
test_prediction_docvec = logistic_model_docvec.predict(X_test_docvec)

# Evaluate the model
test_accuracy_docvec = accuracy_score(y_test_docvec, test_prediction_docvec)
test_f1 = f1_score(y_test_docvec, test_prediction_docvec)

# Print the results
print(f"accuracy : {test_accuracy_docvec}")
print(f"f1 score : {test_f1}")

accuracy : 0.9948275862068966
f1 score : 0.9920634920634921


# doc2vec - KNN

In [20]:
# Intialize and Train the KNN model
knn_docvec = KNeighborsClassifier(n_neighbors=7)
knn_docvec.fit(X_train_docvec, y_train_docvec)

# Predict the test data
test_prediction_knn_docvec = knn_docvec.predict(X_test_docvec)

# Evaluate the model
test_accuracy_knn_docvec = accuracy_score(y_test_docvec, test_prediction_knn_docvec)
test_f1 = f1_score(y_test_docvec, test_prediction_knn_docvec)

# Print the results
print(f"accuracy : {test_accuracy_knn_docvec}")
print(f"f1 score : {test_f1}")

accuracy : 0.9879310344827587
f1 score : 0.9812834224598931
