# E-Mail Spam Classification
## YZV 311E Term Project

Abdullah Bilici, 150200330

Bora Boyacıoğlu, 150200310

Import the necessary libraries.

In [27]:
import pandas as pd
import numpy as np
from utils import tokenize_sentence, term_frequency, inverse_document_frequency, tfidf_calculator

import zipfile
import os

from sklearn.feature_extraction import DictVectorizer

## Loading the Data

Load the CSV data to a Pandas DataFrame.

In [28]:
mails = pd.read_csv("../Data/emails.csv")

## Text Preprocessing

Using an English NLP model, tokenise the sentences for each mail. Then, apply some rules to make the data workable. These include:

* Tokenisation
1. Lowercasing
2. Stop word removal
3. Special character removal
4. Lemmatisation

Apply tokenisation.

In [29]:
mails['tokenised_text'] = mails['text'].apply(tokenize_sentence)

In [30]:
mails['tokenised_text'].head()

0    [subject, naturally, irresistible, corporate, ...
1    [subject, stock, trading, gunslinger, fanny, m...
2    [subject, unbelievable, new, home, easy, m, wa...
3    [subject, 4, color, printing, special, request...
4    [subject, money, software, cd, software, compa...
Name: tokenised_text, dtype: object

## Feature Modelling

Transforming the text into numerical format and vectorising them

### TF-IDF
TF-IDF considers the frequency of the word in the sentence in relation to the frequency in the corpus, helping to diminish the importance of frequently occurring words in the dataset.

In [31]:
# Calculate term frequency
tf = term_frequency(mails["tokenised_text"])
# Calculate inverse documnet frequency
idf = inverse_document_frequency(mails["tokenised_text"])
# Calculate tf.idf
tfidf = tfidf_calculator(tf,idf)

# Vectorize the tfidf matrix
vectorizer = DictVectorizer(sparse=False)
X_tfidf = vectorizer.fit_transform(tfidf)

Because there are too many features (16908 without reducing), reducing the number of columns is a necessity. We have accomplished this by selecting the top 10% features using the word scores.

In [32]:
# Aggregate TF-IDF Scores
word_scores = np.sum(X_tfidf, axis=0)

# Sort and Select Top N Features
n = X_tfidf.shape[1] // 10
top_indices = np.argsort(word_scores)[::-1][:n]

# Create a new reduced TF-IDF matrix with only top features
tfidf_matrix = X_tfidf[:, top_indices]

In [33]:
tfidf_matrix = X_tfidf

In [34]:
tfidf_matrix = np.concatenate([tfidf_matrix, mails["spam"].to_numpy().reshape(-1,1)], axis = 1)

# Shuffle the matrix to create train test and validation data
np.random.shuffle(tfidf_matrix)

print(f"Number of mails: {tfidf_matrix.shape[0]}, Number of unique words: {tfidf_matrix.shape[1]}")

Number of mails: 5728, Number of unique words: 16908


## Save data

Zip the data since data is too large

In [35]:
# Parameters
train_size = 0.6
test_size = 0.2
validation_size = 0.2

if train_size + test_size + validation_size != 1:
    raise ValueError("Train, test and validation size should add up to 1")

In [37]:
file_path = "../Data/data"

# Write data to .npy files
np.save(file_path, tfidf_matrix)
np.save(file_path + "_train" , tfidf_matrix[:int(tfidf_matrix.shape[0]*train_size)])
np.save(file_path + "_test", tfidf_matrix[int(tfidf_matrix.shape[0]*train_size): int(tfidf_matrix.shape[0]*(train_size+test_size))])
np.save(file_path + "_validation", tfidf_matrix[int(tfidf_matrix.shape[0]*(train_size+test_size)):])


# Zip the .npy files
with zipfile.ZipFile(file_path + ".zip", 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipf.write(file_path + ".npy", arcname=os.path.basename(file_path + ".npy"))
    zipf.write(file_path + "_train.npy", arcname=os.path.basename(file_path + "_train.npy"))
    zipf.write(file_path + "_test.npy", arcname=os.path.basename(file_path + "_test.npy"))
    zipf.write(file_path + "_validation.npy", arcname=os.path.basename(file_path + "_validation.npy"))

# Delete large .npy files and folders
os.remove(file_path + ".npy")
os.remove(file_path + "_train.npy")
os.remove(file_path + "_test.npy")
os.remove(file_path + "_validation.npy")