# E-Mail Spam Classification
## YZV 311E Term Project

Abdullah Bilici, 150200330

Bora Boyacıoğlu, 150200310

Import the necessary libraries.

In [1]:
import pandas as pd
import numpy as np

import spacy
import string

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## Loading the Data

Load the CSV data to a Pandas DataFrame.

In [2]:
# Read the raw e-mail dataset from disk into a DataFrame
mails = pd.read_csv(filepath_or_buffer="../Data/emails.csv")

In [3]:
# Since the data is too large, using a ~1/20 batch is easier for test purposes.
# NOTE: re-running this cell on its own would copy the already truncated frame
# into `mails_full`; re-run the loading cell first to restore the full data.
mails_full = mails.copy()

# Take an explicit copy of the slice so that adding columns to `mails` later
# operates on an independent frame (avoids pandas' SettingWithCopyWarning).
mails = mails_full.iloc[:250].copy()

## Text Preprocessing

Using an English NLP model, tokenise the sentences for each mail. Then, apply some rules to make the data workable. These include:

1. Tokenisation
2. Lowercasing
3. Stop word removal
4. Special character removal
5. Lemmatisation

In [4]:
# Set of ASCII punctuation characters used to filter out punctuation tokens
punctuations = set(string.punctuation)

# Load the English spaCy pipeline and take its default stop word list
nlp = spacy.load('en_core_web_sm')
stop_words = set(nlp.Defaults.stop_words)

Define a function for tokenising and preprocessing.

In [5]:
def tokenise_sentence(sentence):
    """Tokenise a text into a list of cleaned, lowercased, lemmatised tokens.

    The text is run through the module-level spaCy pipeline ``nlp``. Each
    token is lowercased and lemmatised, then dropped if it is empty, a stop
    word (``stop_words``) or consists solely of punctuation characters
    (``punctuations``).

    Parameters
    ----------
    sentence : str
        Raw text to tokenise. Any non-string value (for example a missing
        value) is treated as having no content.

    Returns
    -------
    list of str
        The remaining tokens in their original order; ``[]`` for non-string
        input.
    """
    # Non-string inputs carry no text to tokenise
    if not isinstance(sentence, str):
        return []

    tokens = []
    for token in nlp(sentence):
        # Pronouns keep their lowercased surface form instead of the lemma
        # (presumably because older spaCy versions lemmatise every pronoun to
        # the placeholder "-PRON-" -- TODO confirm against the spaCy version used).
        if token.pos_ == "PRON":
            word = token.lower_
        else:
            word = token.lemma_.lower().strip()

        # Skip empty tokens (e.g. whitespace stripped to '') and stop words
        if not word or word in stop_words:
            continue

        # Skip tokens made up only of punctuation. Checking every character
        # also removes multi-character runs such as "..." or "--", which a
        # plain `word in punctuations` membership test would let through.
        if all(char in punctuations for char in word):
            continue

        tokens.append(word)

    return tokens

Apply tokenisation.

In [6]:
# Tokenise every mail body element-wise and store the token lists in a new column
mails['tokenised_text'] = mails['text'].map(tokenise_sentence)

In [7]:
# Preview the token lists of the first five mails
mails['tokenised_text'].head(n=5)

0    [subject, naturally, irresistible, corporate, ...
1    [subject, stock, trading, gunslinger, fanny, m...
2    [subject, unbelievable, new, home, easy, m, wa...
3    [subject, 4, color, printing, special, request...
4    [subject, money, software, cd, software, compa...
Name: tokenised_text, dtype: object

In [10]:
# Show the complete token list of the first mail
mails['tokenised_text'].iloc[0]

['subject',
 'naturally',
 'irresistible',
 'corporate',
 'identity',
 'lt',
 'hard',
 'recollect',
 'company',
 'market',
 'suqgestion',
 'information',
 'isoverwhelminq',
 'good',
 'catchy',
 'logo',
 'stylish',
 'statlonery',
 'outstanding',
 'website',
 'task',
 'easy',
 'promise',
 'havinq',
 'order',
 'iogo',
 'company',
 'automaticaily',
 'world',
 'ieader',
 'isguite',
 'ciear',
 'good',
 'product',
 'effective',
 'business',
 'organization',
 'practicable',
 'aim',
 'hotat',
 'nowadays',
 'market',
 'promise',
 'marketing',
 'effort',
 'effective',
 'list',
 'clear',
 'benefit',
 'creativeness',
 'hand',
 'original',
 'logo',
 'specially',
 'reflect',
 'distinctive',
 'company',
 'image',
 'convenience',
 'logo',
 'stationery',
 'provide',
 'format',
 'easy',
 'use',
 'content',
 'management',
 'system',
 'letsyou',
 'change',
 'website',
 'content',
 'structure',
 'promptness',
 'logo',
 'draft',
 'business',
 'day',
 'affordability',
 'marketing',
 'break',
 'shouldn',
 't',

## Feature Modelling

Transform the tokenised text into a numerical format by vectorising it.

### Bag-of-Words (BoW)
The Bag-of-Words model represents text as an unordered collection of word frequencies.

In [11]:
# Rebuild one space-separated document per mail from its token list
corpus = list(map(' '.join, mails['tokenised_text']))

# Learn the Bag-of-Words vocabulary from the corpus and count term occurrences
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(raw_documents=corpus)

### TF-IDF
TF-IDF (term frequency–inverse document frequency) weighs how often a word occurs in a document (here, a mail) against how common that word is across the whole corpus, which diminishes the importance of words that occur frequently throughout the dataset.

In [12]:
# Learn the vocabulary and IDF weights from the corpus, then build the TF-IDF matrix
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(raw_documents=corpus)

In [14]:
# Densify the sparse TF-IDF matrix into a DataFrame with one column per vocabulary term
X_tfidf_df = pd.DataFrame(
    data=X_tfidf.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) of each term's TF-IDF weight
X_tfidf_df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,00,000,0000,0004,00076,0052,00971,01,0100,01019,...,zmsx,znalazlam,zndnioay,zoloftpain,zone,zoo,zoolant,zuid,zxghlajf,zzzz
count,250.0,250.0,250.0,250.0,250.0,250.0,250.0,250.0,250.0,250.0,...,250.0,250.0,250.0,250.0,250.0,250.0,250.0,250.0,250.0,250.0
mean,0.00375,0.009948,0.003794,0.0002,0.0002,0.000425,0.000378,0.001895,0.001117,0.0004,...,0.000315,0.000703,0.000753,0.000525,0.000582,0.001197,0.000419,0.000243,0.00018,0.000582
std,0.017907,0.033738,0.031841,0.003161,0.003161,0.006716,0.005982,0.013906,0.017668,0.006322,...,0.004976,0.01111,0.011899,0.008295,0.007368,0.018928,0.006633,0.003835,0.002841,0.009208
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.119739,0.190038,0.292068,0.049977,0.049977,0.106184,0.094587,0.141094,0.279362,0.099954,...,0.078676,0.175664,0.188132,0.131153,0.111565,0.299283,0.10487,0.060639,0.044923,0.145589
