# E-Mail Spam Classification
## YZV 311E Term Project

Abdullah Bilici, 150200330

Bora Boyacıoğlu, 150200310

Import the necessary libraries.

In [1]:
import pandas as pd
import numpy as np

import spacy
import string

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## Loading the Data

Load the CSV data to a Pandas DataFrame.

In [2]:
# Read the raw e-mail dataset from the project's Data directory into a DataFrame
mails = pd.read_csv(filepath_or_buffer="../Data/emails.csv")

In [3]:
# Since the data is too large, using a ~1/20 batch is easier for test purposes.
# The complete frame stays available as `mails_full`; the working subset is an
# explicit copy of the slice, so that adding columns to `mails` later does not
# write into a slice of another frame (which raises SettingWithCopyWarning).
mails_full = mails.copy()
mails = mails_full.iloc[:250].copy()

## Text Preprocessing

Using an English NLP model, tokenise the sentences for each mail. Then, apply some rules to make the data workable. These include:

1. Tokenisation
2. Lowercasing
3. Stop word removal
4. Special character removal
5. Lemmatisation

In [4]:
# Punctuation characters to filter out, taken from the standard library
punctuations = set(string.punctuation)

# Load spaCy's small English pipeline
nlp = spacy.load('en_core_web_sm')

# Stop words shipped with the pipeline's language defaults
stop_words = set(nlp.Defaults.stop_words)

Define a function for tokenising and preprocessing.

In [5]:
def tokenise_sentence(sentence):
    """Tokenise, normalise and filter a single piece of text.

    The text is run through the module-level ``nlp`` pipeline. Each token is
    lowercased and lemmatised (pronouns keep their lowercased surface form),
    then stop words, punctuation-only tokens and empty strings are discarded.

    Parameters
    ----------
    sentence : str
        The raw text to process. Any non-string value is treated as having
        no content.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order. Empty for non-string
        input.
    """
    # Error handling for non-string inputs
    if not isinstance(sentence, str):
        return []

    tokens = []
    for token in nlp(sentence):
        # Pronouns keep their lowercased surface form; every other token is
        # reduced to its lowercased, whitespace-stripped lemma
        if token.pos_ == "PRON":
            word = token.lower_
        else:
            word = token.lemma_.lower().strip()

        # Skip empty tokens (e.g. whitespace-only lemmas) and stop words
        if not word or word in stop_words:
            continue

        # Skip tokens consisting solely of punctuation characters. Checking
        # character by character also removes multi-character runs such as
        # "..." or "--", which a whole-token lookup in a set of single
        # characters would let through.
        if all(char in punctuations for char in word):
            continue

        tokens.append(word)

    return tokens

Apply tokenisation.

In [6]:
# Run the preprocessing function over each mail's text and store the
# resulting token lists in a new column
mails['tokenised_text'] = mails['text'].map(tokenise_sentence)

In [7]:
# Preview the token lists of the first five mails
mails['tokenised_text'].head(5)

0    [subject, naturally, irresistible, corporate, ...
1    [subject, stock, trading, gunslinger, fanny, m...
2    [subject, unbelievable, new, home, easy, m, wa...
3    [subject, 4, color, printing, special, request...
4    [subject, money, software, cd, software, compa...
Name: tokenised_text, dtype: object

In [10]:
# Inspect the complete token list of the first mail
mails['tokenised_text'].iloc[0]

['subject',
 'naturally',
 'irresistible',
 'corporate',
 'identity',
 'lt',
 'hard',
 'recollect',
 'company',
 'market',
 'suqgestion',
 'information',
 'isoverwhelminq',
 'good',
 'catchy',
 'logo',
 'stylish',
 'statlonery',
 'outstanding',
 'website',
 'task',
 'easy',
 'promise',
 'havinq',
 'order',
 'iogo',
 'company',
 'automaticaily',
 'world',
 'ieader',
 'isguite',
 'ciear',
 'good',
 'product',
 'effective',
 'business',
 'organization',
 'practicable',
 'aim',
 'hotat',
 'nowadays',
 'market',
 'promise',
 'marketing',
 'effort',
 'effective',
 'list',
 'clear',
 'benefit',
 'creativeness',
 'hand',
 'original',
 'logo',
 'specially',
 'reflect',
 'distinctive',
 'company',
 'image',
 'convenience',
 'logo',
 'stationery',
 'provide',
 'format',
 'easy',
 'use',
 'content',
 'management',
 'system',
 'letsyou',
 'change',
 'website',
 'content',
 'structure',
 'promptness',
 'logo',
 'draft',
 'business',
 'day',
 'affordability',
 'marketing',
 'break',
 'shouldn',
 't',