## AML 2304 Lab 3 - Group E

In [1]:
import os
import pandas as pd
import email
import re
import nltk
import contractions
from email import policy
from email.header import decode_header
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.metrics import edit_distance, jaccard_distance
from collections import Counter

nltk.download('wordnet')
pd.set_option('max_colwidth', 100)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aurad\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Read file contents from Dataset main folder. Extract the `SUBJECT`, `TO`, `FROM` and `EMAIL BODY` from the email, then add the data in dataframe with the correct label: `spam` or `ham`

In [2]:
def get_email_body(msg):
    email_body = ""

    if "multipart" in msg.get_content_type():

        for part in msg.get_payload(decode=False):
            if not isinstance(part, str):
                email_part = str(part.get_payload(decode=False)).strip()
                email_body = email_body + email_part
    else:
        email_body = str(msg.get_payload(decode=False)).strip()

    return email_body


try:
    data_raw_df = pd.DataFrame(columns=['filename', 'subject', 'to', 'from', 'email_body', 'label'])

    labels = ['spam', 'ham']
    for idx in range(len(labels)):

        spam_dir = "Dataset/" + labels[idx] + "/"

        for file in os.listdir(spam_dir)[:]:
            with open(spam_dir + file, "r", encoding="latin-1") as f:
                msg = email.message_from_string(f.read(), policy=policy.default)

                subject = str(decode_header(msg.get('subject'))[0][0]) if msg.get('subject') is not None else None
                email_body = get_email_body(msg)

                new_data = [file, subject, msg.get('to'), msg.get('from'), email_body, labels[idx]]
                data_raw_df.loc[len(data_raw_df)] = new_data

    display(data_raw_df.head(5))

except Exception as err:
    print(f"Error: {err} - filename: {file}")

Unnamed: 0,filename,subject,to,from,email_body,label
0,00001.317e78fa8ee2f54cd4890fdc09ba8176,[ILUG] STOP THE MLM INSANITY,ilug@linux.ie,Start Now <startnow2002@hotmail.com>,Greetings!\n\nYou are receiving this letter because you have expressed an interest in \nreceivin...,spam
1,00001.7848dde101aa985090474a91ec93fcf0,Life Insurance - Why Pay More?,dcek1a1@netsgo.com,12a1mailbot1@web.de,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Transitional//EN"">\n<HTML><HEAD>\n<META content=3D""t...",spam
2,00002.9438920e9a55591b18e60d1ed37d992b,"Real Protection, Stun Guns! Free Shipping! Time:2:01:35 PM",ranmoore@cybertime.net,lmrn@mailexcite.com,"<html>\n<body>\n<center>\n<h3>\n<font color=""blue"">\n<b>\nThe Need For Safety Is Real In 2002, Y...",spam
3,00002.d94f1b97e48ed3b553b3508d116e6a09,[ILUG] Guaranteed to lose 10-12 lbs in 30 days 10.206,ilug@linux.ie,Slim Down <taylor@s3.serveimage.com>,1) Fight The Risk of Cancer!\nhttp://www.adclick.ws/p.cfm?o=315&s=pk007\n\n2) Slim Down - Guaran...,spam
4,00003.2ee33bc6eacdb11f38d052c44819ba6c,Guaranteed to lose 10-12 lbs in 30 days 11.150,zzzz@spamassassin.taint.org,Slim Down <sabrina@mx3.1premio.com>,1) Fight The Risk of Cancer!\nhttp://www.adclick.ws/p.cfm?o=315&s=pk007\n\n2) Slim Down - Guaran...,spam


## Exploratory Data Analysis (incl. cleaning the data)
Check if there are null values

In [3]:
data_raw_df.isnull().sum()

filename        0
subject         6
to            332
from            0
email_body      0
label           0
dtype: int64

In [4]:
# This variable will contain the cleaned data
data_cleaned_df = data_raw_df.copy()
# Convert to lower case characters
data_cleaned_df = data_cleaned_df.applymap(lambda x: x.lower() if type(x) == str else x)

data_cleaned_df.head(5)

Unnamed: 0,filename,subject,to,from,email_body,label
0,00001.317e78fa8ee2f54cd4890fdc09ba8176,[ilug] stop the mlm insanity,ilug@linux.ie,Start Now <startnow2002@hotmail.com>,greetings!\n\nyou are receiving this letter because you have expressed an interest in \nreceivin...,spam
1,00001.7848dde101aa985090474a91ec93fcf0,life insurance - why pay more?,dcek1a1@netsgo.com,12a1mailbot1@web.de,"<!doctype html public ""-//w3c//dtd html 4.0 transitional//en"">\n<html><head>\n<meta content=3d""t...",spam
2,00002.9438920e9a55591b18e60d1ed37d992b,"real protection, stun guns! free shipping! time:2:01:35 pm",ranmoore@cybertime.net,lmrn@mailexcite.com,"<html>\n<body>\n<center>\n<h3>\n<font color=""blue"">\n<b>\nthe need for safety is real in 2002, y...",spam
3,00002.d94f1b97e48ed3b553b3508d116e6a09,[ilug] guaranteed to lose 10-12 lbs in 30 days 10.206,ilug@linux.ie,Slim Down <taylor@s3.serveimage.com>,1) fight the risk of cancer!\nhttp://www.adclick.ws/p.cfm?o=315&s=pk007\n\n2) slim down - guaran...,spam
4,00003.2ee33bc6eacdb11f38d052c44819ba6c,guaranteed to lose 10-12 lbs in 30 days 11.150,zzzz@spamassassin.taint.org,Slim Down <sabrina@mx3.1premio.com>,1) fight the risk of cancer!\nhttp://www.adclick.ws/p.cfm?o=315&s=pk007\n\n2) slim down - guaran...,spam


Extract mail server and domain name from `TO` and `FROM`

In [5]:
def extract_mail_server_and_domain(email):
    if email is not None:
        pattern = r'@([\w.]+\.[a-zA-Z]{2,})'
        match = re.search(pattern, email)
        return match.group(1) if match else None
    

data_cleaned_df['to'] = data_cleaned_df['to'].apply(extract_mail_server_and_domain)
data_cleaned_df['from'] = data_cleaned_df['from'].apply(extract_mail_server_and_domain)
data_cleaned_df[['to', 'from']].head(5)

Unnamed: 0,to,from
0,linux.ie,hotmail.com
1,netsgo.com,web.de
2,cybertime.net,mailexcite.com
3,linux.ie,s3.serveimage.com
4,spamassassin.taint.org,mx3.1premio.com


For `SUBJECT` and `EMAIL_BODY`, the following will be applied to clean the data
- Remove email addresses, URLs and HTML tags
- Replace all whitespaces with space
- Handling contractions
- Remove special characters and numbers
- Tokenization, stopwords removal and lemmatization
- Handling typos and mispellings (?)
- Remove words that are not in dictionary (?)

In [6]:
pd.set_option('max_colwidth', None)


# Remove email addresses
pattern_email = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b'
data_cleaned_df[['subject', 'email_body']] = data_cleaned_df[['subject', 'email_body']].applymap(lambda x : re.sub(pattern_email, '', x) if x is not None else x)

# Remove URLs
pattern_url = r'https?://(?:www\.)?[\w\.-]+(?:\.[a-zA-Z]{2,})+(?:/[-\w\.,/]*)*(?:\?[\w\%&=]*)?'
data_cleaned_df[['subject', 'email_body']] = data_cleaned_df[['subject', 'email_body']].applymap(lambda x : re.sub(pattern_url, '', x) if x is not None else x)

# Handle contractions
data_cleaned_df[['subject', 'email_body']] = data_cleaned_df[['subject', 'email_body']].applymap(lambda x : contractions.fix(x) if x is not None else x)

# Remove HTML tags
pattern_html = r'<[^>]+>'
data_cleaned_df[['subject', 'email_body']] = data_cleaned_df[['subject', 'email_body']].applymap(lambda x : re.sub(pattern_html, '', x) if x is not None else x)
data_cleaned_df[['subject', 'email_body']] = data_cleaned_df[['subject', 'email_body']].applymap(lambda x : x.replace('&nbsp;', ' ') if x is not None else x)


# Remove all special characters and numbers
pattern_special_num = r'(?<![a-zA-Z])-(?![a-zA-Z])|[^a-zA-Z\s-]+'
data_cleaned_df[['subject', 'email_body']] = data_cleaned_df[['subject', 'email_body']].applymap(lambda x : re.sub(pattern_special_num, ' ', x) if x is not None else x)

# Replace all whitespaces (new lines, tabs, etc.) to space
pattern_whitespace = r'\s+'
data_cleaned_df[['subject', 'email_body']] = data_cleaned_df[['subject', 'email_body']].applymap(lambda x : re.sub(pattern_whitespace, ' ', x) if x is not None else x)

display(data_cleaned_df.head(2))
display(data_cleaned_df.tail(2))

Unnamed: 0,filename,subject,to,from,email_body,label
0,00001.317e78fa8ee2f54cd4890fdc09ba8176,ilug stop the mlm insanity,linux.ie,hotmail.com,greetings you are receiving this letter because you have expressed an interest in receiving information about online business opportunities if this is erroneous then please accept my most sincere apology this is a one-time mailing so no removal is necessary if you have been burned betrayed and back-stabbed by multi-level marketing mlm then please read this letter it could be the most important one that has ever landed in your inbox multi-level marketing is a huge mistake for most people mlm has failed to deliver on its promises for the past years the pursuit of the mlm dream has cost hundreds of thousands of people their friends their fortunes and their sacred honor the fact is that mlm is fatally flawed meaning that it cannot work for most people the companies and the few who earn the big money in mlm are not going to tell you the real story finally there is someone who has the courage to cut through the hype and lies and tell the truth about mlm here is good news there is an alternative to mlm that works and works big if you have not yet abandoned your dreams then you need to see this earning the kind of income you have dreamed about is easier than you think with your permission i would like to send you a brief letter that will tell you why mlm does not work for most people and will then introduce you to something so new and refreshing that you will wonder why you have not heard of this before i promise that there will be no unwanted follow up no sales pitch no one will call you and your email address will only be used to send you the information period to receive this free life-changing information simply click reply type send info in the subject box and hit send i will get the information to you within hours just look for the words mlm wall of shame in your inbox cordially siddhi p s someone recently sent the letter to me and it has been the most eye-opening financially beneficial information i have ever received i honestly believe that you will feel the same way once you have read it and it is free this email is never sent unsolicited this is not spam you are receiving this email because you explicitly signed yourself up to our list with our online signup form or through use of our ffa links page and e-maildom systems which have explicit terms of use which state that through its use you agree to receive our emailings you may also be a member of a altra computer systems list or one of many numerous free marketing services and as such you agreed when you signed up for such list that you would also be receiving this emailing due to the above this email message cannot be considered unsolicitated or spam irish linux users group for un subscription information list maintainer,spam
1,00001.7848dde101aa985090474a91ec93fcf0,life insurance why pay more,netsgo.com,web.de,save up to on life insurance why spend more than you have to life quote savings ensurin g your family s financial security is very important life quote savings ma kes buying life insurance simple and affordable we provide free access to the very best companies and the lowest rates life quote savings is fast eas y and saves you money let us help you get started with the best val ues in the country on new coverage you can save hundreds or even though usands of dollars by requesting a free quote from lifequote savings our service will take you less than minutes to complete shop an d compare save up to on all types of life insurance click here for your free quote protecting your family is the best investment you will eve are make if you are in receipt of this email in error and or wish to be removed from our list please click here and type remove if you reside in any state which prohibits e-mail solicitations for insuran ce please disregard this email,spam


Unnamed: 0,filename,subject,to,from,email_body,label
8595,2550.963956a866a2e00b3d5b353ecc9216ae,flexible retirement gains ground,example.com,example.com,url date t money government admits millions may have to work on beyond,ham
8596,2551.3b1f94418de5bd544c977b44bcc7e740,critical us satellites could be hacked,example.com,example.com,url date not supplied military communications could be jammed or intercepted and satellites thrown off course or destroyed a new us study warns,ham


In [7]:
# Tokenize the text and remove stopwords
stop_words = set(stopwords.words('english'))
data_cleaned_df[['subject', 'email_body']] = data_cleaned_df[['subject', 'email_body']].applymap(lambda x : [word for word in word_tokenize(x) if word not in stop_words] if x is not None else x)

display(data_cleaned_df.head(2))
display(data_cleaned_df.tail(2))

Unnamed: 0,filename,subject,to,from,email_body,label
0,00001.317e78fa8ee2f54cd4890fdc09ba8176,"[ilug, stop, mlm, insanity]",linux.ie,hotmail.com,"[greetings, receiving, letter, expressed, interest, receiving, information, online, business, opportunities, erroneous, please, accept, sincere, apology, one-time, mailing, removal, necessary, burned, betrayed, back-stabbed, multi-level, marketing, mlm, please, read, letter, could, important, one, ever, landed, inbox, multi-level, marketing, huge, mistake, people, mlm, failed, deliver, promises, past, years, pursuit, mlm, dream, cost, hundreds, thousands, people, friends, fortunes, sacred, honor, fact, mlm, fatally, flawed, meaning, work, people, companies, earn, big, money, mlm, going, tell, real, story, finally, someone, courage, cut, hype, lies, tell, truth, mlm, good, news, alternative, mlm, works, works, big, yet, abandoned, dreams, need, see, earning, kind, income, dreamed, easier, think, permission, ...]",spam
1,00001.7848dde101aa985090474a91ec93fcf0,"[life, insurance, pay]",netsgo.com,web.de,"[save, life, insurance, spend, life, quote, savings, ensurin, g, family, financial, security, important, life, quote, savings, kes, buying, life, insurance, simple, affordable, provide, free, access, best, companies, lowest, rates, life, quote, savings, fast, eas, saves, money, let, us, help, get, started, best, val, ues, country, new, coverage, save, hundreds, even, though, usands, dollars, requesting, free, quote, lifequote, savings, service, take, less, minutes, complete, shop, compare, save, types, life, insurance, click, free, quote, protecting, family, best, investment, eve, make, receipt, email, error, wish, removed, list, please, click, type, remove, reside, state, prohibits, e-mail, solicitations, insuran, ce, please, disregard, email]",spam


Unnamed: 0,filename,subject,to,from,email_body,label
8595,2550.963956a866a2e00b3d5b353ecc9216ae,"[flexible, retirement, gains, ground]",example.com,example.com,"[url, date, money, government, admits, millions, may, work, beyond]",ham
8596,2551.3b1f94418de5bd544c977b44bcc7e740,"[critical, us, satellites, could, hacked]",example.com,example.com,"[url, date, supplied, military, communications, could, jammed, intercepted, satellites, thrown, course, destroyed, new, us, study, warns]",ham


In [8]:
# Handling typos and mispellings
english_vocab = set(w.lower() for w in words.words())


def lemmatize_word(word, pos=None):
    lemmatizer = WordNetLemmatizer()
    if pos is None:
        return lemmatizer.lemmatize(word)        
    
    return lemmatizer.lemmatize(word, pos=pos)


def suggest_words(word, threshold=3):
    suggested_words = [e_word for e_word in english_vocab if edit_distance(word, e_word) <= threshold]
    return suggested_words


def is_combined_word(word_list, index):
    combined_word = word_list[index] + word_list[index+1]
    lemmatized_word = lemmatize_word(combined_word, 'v')

    # Check if the combined word is in the dictionary and not a stopword
    if lemmatized_word in english_vocab and lemmatized_word not in stop_words:
        word_list[index] = lemmatize_word(combined_word)
        word_list.pop(index+1)
        return True

    return False


def clean_words(word_list):
    word_list_size = len(word_list)-1
    idx = 0
    
    while idx < word_list_size:
        # Remove trailing and leading dash(es)
        cleaned_word = re.sub(r'^-+|-+$', '', word_list[idx])

        # Check if any of two consecutive words (current and after) aren't in dictionary and 
        # see if putting them together might work
        if any(word not in english_vocab for word in word_list[idx:idx+2]):
            # Check if two words can be combined
            if is_combined_word(word_list, idx):
                word_list_size = word_list_size - 1
            # Remove one-character word from the list
            elif len(cleaned_word) == 1:
                word_list.pop(idx)    
                word_list_size = word_list_size - 1
            else:
                # suggested_words = suggest_words(cleaned_word)
                # TODO: Improve this section of the code
                # For now, we only take words that have error distance of 3
                # if len(suggested_words) == 0:
                #     word_list.pop(idx)    
                #     word_list_size = word_list_size - 1
                # else:
                word_list[idx] = lemmatize_word(cleaned_word)
        else:
            word_list[idx] = lemmatize_word(cleaned_word)
            
        idx = idx + 1

    return word_list


data_cleaned_df[['subject', 'email_body']] = data_cleaned_df[['subject', 'email_body']].applymap(lambda x : clean_words(x) if x is not None else x)

In [9]:
data_cleaned_df[['subject', 'email_body']]

Unnamed: 0,subject,email_body
0,"[ilug, stop, mlm, insanity]","[greeting, receiving, letter, expressed, interest, receiving, information, online, business, opportunity, erroneous, please, accept, sincere, apology, one-time, mailing, removal, necessary, burned, betrayed, back-stabbed, multi-level, marketing, mlm, please, read, letter, could, important, one, ever, landed, inbox, multi-level, marketing, huge, mistake, people, mlm, failed, deliver, promise, past, year, pursuit, mlm, dream, cost, hundred, thousand, people, friend, fortune, sacred, honor, fact, mlm, fatally, flawed, meaning, work, people, company, earn, big, money, mlm, going, tell, real, story, finally, someone, courage, cut, hype, lie, tell, truth, mlm, good, news, alternative, mlm, work, work, big, yet, abandoned, dream, need, see, earning, kind, income, dreamed, easier, think, permission, ...]"
1,"[life, insurance, pay]","[save, life, insurance, spend, life, quote, saving, ensuring, family, financial, security, important, life, quote, saving, kes, buying, life, insurance, simple, affordable, provide, free, access, best, company, lowest, rate, life, quote, saving, fast, ea, save, money, let, u, help, get, started, best, value, country, new, coverage, save, hundred, even, though, usands, dollar, requesting, free, quote, lifequote, saving, service, take, le, minute, complete, shop, compare, save, type, life, insurance, click, free, quote, protecting, family, best, investment, eve, make, receipt, email, error, wish, removed, list, please, click, type, remove, reside, state, prohibits, e-mail, solicitation, insurance, please, disregard, email]"
2,"[real, protection, stun, gun, free, shipping, time, pm]","[need, safety, real, might, get, one, chance, ready, free, shipping, handling, within, usa, order, may, day, super, sale, may, save, item, getting, spring, protect, walk, jog, exercise, outside, also, protect, loved, one, return, home, college, legal, protection, college, student, great, coming, outdoor, protection, gift, nothing, worth, protecting, life, stun, device, pepper, product, legal, protection, join, war, crime, stun, gun, baton, effective, safe, nonlethal, protect, loved, one, matter, matter, city, town, live, live, america, touched, crime, hear, tv, read, newspaper, secret, crime, major, problem, today, criminal, finding, easier, commit, crime, time, weapon, readily, available, city, police, force, work, handle, even, criminal, ...]"
3,"[ilug, guaranteed, lose, lb, days]","[fight, risk, cancer, slim, guaranteed, lose, lb, day, get, child, support, deserve, free, legal, advice, join, web, fastest, growing, single, community, start, private, photo, album, online, wonderful, day, offer, manager, prizemama, wish, leave, list, please, use, link, irish, linux, user, group, un, subscription, information, list, maintainer]"
4,"[guaranteed, lose, lb, days]","[fight, risk, cancer, slim, guaranteed, lose, lb, day, get, child, support, deserve, free, legal, advice, join, web, fastest, growing, single, community, start, private, photo, album, online, wonderful, day, offer, manager, prizemama, wish, leave, list, please, use, link]"
...,...,...
8592,"[gene, technique, reveals, human, evolution]","[url, date, supplied, method, could, allow, scientist, probe, ancestor, evolution, last, year, pass, first, test]"
8593,"[go-ahead, new-style, hospitals]","[url, date, society, blair, broker, whitehall, deal, trust, borrowing, private, cash]"
8594,"[malicious, code, hidden, email, software]","[url, date, supplied, new, software, switched, dummy, code, containing, trojan, horse, download, servers]"
8595,"[flexible, retirement, gain, ground]","[url, date, money, government, admits, million, may, work, beyond]"


## Building Model (uncleaned data)

Note: Make sure to run 3 iterations of tuning and explain your rationale for the tuning approaches used each iteration

In [12]:
# Split independent variables and dependent variable
X_raw = data_raw_df.drop(['label'], axis=1)
y_raw = data_raw_df['label']

X_raw.shape, y_raw.shape

### Model Evaluation

Note: Evaluate both test and train data. Make sure there are enough data points in the test set (>500) for Confusion Matrix, AUC etc.

## Building Model (cleaned data)

Note: Make sure to run 3 iterations of tuning and explain your rationale for the tuning approaches used each iteration

In [13]:
# Split independent variables and dependent variable
X_cleaned = data_cleaned_df.drop(['label'], axis=1)
y_cleaned = data_cleaned_df['label']

X_cleaned.shape, y_cleaned.shape

### Model Evaluation

Note: Evaludate both test and train data. Make sure there are enough data points in the test set (>500) for Confusion Matrix, AUC etc.

## Conclusion

Note: Include what else could be done to tune the model and how it would have helped (w/ some numbers)

## References
- https://coderzcolumn.com/tutorials/python/email-how-to-represent-an-email-message-in-python