# Imports and preparation

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
df = pd.read_csv('data\enron_spam_data.csv')
df

Unnamed: 0,Message ID,Subject,Message,Spam/Ham,Date
0,0,christmas tree farm pictures,,ham,1999-12-10
1,1,"vastar resources , inc .","gary , production from the high island larger ...",ham,1999-12-13
2,2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,ham,1999-12-14
3,3,re : issue,fyi - see note below - already done .\nstella\...,ham,1999-12-14
4,4,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,ham,1999-12-14
...,...,...,...,...,...
33711,33711,= ? iso - 8859 - 1 ? q ? good _ news _ c = eda...,"hello , welcome to gigapharm onlinne shop .\np...",spam,2005-07-29
33712,33712,all prescript medicines are on special . to be...,i got it earlier than expected and it was wrap...,spam,2005-07-29
33713,33713,the next generation online pharmacy .,are you ready to rock on ? let the man in you ...,spam,2005-07-30
33714,33714,bloow in 5 - 10 times the time,learn how to last 5 - 10 times longer in\nbed ...,spam,2005-07-30


In [4]:
df = df.drop(columns='Message ID')

In [5]:
# drop NaNs and reset index
df = df.dropna()
df = df.reset_index(drop=True)
df

Unnamed: 0,Subject,Message,Spam/Ham,Date
0,"vastar resources , inc .","gary , production from the high island larger ...",ham,1999-12-13
1,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,ham,1999-12-14
2,re : issue,fyi - see note below - already done .\nstella\...,ham,1999-12-14
3,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,ham,1999-12-14
4,mcmullen gas for 11 / 99,"jackie ,\nsince the inlet to 3 river plant is ...",ham,1999-12-14
...,...,...,...,...
33102,= ? iso - 8859 - 1 ? q ? good _ news _ c = eda...,"hello , welcome to gigapharm onlinne shop .\np...",spam,2005-07-29
33103,all prescript medicines are on special . to be...,i got it earlier than expected and it was wrap...,spam,2005-07-29
33104,the next generation online pharmacy .,are you ready to rock on ? let the man in you ...,spam,2005-07-30
33105,bloow in 5 - 10 times the time,learn how to last 5 - 10 times longer in\nbed ...,spam,2005-07-30


In [6]:
# rename classification column for clarity
df = df.rename(columns={'Spam/Ham': 'Class'})

# remap spam to 1 and ham to 0
df['Class'] = df['Class'].map({'spam': 1, 'ham': 0})

In [7]:
# check the distribution of classes
df['Class'].value_counts()

Class
1    16614
0    16493
Name: count, dtype: int64

# Feature Engineering

## Count features commonly in spam emails

In [10]:
import re
def count_url(text):
    # count the occurrences of 'http', 'https', and 'www'
    count_http = len(re.findall(r'http', text))
    count_https = len(re.findall(r'https', text))
    count_www = len(re.findall(r'www', text))
    
    # return the total count of urls in a new column
    return count_http + count_https + count_www

In [11]:
def count_special_chars(text):
    return len(re.findall(r'[!$%&]', text))

In [12]:
def count_urgency_words(text):
    urgency_words = [
        "immediate", "urgent", "critical", "important", "now", "ASAP", "as soon as possible",
        "emergency", "priority", "alert", "rush", "prompt", "hasten", "swift", "instantly",
        "right away", "without delay", "high priority", "imminent", "pressing", "time - sensitive",
        "expedite", "top priority", "crucial", "vital", "necessary", "quick", "speedy", "at once",
        "rapid", "flash", "instantaneous", "accelerated", "breakneck", "hurry", "immediately",
        "fast-track", "at the earliest", "act now", "don't delay", "on the double", "without hesitation",
        "fast", "soon", "now or never", "urgent action", "right now", "straightaway", "double-time",
        "speed", "express", "high-priority", "pressing need", "at your earliest convenience", "this instant",
        "forthwith", "like a shot", "snap to it", "on the spot", "no time to lose", "no delay",
        "in a hurry", "right this minute", "get going", "with haste"
    ]
    words = re.findall(r'\b\w+\b', text.lower())
    count = sum(1 for word in words if word in urgency_words)
    return count

## Other indicators

In [14]:
def get_length(text):
    return len(text)

In [15]:
def is_forwarded(text):
    if (len(re.findall(r'-', text))) > 9 and len(re.findall(r'forward', text)) > 0:
        return 1
    else:
        return 0

## Create the columns

In [17]:
# email lengths
df['length_message'] = df['Message'].apply(get_length)
df['length_subject'] = df['Subject'].apply(get_length)

# url counts
df['urls_count_message'] = df['Message'].apply(count_url)
df['urls_count_subject'] = df['Subject'].apply(count_url)
df['urls_count'] = df['urls_count_message'] + df['urls_count_subject']
df = df.drop(columns={'urls_count_message', 'urls_count_subject'})

# special char counts
df['special_chars_count_message'] = df['Message'].apply(count_special_chars)
df['special_chars_count_subject'] = df['Subject'].apply(count_special_chars)
df['special_chars_count'] = df['special_chars_count_message'] + df['special_chars_count_subject']
df = df.drop(columns={'special_chars_count_message', 'special_chars_count_subject'})

# urgent phrase counts
df['urgent_phrase_count_message'] = df['Message'].apply(count_urgency_words)
df['urgent_phrase_count_subject'] = df['Subject'].apply(count_urgency_words)
df['urgent_phrase_count'] = df['urgent_phrase_count_message'] + df['urgent_phrase_count_subject']
df = df.drop(columns={'urgent_phrase_count_message', 'urgent_phrase_count_subject'})

# forwarded
df['forwarded'] = df['Message'].apply(is_forwarded)
df

Unnamed: 0,Subject,Message,Class,Date,length_message,length_subject,urls_count,special_chars_count,urgent_phrase_count,forwarded
0,"vastar resources , inc .","gary , production from the high island larger ...",0,1999-12-13,4282,24,0,1,1,1
1,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,0,1999-12-14,38,28,0,0,0,0
2,re : issue,fyi - see note below - already done .\nstella\...,0,1999-12-14,1171,10,0,0,0,1
3,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,0,1999-12-14,1124,25,0,0,0,1
4,mcmullen gas for 11 / 99,"jackie ,\nsince the inlet to 3 river plant is ...",0,1999-12-14,534,24,0,0,3,0
...,...,...,...,...,...,...,...,...,...,...
33102,= ? iso - 8859 - 1 ? q ? good _ news _ c = eda...,"hello , welcome to gigapharm onlinne shop .\np...",1,2005-07-29,281,82,0,2,0,0
33103,all prescript medicines are on special . to be...,i got it earlier than expected and it was wrap...,1,2005-07-29,803,99,1,1,2,0
33104,the next generation online pharmacy .,are you ready to rock on ? let the man in you ...,1,2005-07-30,317,37,0,1,0,0
33105,bloow in 5 - 10 times the time,learn how to last 5 - 10 times longer in\nbed ...,1,2005-07-30,74,30,0,0,0,0


# Text Cleaning

In [19]:
def clean_text(text):
    # Remove non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Remove excessive spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [20]:
df['Message'] = df['Message'].apply(clean_text)
df['Subject'] = df['Subject'].apply(clean_text)
df

Unnamed: 0,Subject,Message,Class,Date,length_message,length_subject,urls_count,special_chars_count,urgent_phrase_count,forwarded
0,vastar resources inc,gary production from the high island larger bl...,0,1999-12-13,4282,24,0,1,1,1
1,calpine daily gas nomination,calpine daily gas nomination 1 doc,0,1999-12-14,38,28,0,0,0,0
2,re issue,fyi see note below already done stella forward...,0,1999-12-14,1171,10,0,0,0,1
3,meter 7268 nov allocation,fyi forwarded by lauri a allen hou ect on 12 1...,0,1999-12-14,1124,25,0,0,0,1
4,mcmullen gas for 11 99,jackie since the inlet to 3 river plant is shu...,0,1999-12-14,534,24,0,0,3,0
...,...,...,...,...,...,...,...,...,...,...
33102,iso 8859 1 q good news c edaliss val edumm vl ...,hello welcome to gigapharm onlinne shop prescr...,1,2005-07-29,281,82,0,2,0,0
33103,all prescript medicines are on special to be p...,i got it earlier than expected and it was wrap...,1,2005-07-29,803,99,1,1,2,0
33104,the next generation online pharmacy,are you ready to rock on let the man in you ri...,1,2005-07-30,317,37,0,1,0,0
33105,bloow in 5 10 times the time,learn how to last 5 10 times longer in bed rea...,1,2005-07-30,74,30,0,0,0,0


In [21]:
# imports
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# stop words
# nltk.download('punkt') <------ need these lines to 
# nltk.download('stopwords') <-- load stopwords
stop_words = stopwords.words()

# lemmatizer initialization
# nltk.download('averaged_perceptron_tagger') <---- need these lines to downnload
# nltk.download('wordnet') <----------------------- wordnet used for lemmitization
lemmatizer = WordNetLemmatizer()

### Remove Stop Words

In [23]:
def remove_stop_words(text, stop_words):
    word_tokens = word_tokenize(text)
    new_text = [w for w in word_tokens if not w.lower() in stop_words]

    return ' '.join(new_text)

In [24]:
df['Message'] = df['Message'].apply(remove_stop_words, stop_words=stop_words)
df['Subject'] = df['Subject'].apply(remove_stop_words, stop_words=stop_words)

### Lemmatize

In [26]:
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def lemmatize_email(text, lemmatizer):
    words = nltk.word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in words]
    return ' '.join(lemmatized_words)

In [27]:
# lemmatize message line 
df['Message'] = df['Message'].apply(lemmatize_email, lemmatizer=lemmatizer)

# lemmatize subject line
df['Subject'] = df['Subject'].apply(lemmatize_email, lemmatizer=lemmatizer)

In [28]:
df

Unnamed: 0,Subject,Message,Class,Date,length_message,length_subject,urls_count,special_chars_count,urgent_phrase_count,forwarded
0,vastar resource,gary production high island large block 1 2 co...,0,1999-12-13,4282,24,0,1,1,1
1,calpine daily gas nomination,calpine daily gas nomination 1 doc,0,1999-12-14,38,28,0,0,0,0
2,issue,fyi note stella forward stella morris hou ect ...,0,1999-12-14,1171,10,0,0,0,1
3,meter 7268 nov allocation,fyi forward lauri hou ect 12 14 99 12 17 pm ki...,0,1999-12-14,1124,25,0,0,0,1
4,mcmullen gas 11 99,jackie inlet 3 river plant shut 10 19 99 day f...,0,1999-12-14,534,24,0,0,3,0
...,...,...,...,...,...,...,...,...,...,...
33102,iso 8859 1 news edaliss val edumm vl eoggra,welcome gigapharm onlinne shop prescri linecan...,1,2005-07-29,281,82,0,2,0,0
33103,prescript medicine special precise put buck ba...,earlier expect wrap cautiously impressed speed...,1,2005-07-29,803,99,1,1,2,0
33104,generation online pharmacy,ready rock rise solitude show society show tal...,1,2005-07-30,317,37,0,1,0,0
33105,bloow 5 10 time time,learn 5 10 time longer bed read plod net,1,2005-07-30,74,30,0,0,0,0


## Export dataset

In [30]:
import os
def save_dataset(df, name, dir):
    save_path = os.path.join(dir, f'{name}.csv')
    df.to_csv(save_path, index=False)

save_dataset(df, 'clean_enron_spam_data', 'data')