# Imports and preparation

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data\enron_spam_data.csv')
df

Unnamed: 0,Message ID,Subject,Message,Spam/Ham,Date
0,0,christmas tree farm pictures,,ham,1999-12-10
1,1,"vastar resources , inc .","gary , production from the high island larger ...",ham,1999-12-13
2,2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,ham,1999-12-14
3,3,re : issue,fyi - see note below - already done .\nstella\...,ham,1999-12-14
4,4,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,ham,1999-12-14
...,...,...,...,...,...
33711,33711,= ? iso - 8859 - 1 ? q ? good _ news _ c = eda...,"hello , welcome to gigapharm onlinne shop .\np...",spam,2005-07-29
33712,33712,all prescript medicines are on special . to be...,i got it earlier than expected and it was wrap...,spam,2005-07-29
33713,33713,the next generation online pharmacy .,are you ready to rock on ? let the man in you ...,spam,2005-07-30
33714,33714,bloow in 5 - 10 times the time,learn how to last 5 - 10 times longer in\nbed ...,spam,2005-07-30


In [3]:
# Unnecessary columns
df = df.drop(columns='Message ID')
df = df.drop(columns='Date')

In [4]:
# drop NaNs and reset index
df = df.dropna()
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,Subject,Message,Spam/Ham
0,"vastar resources , inc .","gary , production from the high island larger ...",ham
1,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,ham
2,re : issue,fyi - see note below - already done .\nstella\...,ham
3,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,ham
4,mcmullen gas for 11 / 99,"jackie ,\nsince the inlet to 3 river plant is ...",ham


In [5]:
# fix dtypes
df['Message'] = df['Message'].astype(str)
df['Subject'] = df['Subject'].astype(str)

In [6]:
# rename classification column for clarity
df = df.rename(columns={'Spam/Ham': 'Class'})

# remap spam to 1 and ham to 0
df['Class'] = df['Class'].map({'spam': 1, 'ham': 0})

In [7]:
# check the distribution of classes
df['Class'].value_counts()

Class
1    16614
0    16493
Name: count, dtype: int64

# Feature Engineering

## Count features commonly in spam emails

In [8]:
import re
def count_url(text):
    # count the occurrences of 'http', 'https', and 'www'
    count_http = len(re.findall(r'http', text))
    count_https = len(re.findall(r'https', text))
    count_www = len(re.findall(r'www', text))
    
    # return the total count of urls in a new column
    return count_http + count_https + count_www

In [9]:
def count_special_chars(text):
    return len(re.findall(r'[!$%&]', text))

In [10]:
def count_urgency_words(text):
    urgency_words = [
        "immediate", "urgent", "critical", "important", "now", "ASAP", "as soon as possible",
        "emergency", "priority", "alert", "rush", "prompt", "hasten", "swift", "instantly",
        "right away", "without delay", "high priority", "imminent", "pressing", "time - sensitive",
        "expedite", "top priority", "crucial", "vital", "necessary", "quick", "speedy", "at once",
        "rapid", "flash", "instantaneous", "accelerated", "breakneck", "hurry", "immediately",
        "fast-track", "at the earliest", "act now", "don't delay", "on the double", "without hesitation",
        "fast", "soon", "now or never", "urgent action", "right now", "straightaway", "double-time",
        "speed", "express", "high-priority", "pressing need", "at your earliest convenience", "this instant",
        "forthwith", "like a shot", "snap to it", "on the spot", "no time to lose", "no delay",
        "in a hurry", "right this minute", "get going", "with haste"
    ]
    words = re.findall(r'\b\w+\b', text.lower())
    count = sum(1 for word in words if word in urgency_words)
    return count

## Other indicators

In [11]:
def get_length(text):
    return len(text)

In [12]:
def is_forwarded(text):
    if (len(re.findall(r'-', text))) > 9 and len(re.findall(r'forward', text)) > 0:
        return 1
    else:
        return 0

## Create the columns

In [13]:
# email lengths
df['length_message'] = df['Message'].apply(get_length)
df['length_subject'] = df['Subject'].apply(get_length)

# url counts
df['urls_count_message'] = df['Message'].apply(count_url)
df['urls_count_subject'] = df['Subject'].apply(count_url)
df['urls_count'] = df['urls_count_message'] + df['urls_count_subject']
df = df.drop(columns={'urls_count_message', 'urls_count_subject'})

# special char counts
df['special_chars_count_message'] = df['Message'].apply(count_special_chars)
df['special_chars_count_subject'] = df['Subject'].apply(count_special_chars)
df['special_chars_count'] = df['special_chars_count_message'] + df['special_chars_count_subject']
df = df.drop(columns={'special_chars_count_message', 'special_chars_count_subject'})

# urgent phrase counts
df['urgent_phrase_count_message'] = df['Message'].apply(count_urgency_words)
df['urgent_phrase_count_subject'] = df['Subject'].apply(count_urgency_words)
df['urgent_phrase_count'] = df['urgent_phrase_count_message'] + df['urgent_phrase_count_subject']
df = df.drop(columns={'urgent_phrase_count_message', 'urgent_phrase_count_subject'})

# forwarded
df['forwarded'] = df['Message'].apply(is_forwarded)
df.head()

Unnamed: 0,Subject,Message,Class,length_message,length_subject,urls_count,special_chars_count,urgent_phrase_count,forwarded
0,"vastar resources , inc .","gary , production from the high island larger ...",0,4282,24,0,1,1,1
1,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,0,38,28,0,0,0,0
2,re : issue,fyi - see note below - already done .\nstella\...,0,1171,10,0,0,0,1
3,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,0,1124,25,0,0,0,1
4,mcmullen gas for 11 / 99,"jackie ,\nsince the inlet to 3 river plant is ...",0,534,24,0,0,3,0


# Text Cleaning

In [14]:
def clean_text(text):
    # Remove non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Remove excessive spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [15]:
df['Message'] = df['Message'].apply(clean_text)
df['Subject'] = df['Subject'].apply(clean_text)
df.head()

Unnamed: 0,Subject,Message,Class,length_message,length_subject,urls_count,special_chars_count,urgent_phrase_count,forwarded
0,vastar resources inc,gary production from the high island larger bl...,0,4282,24,0,1,1,1
1,calpine daily gas nomination,calpine daily gas nomination 1 doc,0,38,28,0,0,0,0
2,re issue,fyi see note below already done stella forward...,0,1171,10,0,0,0,1
3,meter 7268 nov allocation,fyi forwarded by lauri a allen hou ect on 12 1...,0,1124,25,0,0,0,1
4,mcmullen gas for 11 99,jackie since the inlet to 3 river plant is shu...,0,534,24,0,0,3,0


In [16]:
# imports
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# stop words
# nltk.download('punkt') <------ need these lines to 
# nltk.download('stopwords') <-- load stopwords
stop_words = stopwords.words()

# lemmatizer initialization
# nltk.download('averaged_perceptron_tagger') <---- need these lines to downnload
# nltk.download('wordnet') <----------------------- wordnet used for lemmitization
lemmatizer = WordNetLemmatizer()

### Remove Stop Words

In [17]:
def remove_stop_words(text, stop_words):
    word_tokens = word_tokenize(text)
    new_text = [w for w in word_tokens if not w.lower() in stop_words]

    return ' '.join(new_text)

In [18]:
df['Message'] = df['Message'].apply(remove_stop_words, stop_words=stop_words)
df['Subject'] = df['Subject'].apply(remove_stop_words, stop_words=stop_words)

### Lemmatize

In [19]:
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def lemmatize_email(text, lemmatizer):
    words = nltk.word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in words]
    return ' '.join(lemmatized_words)

In [20]:
# lemmatize message line 
df['Message'] = df['Message'].apply(lemmatize_email, lemmatizer=lemmatizer)

# lemmatize subject line
df['Subject'] = df['Subject'].apply(lemmatize_email, lemmatizer=lemmatizer)

In [21]:
df.head()

Unnamed: 0,Subject,Message,Class,length_message,length_subject,urls_count,special_chars_count,urgent_phrase_count,forwarded
0,vastar resource,gary production high island large block 1 2 co...,0,4282,24,0,1,1,1
1,calpine daily gas nomination,calpine daily gas nomination 1 doc,0,38,28,0,0,0,0
2,issue,fyi note stella forward stella morris hou ect ...,0,1171,10,0,0,0,1
3,meter 7268 nov allocation,fyi forward lauri hou ect 12 14 99 12 17 pm ki...,0,1124,25,0,0,0,1
4,mcmullen gas 11 99,jackie inlet 3 river plant shut 10 19 99 day f...,0,534,24,0,0,3,0


# Sentiment Analysis

## Imports

In [22]:
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

## Initialization

In [23]:
MODEL = 'cardiffnlp/twitter-roberta-base-sentiment-latest'
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.to('cuda')

config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

## Run Emails through Neural Net


In [24]:
def polarity_scores(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {key: value.to('cuda') for key, value in inputs.items()}
    outputs = model(**inputs)
    logits = outputs.logits
    scores = logits[0].cpu().detach().numpy()
    scores = softmax(scores)
    return scores

In [25]:
from tqdm.notebook import tqdm
results = {}
for i, row in tqdm(df.iterrows(), total=len(df)):
    msg = row['Message']
    subject = row['Subject']
    message_scores = polarity_scores(msg)
    subject_scores = polarity_scores(subject)
    message_results = {
        'msg_neg' : message_scores[0],
        'msg_neu' : message_scores[1],
        'msg_pos' : message_scores[2]
    }
    subject_results = {
        'sub_neg' : subject_scores[0],
        'sub_neu' : subject_scores[1],
        'sub_pos' : subject_scores[2]
    }
    results[i] = message_results | subject_results

  0%|          | 0/33107 [00:00<?, ?it/s]

In [26]:
results_df = pd.DataFrame(results).T
results_df

Unnamed: 0,msg_neg,msg_neu,msg_pos,sub_neg,sub_neu,sub_pos
0,0.017629,0.906160,0.076210,0.065139,0.728012,0.206849
1,0.035552,0.877274,0.087174,0.051015,0.819466,0.129519
2,0.031416,0.934020,0.034564,0.265126,0.658116,0.076759
3,0.035840,0.921378,0.042782,0.058819,0.878761,0.062419
4,0.078795,0.898731,0.022474,0.043723,0.874298,0.081979
...,...,...,...,...,...,...
33102,0.008647,0.128509,0.862844,0.030983,0.886038,0.082979
33103,0.015938,0.236931,0.747131,0.026812,0.882739,0.090448
33104,0.042042,0.837309,0.120649,0.032800,0.705230,0.261970
33105,0.045250,0.870471,0.084279,0.029581,0.756366,0.214053


In [27]:
df = pd.concat([df, results_df], axis=1)
df

Unnamed: 0,Subject,Message,Class,length_message,length_subject,urls_count,special_chars_count,urgent_phrase_count,forwarded,msg_neg,msg_neu,msg_pos,sub_neg,sub_neu,sub_pos
0,vastar resource,gary production high island large block 1 2 co...,0,4282,24,0,1,1,1,0.017629,0.906160,0.076210,0.065139,0.728012,0.206849
1,calpine daily gas nomination,calpine daily gas nomination 1 doc,0,38,28,0,0,0,0,0.035552,0.877274,0.087174,0.051015,0.819466,0.129519
2,issue,fyi note stella forward stella morris hou ect ...,0,1171,10,0,0,0,1,0.031416,0.934020,0.034564,0.265126,0.658116,0.076759
3,meter 7268 nov allocation,fyi forward lauri hou ect 12 14 99 12 17 pm ki...,0,1124,25,0,0,0,1,0.035840,0.921378,0.042782,0.058819,0.878761,0.062419
4,mcmullen gas 11 99,jackie inlet 3 river plant shut 10 19 99 day f...,0,534,24,0,0,3,0,0.078795,0.898731,0.022474,0.043723,0.874298,0.081979
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33102,iso 8859 1 news edaliss val edumm vl eoggra,welcome gigapharm onlinne shop prescri linecan...,1,281,82,0,2,0,0,0.008647,0.128509,0.862844,0.030983,0.886038,0.082979
33103,prescript medicine special precise put buck ba...,earlier expect wrap cautiously impressed speed...,1,803,99,1,1,2,0,0.015938,0.236931,0.747131,0.026812,0.882739,0.090448
33104,generation online pharmacy,ready rock rise solitude show society show tal...,1,317,37,0,1,0,0,0.042042,0.837309,0.120649,0.032800,0.705230,0.261970
33105,bloow 5 10 time time,learn 5 10 time longer bed read plod net,1,74,30,0,0,0,0,0.045250,0.870471,0.084279,0.029581,0.756366,0.214053


# TF-IDF

In [28]:
# found that a lot of 2 digit numbers appeared in the top frequencies
import re
def remove_two_digit_numbers(text):
    return re.sub(r'\b\d{2}\b', '', text)

df['Message'] = df['Message'].apply(remove_two_digit_numbers)
df['Subject'] = df['Subject'].apply(remove_two_digit_numbers)

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_msg = TfidfVectorizer(max_features=300, ngram_range=(1, 2))
X_msg = tfidf_msg.fit_transform(df['Message'])
X_msg = pd.DataFrame(X_msg.toarray(), columns=('msg_freq_' + word for word in tfidf_msg.get_feature_names_out()))

tfidf_sub = TfidfVectorizer(max_features=100, ngram_range=(1, 2))
X_sub = tfidf_sub.fit_transform(df['Subject'])
X_sub = pd.DataFrame(X_sub.toarray(), columns=('sub_freq_' + word for word in tfidf_sub.get_feature_names_out()))

bdf = pd.concat([X_msg, X_sub], axis=1)
bdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33107 entries, 0 to 33106
Columns: 400 entries, msg_freq_000 to sub_freq_year
dtypes: float64(400)
memory usage: 101.0 MB


In [30]:
# add additional features to new data frame
columns_to_add = df.drop(columns={'Message', 'Subject'}).columns
bdf[columns_to_add] = df[columns_to_add]
bdf

Unnamed: 0,msg_freq_000,msg_freq_100,msg_freq_2000,msg_freq_2000 pm,msg_freq_2001,msg_freq_2001 pm,msg_freq_2004,msg_freq_2005,msg_freq_500,msg_freq_713,...,urls_count,special_chars_count,urgent_phrase_count,forwarded,msg_neg,msg_neu,msg_pos,sub_neg,sub_neu,sub_pos
0,0.488862,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.140131,0.031776,...,0,1,1,1,0.017629,0.906160,0.076210,0.065139,0.728012,0.206849
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0,0,0,0,0.035552,0.877274,0.087174,0.051015,0.819466,0.129519
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0,0,0,1,0.031416,0.934020,0.034564,0.265126,0.658116,0.076759
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0,0,0,1,0.035840,0.921378,0.042782,0.058819,0.878761,0.062419
4,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0,0,3,0,0.078795,0.898731,0.022474,0.043723,0.874298,0.081979
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33102,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0,2,0,0,0.008647,0.128509,0.862844,0.030983,0.886038,0.082979
33103,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,1,1,2,0,0.015938,0.236931,0.747131,0.026812,0.882739,0.090448
33104,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0,1,0,0,0.042042,0.837309,0.120649,0.032800,0.705230,0.261970
33105,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0,0,0,0,0.045250,0.870471,0.084279,0.029581,0.756366,0.214053


In [31]:
# drop rows with 0 across all word frequencies
bdf = bdf.loc[~(bdf[bdf.drop(columns=columns_to_add).columns] == 0).all(axis=1)]
bdf = bdf.reset_index(drop=True)
bdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32544 entries, 0 to 32543
Columns: 413 entries, msg_freq_000 to sub_pos
dtypes: float32(6), float64(400), int64(7)
memory usage: 101.8 MB


# Export dataset

In [35]:
import pickle
import os
def save_dataset(df, name, dir):
    save_path = os.path.join(dir, f'{name}.pkl')
    with open(save_path, 'wb') as file:
        pickle.dump(df, file)

save_dataset(bdf, 'clean_enron_spam_data', 'data')