In [32]:
import numpy as np
import pandas as pd

filepath = "./data/emails.csv"

emails = pd.read_csv(filepath)

print("Successfully loaded {} rows and {} columns".format(emails.shape[0],emails.shape[1]))
print(emails.head(n=5))

Successfully loaded 517401 rows and 2 columns
                       file                                            message
0     allen-p/_sent_mail/1.  Message-ID: <18782981.1075855378110.JavaMail.e...
1    allen-p/_sent_mail/10.  Message-ID: <15464986.1075855378456.JavaMail.e...
2   allen-p/_sent_mail/100.  Message-ID: <24216240.1075855687451.JavaMail.e...
3  allen-p/_sent_mail/1000.  Message-ID: <13505866.1075863688222.JavaMail.e...
4  allen-p/_sent_mail/1001.  Message-ID: <30922949.1075863688243.JavaMail.e...


In [31]:
#print(emails.loc[0]["message"])

In [5]:
import email

def extract_messages(df):
    messages=[]
    for item in df["message"]:
        e=email.message_from_string(item)
        message_body=e.get_payload()
        messages.append(message_body)
    print("Successfully retrieved message body from emails")
    return messages

In [6]:
bodies = extract_messages(emails)

Successfully retrieved message body from emails


In [30]:
bodies_df = pd.DataFrame(bodies)
#bodies_df.head()

In [8]:
filepath2 = "./data/fradulent_emails.txt"
with open(filepath2,'r',encoding="latin1") as file:
    data=file.read()
    
fraud_emails = data.split("From r")
print("Successfully loaded {} spam emails!".format(len(fraud_emails)))

Successfully loaded 3978 spam emails!


In [29]:
fraud_bodies = extract_messages(pd.DataFrame(fraud_emails, columns=["message"],dtype=str))
fraud_bodies_df=pd.DataFrame(fraud_bodies[1:])
#print(fraud_bodies_df.head())

Successfully retrieved message body from emails


In [11]:
Nsamp=1000
maxtokens=50
maxtokenlen=20

In [12]:
def tokenize(row):
    if row in [None,'']:
        tokens=""
    else:
        tokens=str(row).split(" ")[:maxtokens]
    return tokens

In [13]:
import re
def reg_expressions(row):
    tokens=[]
    try:
        for token in row:
            token=token.lower()
            token=re.sub(r'[\W\d]', "", token)
            token=token[:maxtokenlen]
            tokens.append(token)
    except:
        token=""
        tokens.append(token)
    return tokens

In [14]:
import nltk

nltk.download('stopword')
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

def stop_words_removal(row):
    token = [token for token in row if token not in stopwords]
    token = filter(None, token)
    return token

[nltk_data] Error loading stopword: Package 'stopword' not found in
[nltk_data]     index


In [16]:
import random
EnronEmails = bodies_df.iloc[:,0].apply(tokenize)
EnronEmails = EnronEmails.apply(stop_words_removal)

In [19]:
EnronEmails = EnronEmails.apply(reg_expressions)
EnronEmails = EnronEmails.sample(Nsamp)

SpamEmails = fraud_bodies_df.iloc[:,0].apply(tokenize)
SpamEmails = SpamEmails.apply(stop_words_removal)
SpamEmails = SpamEmails.apply(reg_expressions)
SpamEmails = SpamEmails.sample(Nsamp)

raw_data = pd.concat([SpamEmails,EnronEmails], axis=0).values

In [28]:
print("Shape of combined data represented as NumPy array is:")
print(raw_data.shape)
print("Data represented as NumPy array is:")
#print(raw_data)

Shape of combined data represented as NumPy array is:
(2000,)
Data represented as NumPy array is:


In [21]:
Categories = ['spam', 'notspam']
header = ([1]*Nsamp)
header.extend(([0]*Nsamp))

In [25]:
def assemble_bag(data):
    used_tokens=[]
    all_tokens=[]
    
    for item in data:
        for token in item:
            if token in all_tokens:
                if token not in used_tokens:
                    used_tokens.append(token)
            else:
                all_tokens.append(token)
    df=pd.DataFrame(0, index=np.arange(len(data)), columns=used_tokens)
    
    for i, item in enumerate(data):
        for token in item:
            if token in used_tokens:
                df.iloc[i][token] += 1
    return df

In [27]:
EnronSpamBag = assemble_bag(raw_data)
#EnronSpamBag

In [33]:
predictors=[column for column in EnronSpamBag.columns]

In [34]:
def unison_shuffle_data(data,header):
    p=np.random.permutation(len(header))
    data=data[p]
    header=np.asarray(header)[p]
    return data,header

In [None]:
data, header = unison_shuffle_data(EnronSpamBag.values, header)
idx = int(0.7*data.shape[0])
train_x = data[:idx]
train_y = header[:idx]
test_x = data[:idx]
test_y = header[:idx:]