## Spam classification with the Enron Email dataset

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn import utils

## Read in Data

In [2]:
PATH = 'assets/dataset/enron1-training-data-raw/'

# Each sub-directory of PATH is treated as one class label.
# Keep directories only (a stray file such as .DS_Store would make the
# per-folder os.listdir call fail) and sort, because os.listdir returns
# entries in arbitrary order.
folders = sorted(
    entry for entry in os.listdir(PATH)
    if os.path.isdir(os.path.join(PATH, entry))
)
folders

['ham', 'spam']

In [3]:
from collections import defaultdict
RANDOM_STATE = 42  # fixed seed so the shuffled row order is the same on every run

## Collect one {Label, Features} record per email and build the frame once.
## This avoids a wide per-category frame padded with NaN that would then
## have to be melted and cleaned up with dropna.
records = []

for category in folders:
    category_dir = os.path.join(PATH, category)

    ## only read in the text files (match the extension, not a substring),
    ## sorted because os.listdir gives no ordering guarantee
    file_names = sorted(
        name for name in os.listdir(category_dir) if name.endswith('.txt')
    )

    for file_name in file_names:
        file_path = os.path.join(category_dir, file_name)

        with open(file_path, encoding='latin-1') as fp:
            # readlines() keeps each trailing newline, so joining with ' '
            # produces "\n " between consecutive lines of the email.
            records.append({"Label": category, "Features": ' '.join(fp.readlines())})

## Rows are numbered 0..N-1 in folder-then-file order before shuffling;
## shuffle keeps those index labels, so df.loc[<label>] stays meaningful.
df = pd.DataFrame(records, columns=["Label", "Features"])
df = utils.shuffle(df, random_state=RANDOM_STATE)
df.head()

Unnamed: 0,Label,Features
77,ham,"Subject: koch three rivers , # 6722\n the outa..."
1056,ham,Subject: re : panenergy marketing march 2000 p...
4153,spam,Subject: \n
3968,spam,Subject: hi paliourg have pills here . everyth...
633,ham,Subject: fw : men & cars\n because i ' m a man...


In [4]:
# Look at the raw text of the email whose index label is 3.
df.loc[3, 'Features']

'Subject: re : issue\n fyi - see note below - already done .\n stella\n - - - - - - - - - - - - - - - - - - - - - - forwarded by stella l morris / hou / ect on 12 / 14 / 99 10 : 18\n am - - - - - - - - - - - - - - - - - - - - - - - - - - -\n from : sherlyn schumack on 12 / 14 / 99 10 : 06 am\n to : stella l morris / hou / ect @ ect\n cc : howard b camp / hou / ect @ ect\n subject : re : issue\n stella ,\n this has already been taken care of . you did this for me yesterday .\n thanks .\n howard b camp\n 12 / 14 / 99 09 : 10 am\n to : stella l morris / hou / ect @ ect\n cc : sherlyn schumack / hou / ect @ ect , howard b camp / hou / ect @ ect , stacey\n neuweiler / hou / ect @ ect , daren j farmer / hou / ect @ ect\n subject : issue\n stella ,\n can you work with stacey or daren to resolve\n hc\n - - - - - - - - - - - - - - - - - - - - - - forwarded by howard b camp / hou / ect on 12 / 14 / 99 09 : 08\n am - - - - - - - - - - - - - - - - - - - - - - - - - - -\n from : sherlyn schumack 12

In [5]:
# (rows, columns) of the loaded frame.
df.shape

(5172, 2)

## A bit of cleaning with textacy

In [8]:
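# The cells below import `textacy.preprocess`; this notebook was run with textacy 0.6.1.
# Uncomment to install that version into the kernel's environment:
#%pip install textacy==0.6.1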

Collecting textacy
  Downloading https://files.pythonhosted.org/packages/41/9f/22b9dec63bff5e6ef7fb47b2cd37025087c3995b6ca5467d78160f5b0eb3/textacy-0.6.1-py2.py3-none-any.whl (137kB)
[K    100% |████████████████████████████████| 143kB 3.2MB/s ta 0:00:01
[?25hCollecting tqdm>=4.11.1 (from textacy)
  Downloading https://files.pythonhosted.org/packages/93/24/6ab1df969db228aed36a648a8959d1027099ce45fad67532b9673d533318/tqdm-4.23.4-py2.py3-none-any.whl (42kB)
[K    100% |████████████████████████████████| 51kB 6.9MB/s eta 0:00:01
[?25hCollecting unidecode>=0.04.19 (from textacy)
  Downloading https://files.pythonhosted.org/packages/59/ef/67085e30e8bbcdd76e2f0a4ad8151c13a2c5bce77c85f8cad6e1f16fb141/Unidecode-1.0.22-py2.py3-none-any.whl (235kB)
[K    100% |████████████████████████████████| 235kB 3.3MB/s ta 0:00:01
[?25hCollecting ijson>=2.3 (from textacy)
  Downloading https://files.pythonhosted.org/packages/7f/e9/8508c5f4987ba238a2b169e582c1f70a47272b22a2f1fb06b9318201bb9e/ijson-2.3-py2

  Running setup.py bdist_wheel for murmurhash ... [?25ldone
[?25h  Stored in directory: /Users/ystrano/Library/Caches/pip/wheels/b8/94/a4/f69f8664cdc1098603df44771b7fec5fd1b3d8364cdd83f512
  Running setup.py bdist_wheel for preshed ... [?25ldone
[?25h  Stored in directory: /Users/ystrano/Library/Caches/pip/wheels/8f/85/06/2d132fb649a6bbcab22487e4147880a55b0dd0f4b18fdfd6b5
  Running setup.py bdist_wheel for thinc ... [?25ldone
[?25h  Stored in directory: /Users/ystrano/Library/Caches/pip/wheels/d8/5c/3e/9acf5d9974fb1c9e7b467563ea5429c9325f67306e93147961
  Running setup.py bdist_wheel for pathlib ... [?25ldone
[?25h  Stored in directory: /Users/ystrano/Library/Caches/pip/wheels/f9/b2/4a/68efdfe5093638a9918bd1bb734af625526e849487200aa171
  Running setup.py bdist_wheel for dill ... [?25ldone
[?25h  Stored in directory: /Users/ystrano/Library/Caches/pip/wheels/99/c4/ed/1b64d2d5809e60d5a3685530432f6159d6a9959739facb61f2
  Running setup.py bdist_wheel for regex ... [?25ldone
[?25h

In [10]:
import textacy.preprocess as preprocess

def clean_enron(text):
    """Normalise one raw email string with textacy's ``preprocess_text``.

    The call enables unicode fixing, lower-casing, punctuation and
    contraction handling, and the e-mail / phone-number / number
    substitutions; URL and currency-symbol handling are left switched off.

    Parameters
    ----------
    text : str
        Raw email text.

    Returns
    -------
    The value returned by ``preprocess.preprocess_text`` for ``text``.
    """
    options = dict(
        fix_unicode=True,
        lowercase=True,
        no_urls=False,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=True,
        no_currency_symbols=False,
        no_punct=True,
        no_contractions=True,
    )
    return preprocess.preprocess_text(text, **options)

In [11]:
# Same email as inspected earlier (index label 3), after cleaning.
clean_enron(df.loc[3, 'Features'])

'subject re issue\n fyi see note below already done stella\n forwarded by stella l morris hou ect on number number number number number am from sherlyn schumack on number number number number number am\n to stella l morris hou ect ect\n cc howard b camp hou ect ect\n subject re issue\n stella this has already been taken care of you did this for me yesterday thanks howard b camp\n number number number number number am\n to stella l morris hou ect ect\n cc sherlyn schumack hou ect ect howard b camp hou ect ect stacey\n neuweiler hou ect ect daren j farmer hou ect ect\n subject issue\n stella can you work with stacey or daren to resolve\n hc\n forwarded by howard b camp hou ect on number number number number number am from sherlyn schumack number number number number number pm\n to howard b camp hou ect ect\n cc subject issue\n i have to create accounting arrangement for purchase from unocal energy at\n meter number deal not tracked for number number volume on deal number expired number n

## Perform your basic EDA - don't spend more than 5-10 minutes

In [14]:
# EDA: (rows, columns) of the frame.
df.shape

(5172, 2)

In [16]:
# EDA: per-column dtype and non-null count, plus memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5172 entries, 77 to 460
Data columns (total 2 columns):
Label       5172 non-null object
Features    5172 non-null object
dtypes: object(2)
memory usage: 281.2+ KB


## Use sklearn's count and tf-idf vectorizers, followed by a model of your choice, to classify the emails

Experiment with preprocessing steps.  Do you get better results with cleaned or uncleaned data? 

To clean the data, use:

    df['Features'] = df['Features'].apply(lambda x: clean_enron(x))

In [19]:
# Replace every email's text with its cleaned version (overwrites the column).
df['Features'] = df['Features'].apply(clean_enron)