# Actionable Email Item Detection

In [53]:
import numpy as np
import pandas as pd
import re

import nltk

In [2]:
# Path to the emails CSV file (a file, not a directory); set this before your experiments.
# The file is read with pd.read_csv below and must provide a 'message' column.
dataset = '/home/syed.b/emails.csv'

In [3]:
# Pick a ~1% random sample of rows to get a feel for the data.
# Row 0 is the CSV header and is always kept; every other row is skipped with
# probability 0.99. A locally seeded generator is used instead of the global
# numpy RNG so that the sample -- and therefore positional lookups such as
# df_sample['message'][11] further down -- is the same after a kernel restart.
_sample_rng = np.random.RandomState(0)
df_sample = pd.read_csv(dataset, skiprows=lambda x: x > 0 and _sample_rng.rand() > 0.01)

In [4]:
# (rows, columns) of the ~1% sample
df_sample.shape

(5148, 2)

In [5]:
# Look at one raw message: the header block has to be stripped before the body can be used
df_sample['message'][0]

"Message-ID: <2068674.1075855691152.JavaMail.evans@thyme>\nDate: Tue, 25 Apr 2000 05:22:00 -0700 (PDT)\nFrom: phillip.allen@enron.com\nTo: hargr@webtv.net\nSubject: Re: #30\nMime-Version: 1.0\nContent-Type: text/plain; charset=us-ascii\nContent-Transfer-Encoding: 7bit\nX-From: Phillip K Allen\nX-To: hargr@webtv.net (Neal Hargrove) @ ENRON\nX-cc: \nX-bcc: \nX-Folder: \\Phillip_Allen_Dec2000\\Notes Folders\\'sent mail\nX-Origin: Allen-P\nX-FileName: pallen.nsf\n\n2000-1969=31"

In [6]:
# Establish cleaning patterns
# The X-FileName line is the last header line; everything after it is the body.
raw_message = df_sample['message'][11]
after_header = raw_message.split('FileName:')[1].split('\n', 1)[1]

# Drop empty lines and trim surrounding whitespace from the remaining ones
[line.strip() for line in after_header.strip('\n').split('\n') if line]


['---------------------- Forwarded by Phillip K Allen/HOU/ECT on 10/30/2000',
 '09:14 AM ---------------------------',
 '',
 '',
 'Enron Technology',
 '',
 'From:  Stephen Stock                           10/27/2000 12:49 PM',
 '',
 'To: Phillip K Allen/HOU/ECT@ECT',
 'cc:',
 'Subject: ERMS / RMS Databases',
 'Phillip,',
 'It looks as though we have most of the interim hardware upgrades in the',
 'building now, although we are still expecting a couple of components to',
 'arrive on Monday.',
 'The Unix Team / DBA Team and Applications team expect to have a working test',
 "server environment on Tuesday. If anything changes, I'll let you know.",
 'best regards',
 'Steve Stock']

In [7]:
# Load the whole data (the sample above was only used for exploration)

df = pd.read_csv(dataset)

In [8]:
# (rows, columns) of the full dataset
df.shape

(517401, 2)

In [9]:
# Store the tokenized emails here: emails[i] is the list of sentences of row i of df.
# The body is flattened to a single string before nltk's sentence tokenizer runs
# because, as the sample above shows, one sentence is often wrapped over several lines.

def extract_body(message):
    """Return the body of a raw email as a single space-joined string.

    Everything up to and including the header line containing 'FileName:' is
    dropped. Only the FIRST occurrence of the marker is treated as the header,
    so a body that itself quotes another 'FileName:' line is not cut short.
    If the marker is missing, the whole message is treated as body instead of
    raising IndexError. Lines are stripped, and empty or whitespace-only lines
    are discarded before joining.
    """
    _, marker, after_marker = message.partition('FileName:')
    # Skip the remainder of the header line itself (e.g. the mailbox file name)
    body = after_marker.partition('\n')[2] if marker else message
    stripped_lines = (line.strip() for line in body.split('\n'))
    return ' '.join(line for line in stripped_lines if line)


# Iterate over the column directly instead of doing a label lookup per row
emails = [nltk.sent_tokenize(extract_body(message)) for message in df['message']]

In [10]:
# Sanity check: the second email should now be a list of clean sentences without headers
emails[1]

['Traveling to have a business meeting takes the fun out of the trip.',
 'Especially if you have to prepare a presentation.',
 'I would suggest holding the business plan meetings here then take a trip without any formal business meetings.',
 'I would even try and get some honest opinions on whether a trip is even desired or necessary.',
 'As far as the business meetings, I think it would be more productive to try and stimulate discussions across the different groups about what is working and what is not.',
 'Too often the presenter speaks and the others are quiet just waiting for their turn.',
 'The meetings might be better if held in a round table discussion format.',
 'My suggestion for where to go is Austin.',
 "Play golf and rent a ski boat and jet ski's.",
 'Flying somewhere takes too much time.']

## First Filter


Check for action words. A curated list of action words is specified and if our sentences from the emails contain one of these words, we store it for a further filtering process.

The action words file (`action_words.txt`) must be in the same directory as this Jupyter Notebook.

In [49]:
# Read in action verbs: one per line, lower-cased, blank lines ignored
with open('./action_words.txt', 'r') as f:
    action_verbs = [line.strip().lower() for line in f if line.strip()]


# Storing the actionable items as a tuple of 3 items - (text, action-words, no. of action-words in sentence)
action_texts = []

# Due to memory/time constraints, we do not scan all emails, rather take the first 100,000 to show results
MAX_EMAILS = 100000

for item in emails[:MAX_EMAILS]:
    for text in item:
        # Tokenize once per sentence (not once per verb); a set gives constant-time lookups.
        # Tokens are whitespace-separated, so a word with attached punctuation ("send.") does not match.
        tokens = set(text.lower().split())

        # Iterating over action_verbs keeps the matches in word-list order
        matched_verbs = [verb for verb in action_verbs if verb in tokens]
        if matched_verbs:
            action_texts.append((text, matched_verbs, len(matched_verbs)))
                
            
                

In [54]:
# Preview the first 20 candidates: (sentence, matched action words, number of matches)
action_texts[:20]

[('As far as the business meetings, I think it would be more productive to try and stimulate discussions across the different groups about what is working and what is not.',
  ['think'],
  1),
 ('My suggestion for where to go is Austin.', ['go', 'suggestion'], 2),
 ("Play golf and rent a ski boat and jet ski's.", ['play'], 1),
 ('Randy, Can you send me a schedule of the salary and level of everyone in the scheduling group.',
  ['send'],
  1),
 ('Please cc the following distribution list with updates: Phillip Allen (pallen@enron.com) Mike Grigsby (mike.grigsby@enron.com) Keith Holst (kholst@enron.com) Monique Sanchez Frank Ermis John Lavorato Thank you for your help Phillip Allen',
  ['help'],
  1),
 ("1. login:  pallen pw: ke9davis I don't think these are required by the ISP 2.  static IP address IP: 64.216.90.105 Sub: 255.255.255.248 gate: 64.216.90.110 DNS: 151.164.1.8 3.",
  ['think'],
  1),
 ('kWh deal must have limited/ > no risk forward gas price to make deal work.',
  ['make'],


## Hierarchical Filters

**NOTE**: This section is applicable when we want to achieve high precision i.e. we might miss a few action items from the original list but we will achieve high accuracy on whatever has been retrieved.

### First Level:

**Filter A**: Keep a sentence only if it contains more than one action word and is shorter than 30 words (whitespace-separated tokens).


### Second Level:

**Filter B**: Keep a sentence only if it contains at least one object pronoun or subject pronoun. Both lists are defined in the code cell below.


### Third Level:


**Filter C**: Disregard those with negation words.




In [58]:
# Filters A, B, C applied in sequence to the first-filter candidates

object_pronouns = ['me', 'her', 'him', 'us', 'them']
subject_pronouns = ['i', 'we', 'you', 'he', 'she', 'they']
negation_words = ["shouldn't", "couldn't", "wouldn't"]

# A sentence passes filter B when it contains a pronoun from either list
any_pronoun = set(object_pronouns) | set(subject_pronouns)
negations = set(negation_words)

# Filter A (more than one action word, fewer than 30 tokens) followed by filter B
filtered_action_texts = []
for sentence, _matched_verbs, verb_count in action_texts:
    if verb_count <= 1 or len(sentence.split()) >= 30:
        continue
    if any_pronoun.intersection(sentence.lower().split()):
        filtered_action_texts.append(sentence)

# Filter C: drop every sentence that contains one of the negation words
final_action_texts = [
    sentence
    for sentence in filtered_action_texts
    if negations.isdisjoint(sentence.lower().split())
]
    
        
        
        
        

In [59]:
# Sentences that passed filters A, B and C (repeated sentences are not de-duplicated)
final_action_texts

['We really need a single point of contact to help drive the trader requirements and help come to a consensus regarding the requirements.',
 'We really need a single point of contact to help drive the trader requirements and help come to a consensus regarding the requirements.',
 'Please get back to me as soon as your schedule permits regarding the site visit and feel free to call at any time.',
 'Please get back to me as soon as your schedule permits regarding the site visit and feel free to call at any time.',
 'Phillip, I will call you today to go over this more thoroughly.',
 'Please get back to me as soon as your schedule permits regarding the site visit and feel free to call at any time.',
 'During this period, you can use Outlook Web Access (OWA) via your= web browser (Internet Explorer 5.0) to read and send mail.',
 'When you open the file, go to the "Checkbook" tab and look at the yellow highlighted items.',
 "I will read it this weekend and ask my dad about the a/c's.",
 'Wha