# Neural Coreference

In [3]:
from collections import Counter, defaultdict
import itertools
import os, sys, email, re
from nltk.corpus import stopwords 

import pkg_resources
pkg_resources.require('SpaCy==2.1.0')

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import spacy
from spacy import displacy
import neuralcoref

from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
from nltk.stem.porter import PorterStemmer

%matplotlib inline

# Load spaCy's small English model; every `nlp(...)` call below runs this pipeline.
nlp = spacy.load("en_core_web_sm")

# Add neural coref to SpaCy's pipe
# (this is what provides the `._.coref_*` attributes read in later cells)
neuralcoref.add_to_pipe(nlp)

<spacy.lang.en.English at 0x22fb74aae80>

In [4]:
# Read the data into a DataFrame
# (relative path: 'emails.csv' must be in the notebook's working directory)
emails_df = pd.read_csv('emails.csv')
print(emails_df.shape)
emails_df.head()

(517401, 2)


Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [5]:
## Helper functions
def get_text_from_email(msg):
    '''Return the text of an email message as a single string.

    Walks every part of ``msg`` and concatenates, in walk order and with no
    separator, the payload of each part whose content type is ``text/plain``.
    Returns an empty string when there is no such part.
    '''
    plain_payloads = [
        piece.get_payload()
        for piece in msg.walk()
        if piece.get_content_type() == 'text/plain'
    ]
    return ''.join(plain_payloads)

def split_email_addresses(line):
    '''Split a comma-separated address string into a frozenset of addresses.

    Each address has surrounding whitespace removed. A falsy ``line``
    (``None`` or an empty string) yields ``None``.
    '''
    if not line:
        return None
    return frozenset(piece.strip() for piece in line.split(','))

In [6]:
# Parse the emails into a list of email objects
messages = list(map(email.message_from_string, emails_df['message']))
# NOTE: in-place drop. Re-running this cell without reloading emails_df fails,
# because the 'message' column read on the line above is gone afterwards.
emails_df.drop('message', axis=1, inplace=True)
# Get fields from parsed email objects
# (the header names of the first message become the columns for every message)
keys = messages[0].keys()
for key in keys:
    emails_df[key] = [doc[key] for doc in messages]
# Parse content from emails
emails_df['content'] = list(map(get_text_from_email, messages))
# Split multiple email addresses
emails_df['From'] = emails_df['From'].map(split_email_addresses)
emails_df['To'] = emails_df['To'].map(split_email_addresses)

# Extract the root of 'file' as 'user'
emails_df['user'] = emails_df['file'].map(lambda x:x.split('/')[0])
# The parsed message objects are not needed once their fields are in the DataFrame
del messages

emails_df.head()

Unnamed: 0,file,Message-ID,Date,From,To,Subject,Mime-Version,Content-Type,Content-Transfer-Encoding,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,content,user
0,allen-p/_sent_mail/1.,<18782981.1075855378110.JavaMail.evans@thyme>,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",(phillip.allen@enron.com),(tim.belden@enron.com),,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Tim Belden <Tim Belden/Enron@EnronXGate>,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Here is our forecast\n\n,allen-p
1,allen-p/_sent_mail/10.,<15464986.1075855378456.JavaMail.evans@thyme>,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",(phillip.allen@enron.com),(john.lavorato@enron.com),Re:,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,John J Lavorato <John J Lavorato/ENRON@enronXg...,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Traveling to have a business meeting takes the...,allen-p
2,allen-p/_sent_mail/100.,<24216240.1075855687451.JavaMail.evans@thyme>,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT)",(phillip.allen@enron.com),(leah.arsdall@enron.com),Re: test,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Leah Van Arsdall,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,test successful. way to go!!!,allen-p
3,allen-p/_sent_mail/1000.,<13505866.1075863688222.JavaMail.evans@thyme>,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT)",(phillip.allen@enron.com),(randall.gay@enron.com),,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Randall L Gay,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,"Randy,\n\n Can you send me a schedule of the s...",allen-p
4,allen-p/_sent_mail/1001.,<30922949.1075863688243.JavaMail.evans@thyme>,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT)",(phillip.allen@enron.com),(greg.piper@enron.com),Re: Hello,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Greg Piper,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,Let's shoot for Tuesday at 11:45.,allen-p


In [None]:
# Set index and drop columns with too few values
# NOTE: this reassigns emails_df, so the cell can only run once per load:
# after it, 'Message-ID' is the index and no longer a column to set_index on.
emails_df = emails_df.set_index('Message-ID')\
    .drop(['file', 'Mime-Version', 'Content-Type', 'Content-Transfer-Encoding'], axis=1)
# Parse datetime
emails_df['Date'] = pd.to_datetime(emails_df['Date'], infer_datetime_format=True)
emails_df.dtypes

## Basic application of neural coref

### Taking a sample of 500 emails

In [None]:
# Bodies of the first 500 emails, selected by position.
# `.iloc` is used on purpose: emails_df['content'][x] with an integer x is a
# label lookup, and on a frame indexed by Message-ID it only works through
# pandas' positional fallback, which newer pandas versions deprecate.
content_list = emails_df['content'].iloc[:500].tolist()

# Run the spaCy + neuralcoref pipeline once per email body.
doc_list = [nlp(content) for content in content_list]

Look at entity tags for those emails.

In [None]:
# Collect (text, label) for every entity in every processed email.
# Many of the PERSON labels are just whitespace (e.g. \r\n\r\n), so entities
# whose stripped text is empty are left out.
ents_tags = [
    (ent.text.strip(), ent.label_)
    for doc in doc_list
    for ent in doc.ents
    if ent.text.strip()
]

# Keep only the entities labelled PERSON.
persons = [(text, label) for text, label in ents_tags if label == 'PERSON']

# Tally identical (text, label) pairs and show the 20 most frequent.
person_counts = Counter(persons)

person_counts.most_common(20)

In [None]:
# Unique PERSON strings in first-seen order.
# dict.fromkeys de-duplicates in one pass (dicts keep insertion order) instead
# of scanning a growing list once per entity.
raw_people = list(dict.fromkeys(name for name, _label in persons))

Basic functions to clean out some of the text in the emails

In [None]:
# Literal tokens removed from entity text before any regex clean-up.
# Order matters: 'ENRONDEVELOPMENT' has to go before its prefix 'ENRON'.
_PEOPLE_NOISE_TOKENS = (
    'ENRONDEVELOPMENT',
    'ENRON',
    'Contract',
    'Forward',
    'Accepted',
    'Login',
    'URGENT OWA',
)


def _clean_person_name(name):
    '''Return one entity string with noise tokens, "/" suffixes and non-letters removed.'''
    for token in _PEOPLE_NOISE_TOKENS:
        name = name.replace(token, '')
    # Remove line breaks (\r\n, \n or a lone \r). The previous pattern,
    # r'r?\n|\r/', treated the first "r" as a literal letter and so deleted
    # the last "r" of a word that ended a line ("Tyler\nSmith" -> "TyleSmith").
    name = re.sub(r'\r?\n|\r', '', name)
    # Drop the last "/segment" ...
    name = re.sub(r'/[^/]*$', '', name)
    # ... then everything from the first "/" whose remainder contains no "-".
    name = re.sub(r'/[^-]*$', '', name)
    # Strip non-letters first and collapse spaces afterwards, so removing a
    # character between two spaces cannot leave a double space behind.
    name = re.sub(r'[^a-zA-Z ]+', '', name)
    name = re.sub(' +', ' ', name)
    return name.strip()


def clean_people_list(people_list):
    '''Clean every name in ``people_list`` in place and return the same list.

    Each entry has the known noise tokens removed, line breaks dropped,
    trailing "/..." directory-style suffixes cut off, all characters other
    than ASCII letters and spaces removed, runs of spaces collapsed and
    surrounding whitespace stripped. An entry may end up as ''.

    Parameters
    ----------
    people_list : list of str
        Entity strings to clean. The list is modified in place.

    Returns
    -------
    list of str
        ``people_list`` itself, with every entry cleaned.
    '''
    for idx, name in enumerate(people_list):
        people_list[idx] = _clean_person_name(name)
    return people_list

In [14]:
raw_people = clean_people_list(raw_people)

Create a dictionary in which the longest form of each person's name is the key and the values are all the name variants whose words are contained in that key.

In [15]:
# Longest names first, so a full name becomes a key before its shorter
# variants are seen. Ties are broken alphabetically: the order of a set of
# strings is not stable between kernels, and the [0:500] cut below depends on it.
raw_people_sorted = sorted(set(raw_people), key=lambda name: (-len(name), name))

people_dict = defaultdict(list)
all_strings = []  # NOTE(review): never filled in this cell; kept in case another cell reads it
for person in raw_people_sorted[0:500]:
    name_parts = set(person.split())
    if not name_parts:
        # Cleaning can reduce an entity to '', which would match every key.
        continue
    # Existing keys whose words include every word of this name.
    matching_keys = [key for key in people_dict if name_parts <= set(key.split())]
    if matching_keys:
        for key in matching_keys:
            people_dict[key].append(person)
    else:
        # No single key covers this name (including the case where its words
        # are spread over different keys), so it starts its own entry.
        people_dict[person] = [person]

Use this dict to enhance neural coref conversion dict definitions and re-process content.

In [22]:
# Register every name entry in ONE call.
# NOTE(review): set_conv_dict() appears to replace the component's conversion
# table with the dictionary it is given, so calling it once per entry in a loop
# would leave only the last name registered -- confirm against the installed
# neuralcoref version.
nlp.get_pipe('neuralcoref').set_conv_dict(dict(people_dict))

In [23]:
# Process the same email bodies again with the pipeline in its current state.
doc_list2 = [nlp(body) for body in content_list]

Try limiting to emails with a 'PERSON' entity.

In [39]:
# Positions (in doc_list order) of the emails that have at least one entity
# whose label contains 'PERSON'.
filtered_list = [
    position
    for position, doc in enumerate(doc_list)
    if any('PERSON' in ent.label_ for ent in doc.ents)
]

In [42]:
# Re-process the body of each selected email; filtered_list holds positions into content_list.
filtered_docs = [nlp(content_list[position]) for position in filtered_list]

In [None]:
filtered_docs[134]._.coref_resolved

In [36]:
len(filtered_list)

386

In [None]:
# Bodies of every email, in row order.
# The earlier loop appended to content_list by mistake, leaving
# long_content_list empty and growing the 500-email sample instead.
long_content_list = emails_df['content'].tolist()

In [None]:
# Rebuild the 500-email sample by position.
# `.iloc` avoids relying on pandas' positional fallback for integer lookups
# on a Series whose index holds Message-ID strings.
content_list = emails_df['content'].iloc[:500].tolist()

In [16]:
# Text of every PERSON entity, duplicates included, in the order they were found.
orig_people = [entity[0] for entity in persons]

In [17]:
orig_people = clean_people_list(orig_people)

In [31]:
## This doesn't work with 500 emails (at least on my machine)

# content_blob = ' '.join(content_list)
# processed_blob = nlp(content_blob)

In [274]:
length = len(content_list) 
# NOTE(review): this sorts content_list in place (shortest body first), so its
# positions stop matching any list built from it before the sort (doc_list and
# filtered_list are built from content_list in other cells) -- confirm intended.
content_list.sort(key=len) 
# Show the 60th-longest email body
content_list[length-60]

"Reagan,\n\nThank you for the quick response on the bid for the residence.  Below is a \nlist of questions on the specs:\n\n1.  Is the framing Lumber #2 yellow pine?  Wouldn't fir or spruce warp less \nand cost about the same?\n\n2.  What type of floor joist would be used?  2x12 or some sort of factory \njoist?\n\n3.  What type for roof framing?  On site built rafters? or engineered trusses?\n\n4.  Are you planning for insulation between floors to dampen sound?  What \ntype of insulation in floors and ceiling?  Batts or blown?  Fiberglass or \nCellulose?\n\n5.  Any ridge venting or other vents (power or turbine)?\n\n6. Did you bid for interior windows to have trim on 4 sides?  I didn't know \nthe difference between an apron and a stool.\n\n7.  Do you do anything special under the upstairs tile floors to prevent \ncracking?  Double plywood or hardi board underlay?\n\n8.  On the stairs, did you allow for a bannister?  I was thinking a partial \none out of iron.  Only about 5 feet.\n\n9. 

In [307]:
doc_list[length-121]._.coref_resolved

'---------------------- Forwarded by Phillip K Allen/HOU/ECT on 08/20/2000 \n05:38 PM ---------------------------\n\n\n"Lucy Gonzalez" <stagecoachmama@hotmail.com> on 08/17/2000 02:37:55 PM\nTo: pallen@enron.com\ncc:  \nSubject: Daily Report\n\n\n\n   Phillip,\n        Today was one of those days because Wade had to go pay Wade fine and\nI had to go take Wade that takes alot of time out of my schedule.If you get a\nchance will you mention to Wade that Wade needs to, try to fix Wade van so tht\nhis van can go get what ever his van needs. Tomorrow gary is going to be here.I have\nto go but Iwill E-Mail you tomorrow\n                                            Lucy\n\n________________________________________________________________________\nGet Your Private, Free E-mail from MSN Hotmail at http://www.hotmail.com\n\n'

In [31]:
doc_list[70]

Cooper,
 
 Can you give access to the new west power site to Jay Reitmeyer.  He is an 
analyst in our group.

Phillip

In [32]:
doc_list[70]._.has_coref
doc_list[70]._.coref_clusters
doc_list[70]._.coref_resolved

'Cooper,\n \n Can you give access to the new west power site to Jay Reitmeyer.  Jay Reitmeyer is an \nanalyst in our group.\n\nPhillip'

In [24]:
doc_list[86]

Brad,

 With regard to Tori Kuykendall, I would like to promote her to commercial 
manager instead of converting her from a commercial support manager to an 
associate.  Her duties since the beginning of the year have been those of a 
commercial manager.  I have no doubt that she will compare favorably to 
others in that category at year end.  

 Martin Cuilla on the central desk is in a similiar situation as Tori.  
Hunter would like Martin handled the same as Tori.

 Let me know if there are any issues.

Phillip

In [25]:
doc_list[86]._.coref_resolved

'Brad,\n\n With regard to Tori Kuykendall, I would like to promote Tori Kuykendall to commercial \nmanager instead of converting Tori Kuykendall from a commercial support manager to an \nassociate.  Tori Kuykendall duties since the beginning of the year have been those of a \ncommercial manager.  I have no doubt that Tori Kuykendall will compare favorably to \nothers in that category at year end.  \n\n Martin Cuilla on the central desk is in a similiar situation as Tori.  \nHunter would like Martin Cuilla handled the same as Tori.\n\n Let me know if there are any issues.\n\nPhillip'

In [367]:
# doc_list[86]._.has_coref
# doc_list[86]._.coref_clusters
print(doc_list[86]._.coref_clusters)

# 

[Tori Kuykendall: [Tori Kuykendall, her, her, Her, she], Martin Cuilla: [Martin Cuilla, Martin]]
[Tori Kuykendall: [Tori Kuykendall, her, her, Her, she], Martin Cuilla: [Martin Cuilla, Martin]]


In [40]:
# doc_list[86]._.has_coref
doc_list[86]._.coref_clusters
# doc_list[86]._.coref_resolved

[Tori Kuykendall: [Tori Kuykendall, her, her, Her, she],
 Martin Cuilla: [Martin Cuilla, Martin]]

In [41]:
# Run the pipeline again on the coreference-resolved text of each document.
resolved_list = [nlp(doc._.coref_resolved) for doc in doc_list]

In [42]:
# (text, label) for every entity in the resolved documents, skipping entities
# whose stripped text is empty.
ents_tags_resolved = [
    (ent.text.strip(), ent.label_)
    for doc in resolved_list
    for ent in doc.ents
    if ent.text.strip()
]

# Keep only the entities labelled PERSON.
persons_resolved = [
    (text, label) for text, label in ents_tags_resolved if label == 'PERSON'
]

# Tally identical (text, label) pairs and show the 20 most frequent.
person_counts_resolved = Counter(persons_resolved)

person_counts_resolved.most_common(20)

[(('Phillip K Allen', 'PERSON'), 312),
 (('Phillip Allen', 'PERSON'), 70),
 (('Lucy', 'PERSON'), 48),
 (('Keith', 'PERSON'), 40),
 (('Jeff', 'PERSON'), 36),
 (('Keith Holst', 'PERSON'), 26),
 (('Phillip', 'PERSON'), 26),
 (('John', 'PERSON'), 26),
 (('Larry Lewter', 'PERSON'), 24),
 (('Reagan', 'PERSON'), 23),
 (('Larry', 'PERSON'), 23),
 (('Tim Belden', 'PERSON'), 22),
 (('George', 'PERSON'), 21),
 (('Mike Grigsby', 'PERSON'), 20),
 (('Alan Comnes', 'PERSON'), 18),
 (('George Richards', 'PERSON'), 17),
 (('Scott Neal', 'PERSON'), 17),
 (('Creekside Builders', 'PERSON'), 16),
 (('Mike', 'PERSON'), 16),
 (('Gary', 'PERSON'), 16)]

In [29]:
# Extra tokens treated as stop words on top of NLTK's English list.
_EXTRA_STOP_WORDS = ("to","cc","subject","http","from","sent","aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")

# Filled on first use by _clean_resources(), so the stop-word list is loaded
# and the lemmatizer created once rather than on every call to clean().
_CLEAN_CACHE = {}


def _clean_resources():
    '''Return the (stop-word set, lemmatizer) pair, building it on first use.'''
    if not _CLEAN_CACHE:
        stop = set(stopwords.words('english'))
        stop.update(_EXTRA_STOP_WORDS)
        _CLEAN_CACHE['stop'] = stop
        _CLEAN_CACHE['lemma'] = WordNetLemmatizer()
    return _CLEAN_CACHE['stop'], _CLEAN_CACHE['lemma']


def clean(text):
    '''Normalize an email body into a space-separated string of lemmatized words.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, stop words (NLTK English
    plus ``_EXTRA_STOP_WORDS``) are removed, and each remaining word is
    passed through the WordNet lemmatizer. No stemming is applied.

    Parameters
    ----------
    text : str
        Raw text to normalize.

    Returns
    -------
    str
        The kept words, lemmatized and joined by single spaces ('' if none).
    '''
    stop, lemma = _clean_resources()
    # After this substitution only letters and spaces remain, which is why no
    # separate digit or punctuation filtering is needed below.
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text)
    kept_words = [word for word in letters_only.lower().split() if word not in stop]
    return " ".join(lemma.lemmatize(word) for word in kept_words)

In [24]:
# Keep rows with no missing From/To/Date/content that were sent to exactly one recipient.
analysis_df = emails_df[['From', 'To', 'Date', 'content']].dropna()
analysis_df = analysis_df.loc[analysis_df['To'].map(len) == 1].copy()
# Fixed seed so the sample, and everything derived from it, is the same on
# every run. The size is capped so that a frame with fewer than 1000 rows
# does not make sample() raise.
sub_df = analysis_df.sample(n=min(1000, len(analysis_df)), random_state=42)

In [30]:
# Tokenized, cleaned body for each sampled email (one list of words per email).
# sub_df itself is left untouched:
#sub_df["content"]=sub_df["content"].map(clean)
text_clean = [clean(body).split() for body in sub_df['content']]