# Neural Coreference

In [2]:
from collections import defaultdict
import re

import pkg_resources
pkg_resources.require('SpaCy==2.1.0')

import pandas as pd
import spacy
import neuralcoref

%matplotlib inline

nlp = spacy.load("en_core_web_sm")

# Add neural coref to SpaCy's pipe
neuralcoref.add_to_pipe(nlp)

<spacy.lang.en.English at 0x1afc462bf98>

In [3]:
# Read the data into a DataFrame
emails_df = pd.read_csv('emails_processed.csv')
print(emails_df.shape)
emails_df.head()

(517401, 14)


Unnamed: 0,Message-ID,Date,From,To,Subject,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,content,user
0,<18782981.1075855378110.JavaMail.evans@thyme>,2001-05-14 16:39:00-07:00,frozenset({'phillip.allen@enron.com'}),frozenset({'tim.belden@enron.com'}),,Phillip K Allen,Tim Belden <Tim Belden/Enron@EnronXGate>,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Here is our forecast\n\n,allen-p
1,<15464986.1075855378456.JavaMail.evans@thyme>,2001-05-04 13:51:00-07:00,frozenset({'phillip.allen@enron.com'}),frozenset({'john.lavorato@enron.com'}),Re:,Phillip K Allen,John J Lavorato <John J Lavorato/ENRON@enronXg...,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Traveling to have a business meeting takes the...,allen-p
2,<24216240.1075855687451.JavaMail.evans@thyme>,2000-10-18 03:00:00-07:00,frozenset({'phillip.allen@enron.com'}),frozenset({'leah.arsdall@enron.com'}),Re: test,Phillip K Allen,Leah Van Arsdall,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,test successful. way to go!!!,allen-p
3,<13505866.1075863688222.JavaMail.evans@thyme>,2000-10-23 06:13:00-07:00,frozenset({'phillip.allen@enron.com'}),frozenset({'randall.gay@enron.com'}),,Phillip K Allen,Randall L Gay,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,"Randy,\n\n Can you send me a schedule of the s...",allen-p
4,<30922949.1075863688243.JavaMail.evans@thyme>,2000-08-31 05:07:00-07:00,frozenset({'phillip.allen@enron.com'}),frozenset({'greg.piper@enron.com'}),Re: Hello,Phillip K Allen,Greg Piper,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,Let's shoot for Tuesday at 11:45.,allen-p


-----------

In [6]:
def process_docs(df, column='content', is_string=False):
    if not is_string:
        content_list = []
        for x in range(len(df)):
            content_list.append(df[column][x])
    else:
        try:
            if isinstance(df[column], list):
                content_list = df[column]
        except:
            'Needs a list of strings'

    doc_list = []
    for content in content_list:
        doc_list.append(nlp(content))
        
    return doc

In [7]:
process_docs(emails_df[0:500])

[Here is our forecast
 
  ,
 Traveling to have a business meeting takes the fun out of the trip.  Especially if you have to prepare a presentation.  I would suggest holding the business plan meetings here then take a trip without any formal business meetings.  I would even try and get some honest opinions on whether a trip is even desired or necessary.
 
 As far as the business meetings, I think it would be more productive to try and stimulate discussions across the different groups about what is working and what is not.  Too often the presenter speaks and the others are quiet just waiting for their turn.   The meetings might be better if held in a round table discussion format.  
 
 My suggestion for where to go is Austin.  Play golf and rent a ski boat and jet ski's.  Flying somewhere takes too much time.,
 test successful.  way to go!!!,
 Randy,
 
  Can you send me a schedule of the salary and level of everyone in the 
 scheduling group.  Plus your thoughts on any changes that need 

In [26]:
## NeuralCoref has attributes that show the groupings and whether or not the doc has anything that was coresolved

doc_list[86]._.coref_resolved

'Brad,\n\n With regard to Tori Kuykendall, I would like to promote Tori Kuykendall to commercial \nmanager instead of converting Tori Kuykendall from a commercial support manager to an \nassociate.  Tori Kuykendall duties since the beginning of the year have been those of a \ncommercial manager.  I have no doubt that Tori Kuykendall will compare favorably to \nothers in that category at year end.  \n\n Martin Cuilla on the central desk is in a similiar situation as Tori.  \nHunter would like Martin Cuilla handled the same as Tori.\n\n Let me know if there are any issues.\n\nPhillip'

In [28]:
def get_entity_list(spacy_list, tag='PERSON'):
    ents_tags = [(e.text.strip(), e.label_) for x in range(len(spacy_list)) for e in spacy_list[x].ents if e.text.strip()]
    ents = [e for e in ents_tags if e[1] == tag]

    raw_ents = []
    for ent in ents:
        if ent[0] not in raw_ents:
            raw_ents.append(ent[0])
    return raw_ents

In [29]:
def clean_enron_people_list(people_list):
    for x in range(len(people_list)):
        people_list[x] = people_list[x].replace('ENRONDEVELOPMENT', '')
        people_list[x] = people_list[x].replace('ENRON', '')
        people_list[x] = people_list[x].replace('Contract', '')
        people_list[x] = people_list[x].replace('Forward', '')
        people_list[x] = people_list[x].replace('Accepted', '')
        people_list[x] = people_list[x].replace('Login', '')
        people_list[x] = people_list[x].replace('URGENT OWA', '')    
        people_list[x] = re.sub(r'r?\n|\r/', '', people_list[x])
        people_list[x] = re.sub(r'/[^/]*$', '', people_list[x])
        people_list[x] = re.sub(r'/[^-]*$', '', people_list[x])
        people_list[x] = re.sub(' +', ' ', people_list[x])
        people_list[x] = re.sub(r'[^a-zA-Z ]+', '', people_list[x])
        people_list[x] = people_list[x].strip()
    return(people_list)

In [30]:
raw_people = clean_enron_people_list(get_entity_list(doc_list))

In [35]:
def enhance_neural_coref_pipeline(people_list, reprocess_df='', reprocess=False):
    raw_people_sorted = sorted(set(people_list), key=len, reverse=True)

    people_dict = defaultdict(list)
    all_strings = []
    for person in raw_people_sorted:
        if not people_dict:
            people_dict[person] = [person]
        else:
            keys = [i.split(' ') for i in list(people_dict.keys())]
            if all([name in [item for sublist in keys for item in sublist] for name in person.split()]):
                for key, value in people_dict.items():
                    if all([name in key.split() for name in person.split()]):
                        people_dict[key].append(person) 
            else:
                people_dict[person] = [person]
            
    for key, value in people_dict.items():
        nlp.get_pipe('neuralcoref').set_conv_dict({key: value})
    
    if reprocess:
        return process_docs(reprocess_df)
    else:
        return people_dict

In [36]:
enhance_neural_coref_pipeline(raw_people)

defaultdict(list,
            {'Robert NeustaedterDEVELOPMENT': ['Robert NeustaedterDEVELOPMENT',
              'Robert',
              ''],
             'V Accounting Mechanisms': ['V Accounting Mechanisms', ''],
             'Bentley CompanyExchange': ['Bentley CompanyExchange', ''],
             'Janie Tholt Frank Ermis': ['Janie Tholt Frank Ermis',
              'Janie Tholt',
              'Frank Ermis',
              'Janie',
              'Frank',
              ''],
             'DEVELOPMENTDEVELOPMENT': ['DEVELOPMENTDEVELOPMENT', ''],
             'Matt Lenhart Randy Gay': ['Matt Lenhart Randy Gay',
              'Matt Lenhart',
              'Randy Gay',
              'Randy',
              'Matt',
              ''],
             'Elizabeth L Hernandez': ['Elizabeth L Hernandez',
              'Elizabeth L',
              ''],
             'Mary Theresa Franklin': ['Mary Theresa Franklin', 'Mary', ''],
             'Mongorr the Merciful': ['Mongorr the Merciful', ''],
        

In [43]:
def clean_threads(subject_df, subject_col='Subject'):
    subject_df = subject_df.astype(str)
    
    for index, row in subject_df.iterrows():
        row[subject_col] = row[subject_col].replace('Re:', '')
        row[subject_col] = row[subject_col].replace('re:', '')
        row[subject_col] = row[subject_col].replace('RE:', '')
        row[subject_col] = row[subject_col].replace('FW:', '')
        row[subject_col] = row[subject_col].strip()
    return subject_df

In [45]:
def process_threads(subject_df, base_df, subject_col='Subject', content_col='content'):
    subject_df = clean_threads(subject_df)
    
    # Group by subject
    subject_dict = subject_df.groupby(subject_col).groups
    
    # Get content for emails in same thread
    thread_dict = defaultdict(list)
    thread_dict_processed = defaultdict(list)
    for key, value in subject_dict.items():
        for value in subject_dict[key]:
            thread_dict[key].append(base_df.loc[value][content_col])

    for key in thread_dict.keys():
        thread_dict_processed[key] = ' '.join(thread_dict[key])
    
    # Pass processed strings to SpaCy pipeline
    for key in thread_dict_processed.keys():
        thread_dict_processed[key] = nlp(thread_dict_processed[key])
    
    return thread_dict_processed

In [49]:
processed_threads = process_threads(emails_df[['Subject']][0:500], base_df = emails_df)

In [50]:
processed_threads['(No Subject)']._.coref_resolved

"Greg,\n\n Got your message.  Good luck on the bike ride.\n\n What were you doing to your apartment?  Are you setting up a studio?\n\n The kids are back in school.  Otherwise just work is going on here.\n\nKeith Greg,\n\n            The kids are into typical toys and games.  Justin likes power \nranger stuff.  Kelsey really likes art.  Books would also be good.  \n\n            Greg,\n\n  are spending Christmas in Houston with Heather's sister.  Greg,\n\n  \nare planning to come to San Marcos for New Years.  \n\n            How long will you stay?  what are your plans?  Email me with \nlatest happenings with you in the big city.\n\nkeith"