In [2]:
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize

# The Dataset
This Enron dataset includes labels on roughly 1,700 of the emails. Information on how the dataset is labeled can be found [here](https://data.world/brianray/enron-email-dataset)

Because only some of the emails are labeled, the first step is to save the labeled emails in their own file. This will be our working dataset.

In [15]:
# Read the full Enron export, using the first CSV column as the row index.
# low_memory=False has pandas process the file in one pass instead of in
# chunks, which avoids mixed-dtype guesses on this wide table.
enron_df = pd.read_csv(
    './enron.csv',
    index_col=0,
    low_memory=False,
)
enron_df.head()

Unnamed: 0,Message-ID,Date,From,To,Subject,X-From,X-To,X-cc,X-bcc,X-Folder,...,Cat_10_level_1,Cat_10_level_2,Cat_10_weight,Cat_11_level_1,Cat_11_level_2,Cat_11_weight,Cat_12_level_1,Cat_12_level_2,Cat_12_weight,labeled
0,<18782981.1075855378110.JavaMail.evans@thyme>,2001-05-14 23:39:00,frozenset({'phillip.allen@enron.com'}),frozenset({'tim.belden@enron.com'}),,Phillip K Allen,Tim Belden <Tim Belden/Enron@EnronXGate>,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",...,,,,,,,,,,False
1,<15464986.1075855378456.JavaMail.evans@thyme>,2001-05-04 20:51:00,frozenset({'phillip.allen@enron.com'}),frozenset({'john.lavorato@enron.com'}),Re:,Phillip K Allen,John J Lavorato <John J Lavorato/ENRON@enronXg...,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",...,,,,,,,,,,False
2,<24216240.1075855687451.JavaMail.evans@thyme>,2000-10-18 10:00:00,frozenset({'phillip.allen@enron.com'}),frozenset({'leah.arsdall@enron.com'}),Re: test,Phillip K Allen,Leah Van Arsdall,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,...,,,,,,,,,,False
3,<13505866.1075863688222.JavaMail.evans@thyme>,2000-10-23 13:13:00,frozenset({'phillip.allen@enron.com'}),frozenset({'randall.gay@enron.com'}),,Phillip K Allen,Randall L Gay,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,...,,,,,,,,,,False
4,<30922949.1075863688243.JavaMail.evans@thyme>,2000-08-31 12:07:00,frozenset({'phillip.allen@enron.com'}),frozenset({'greg.piper@enron.com'}),Re: Hello,Phillip K Allen,Greg Piper,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,...,,,,,,,,,,False


In [16]:
# Keep only the rows whose `labeled` flag is True and renumber them from 0.
enron_df = (
    enron_df
    .loc[enron_df['labeled'].eq(True)]
    .reset_index(drop=True)
)

print(enron_df.shape)
enron_df.head()

(1702, 51)


Unnamed: 0,Message-ID,Date,From,To,Subject,X-From,X-To,X-cc,X-bcc,X-Folder,...,Cat_10_level_1,Cat_10_level_2,Cat_10_weight,Cat_11_level_1,Cat_11_level_2,Cat_11_weight,Cat_12_level_1,Cat_12_level_2,Cat_12_weight,labeled
0,<9831685.1075855725804.JavaMail.evans@thyme>,2001-03-15 14:45:00,frozenset({'phillip.allen@enron.com'}),frozenset({'todd.burke@enron.com'}),Re: Confidential Employee Information/Lenhart,Phillip K Allen,Todd Burke,,,\Phillip_Allen_June2001\Notes Folders\'sent mail,...,,,,,,,,,,True
1,<21041312.1075855725847.JavaMail.evans@thyme>,2001-03-15 14:11:00,frozenset({'phillip.allen@enron.com'}),frozenset({'kim.bolton@enron.com'}),RE: PERSONAL AND CONFIDENTIAL COMPENSATION INF...,Phillip K Allen,Kim Bolton,,,\Phillip_Allen_June2001\Notes Folders\'sent mail,...,,,,,,,,,,True
2,<5907100.1075858639941.JavaMail.evans@thyme>,2001-06-20 17:04:51,frozenset({'k..allen@enron.com'}),"frozenset({'matt.smith@enron.com', 'matthew.le...",FW: Western Wholesale Activities - Gas & Power...,"Allen, Phillip K. </O=ENRON/OU=NA/CN=RECIPIENT...","Lenhart, Matthew </O=ENRON/OU=NA/CN=RECIPIENTS...",,,"\PALLEN (Non-Privileged)\Allen, Phillip K.\Sen...",...,,,,,,,,,,True
3,<26625142.1075858639964.JavaMail.evans@thyme>,2001-06-20 17:09:00,frozenset({'k..allen@enron.com'}),"frozenset({'matt.smith@enron.com', 'matthew.le...",FW: Western Wholesale Activities - Gas & Power...,"Allen, Phillip K. </O=ENRON/OU=NA/CN=RECIPIENT...","Lenhart, Matthew </O=ENRON/OU=NA/CN=RECIPIENTS...",,,"\PALLEN (Non-Privileged)\Allen, Phillip K.\Sen...",...,,,,,,,,,,True
4,<19730598.1075858642129.JavaMail.evans@thyme>,2001-08-09 12:30:58,frozenset({'k..allen@enron.com'}),"frozenset({'matt.smith@enron.com', 'm..tholt@e...",FW: Western Wholesale Activities - Gas & Power...,"Allen, Phillip K. </O=ENRON/OU=NA/CN=RECIPIENT...","Smith, Matt </O=ENRON/OU=NA/CN=RECIPIENTS/CN=M...",,,"\PALLEN (Non-Privileged)\Allen, Phillip K.\Sen...",...,,,,,,,,,,True


In [17]:
# Write the labeled subset to its own file. index=False leaves the row index
# out of the CSV, so the file's first column is the first data column.
enron_df.to_csv('./enron_labeled.csv', index=False)

# Methods for Pre-Processing
The email headers have already been broken down. The method used in this dataset is identical to the one used in our initial attempts at using the entire Enron corpus. Emails are broken down into multiple fields, including To/From, a subject, content, and user.

In [36]:
# Summarize the labeled subset: row count plus each column's non-null count and dtype.
enron_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1702 entries, 0 to 1701
Data columns (total 51 columns):
Message-ID        1702 non-null object
Date              1702 non-null object
From              1702 non-null object
To                1557 non-null object
Subject           1636 non-null object
X-From            1702 non-null object
X-To              1567 non-null object
X-cc              415 non-null object
X-bcc             0 non-null object
X-Folder          1702 non-null object
X-Origin          1702 non-null object
X-FileName        1700 non-null object
content           1697 non-null object
user              1702 non-null object
Cat_1_level_1     1702 non-null float64
Cat_1_level_2     1702 non-null float64
Cat_1_weight      1702 non-null float64
Cat_2_level_1     1506 non-null float64
Cat_2_level_2     1506 non-null float64
Cat_2_weight      1506 non-null float64
Cat_3_level_1     1235 non-null float64
Cat_3_level_2     1235 non-null float64
Cat_3_weight      1235 non-null

In [33]:
# Show the body text of the first labeled email (row label 0).
enron_df.loc[0, 'content']

'I also need to know the base salaries of Jay Reitmeyer and Monique Sanchez. They are doing the same job as Matt.'

# Tokenization

nltk allows us to break text down into word tokens or sentences, depending on our goal.

In [60]:
# Split the first email into sentences, then into word tokens, and print the
# two results separated by a blank line.
print(
    sent_tokenize(enron_df['content'][0]),
    word_tokenize(enron_df['content'][0]),
    sep='\n\n',
)

['I also need to know the base salaries of Jay Reitmeyer and Monique Sanchez.', 'They are doing the same job as Matt.']

['I', 'also', 'need', 'to', 'know', 'the', 'base', 'salaries', 'of', 'Jay', 'Reitmeyer', 'and', 'Monique', 'Sanchez', '.', 'They', 'are', 'doing', 'the', 'same', 'job', 'as', 'Matt', '.']


# Stopword Removal

Stopword removal is important for getting rid of words that do not contribute to the overall meaning of the text. This will become important in later steps of our learning.

In [46]:
# Collect NLTK's English stop words into a set for fast membership checks.
stop_words = {word for word in stopwords.words("english")}
print(stop_words)

{'do', 'so', 'those', 'needn', "doesn't", "shouldn't", 'you', 'after', 'that', 'ours', 'than', "shan't", 'above', 'm', 'out', 'yours', 'didn', 'hasn', 'until', 'below', 'this', 'doesn', 'no', 'now', "didn't", 'as', 'isn', 're', 'from', 'few', 'we', 'why', 'his', 'for', "couldn't", 'a', 'when', "haven't", 's', 'i', 'will', 'before', 'between', 'again', 'hadn', "she's", "that'll", 'mightn', 'further', "aren't", 'but', 'these', 'he', 'here', 'not', 'into', "hadn't", 'ma', 'theirs', 'can', 'only', 'haven', 'weren', 'too', 'it', 'their', 'yourself', 'were', 'with', 'having', 'them', 'her', 'did', 'doing', 'up', 'off', 'am', 'any', 'they', 'has', 'the', "mightn't", 'itself', 'herself', 'nor', 'which', 'just', 'should', 'down', 'she', 'while', 'on', 'other', 'where', 'shouldn', 'all', 'very', 'himself', 'your', 'd', 'each', 'to', 'y', 'ourselves', "isn't", 'shan', 'both', 'myself', 'won', 'is', "wasn't", "hasn't", "you'll", 'against', 've', 'of', 'hers', 'own', 'how', "you're", 'or', "mustn't

# Part of Speech Tagging

nltk offers part-of-speech tagging, which is hugely beneficial for certain applications. 
[Tag Definitions can be found here](https://pythonprogramming.net/natural-language-toolkit-nltk-part-speech-tagging/)


In [3]:

        # CC    coordinating conjunction
        # CD    cardinal digit 
        # DT    determiner 
        # EX    existential there (like: "there is" ... think of it like "there exists")
        # FW    foreign word 
        # IN    preposition/subordinating conjunction
        # JJ    adjective 'big'
        # JJR   adjective, comparative 'bigger'
        # JJS   adjective, superlative 'biggest'
        # LS    list marker 1)
        # MD    modal could, will
        # NN    noun, singular 'desk'
        # NNS   noun plural 'desks'
        # NNP   proper noun, singular 'Harrison'
        # NNPS  proper noun, plural 'Americans'
        # PDT   predeterminer 'all the kids'
        # POS   possessive ending parent's
        # PRP   personal pronoun I, he, she
        # PRP$  possessive pronoun my, his, hers
        # RB    adverb very, silently, 
        # RBR   adverb, comparative better
        # RBS   adverb, superlative best
        # RP    particle give up
        # TO    to go 'to' the store.
        # UH    interjection errrrrrrrm
        # VB    verb, base form take
        # VBD   verb, past tense took
        # VBG   verb, gerund/present participle taking
        # VBN   verb, past participle taken
        # VBP   verb, sing. present, non-3rd person take
        # VBZ   verb, 3rd person sing. present takes
        # WDT   wh-determiner which
        # WP    wh-pronoun who, what
        # WP$   possessive wh-pronoun whose
        # WRB   wh-adverb where, when


In [55]:
# Tokenize the first email and tag every token with its part of speech; the
# result is a list of (token, tag) pairs.
nltk.pos_tag(word_tokenize(enron_df['content'][0]))

[('I', 'PRP'),
 ('also', 'RB'),
 ('need', 'VBP'),
 ('to', 'TO'),
 ('know', 'VB'),
 ('the', 'DT'),
 ('base', 'JJ'),
 ('salaries', 'NNS'),
 ('of', 'IN'),
 ('Jay', 'NNP'),
 ('Reitmeyer', 'NNP'),
 ('and', 'CC'),
 ('Monique', 'NNP'),
 ('Sanchez', 'NNP'),
 ('.', '.'),
 ('They', 'PRP'),
 ('are', 'VBP'),
 ('doing', 'VBG'),
 ('the', 'DT'),
 ('same', 'JJ'),
 ('job', 'NN'),
 ('as', 'IN'),
 ('Matt', 'NNP'),
 ('.', '.')]

# Chunking and Chinking

Chunking can be used to grab only certain parts of speech after a text has been tagged. For example, if we want to chunk for NNP (_Proper Noun_), we would do this:

In [85]:
# Grammar: a "Chunk" is one or more consecutive proper nouns (NNP).
chunkGram = r"Chunk: {<NNP>+}"
chunkParser = nltk.RegexpParser(chunkGram)  # build a parser from the grammar

for sentence in sent_tokenize(enron_df['content'][0]):
    # POS-tag the sentence's tokens, then group the tagged tokens into chunks.
    tagged_words = nltk.pos_tag(word_tokenize(sentence))
    chunked = chunkParser.parse(tagged_words)

    # Render the chunk tree as text in the cell output. Tree.draw() would open
    # a separate Tk window and block until it is closed, which stalls
    # "Run All" and fails on machines without a display.
    chunked.pretty_print()

As you should see, combinations of first + last names are put into the same chunk.

Chinking is similar to chunking. The main difference is that while chunking __includes__ terms, chinking __excludes__ them. If we chink out any VB* (_verbs_), IN (_prepositions_), DT (_determiners_), and TO (_the word "to"_), what remains are some good noun phrases.

In [86]:
# Grammar: first treat the whole sentence as one chunk ({<.*>+}), then chink
# (cut out) every run of verbs, prepositions, determiners and "to" (}...{).
chunkGram = r"""Chunk: {<.*>+} 
                        }<VB.?|IN|DT|TO>+{"""
chunkParser = nltk.RegexpParser(chunkGram)

for sentence in sent_tokenize(enron_df['content'][0]):
    # POS-tag the sentence's tokens, then apply the chunk/chink grammar.
    tagged_words = nltk.pos_tag(word_tokenize(sentence))
    chunked = chunkParser.parse(tagged_words)

    # Render the tree as text in the cell output. Tree.draw() would open a
    # separate Tk window and block until it is closed, which stalls "Run All"
    # and fails on machines without a display.
    chunked.pretty_print()

# Lemmatizing

Lemmatizing is a form of normalizing text. It reduces words down to their base lemma. For example, _runs_ and _running_ should reduce to the same lemma, _run_.

The following example shows that lemmas depend on the word's part of speech. Therefore, lemmatization will be more accurate in retaining the original meaning of the text when used with nltk's part-of-speech tagging.

In [102]:
lemmatizer = WordNetLemmatizer()

# Lemmatize "runs" and "running" once per part of speech. Each group is a
# heading followed by the two lemmas; groups are separated by a blank line.
print("\n\n".join(
    "\n".join((
        heading,
        lemmatizer.lemmatize("runs", pos=pos),
        lemmatizer.lemmatize("running", pos=pos),
    ))
    for heading, pos in (("noun:", "n"), ("adjective:", "a"), ("verb:", "v"))
))

noun:
run
running

adjective:
runs
running

verb:
run
run


# Using nltk's wordnet 
wordnet is essentially a dictionary of words. It can be used to look up a word's meaning, synonyms and antonyms, and other words that share the same base lemma. It can also be used to estimate the similarity of one word to another.

### Synonyms and Antonyms

In [11]:
# Look up every synset (a group of synonymous word senses) that WordNet
# lists for "program"
syns = wordnet.synsets("program")
syns

[Synset('plan.n.01'),
 Synset('program.n.02'),
 Synset('broadcast.n.02'),
 Synset('platform.n.02'),
 Synset('program.n.05'),
 Synset('course_of_study.n.01'),
 Synset('program.n.07'),
 Synset('program.n.08'),
 Synset('program.v.01'),
 Synset('program.v.02')]

In [13]:
# Pick one synset from the list (the following cells keep using this
# "broadcast" sense) and print it twice: first the Synset object itself,
# then its name as a plain string.
print(syns[2], syns[2].name(), sep='\n')

Synset('broadcast.n.02')
broadcast.n.02


In [120]:
# List the lemmas (individual word forms) grouped under the broadcast.n.02 synset
syns[2].lemmas()

[Lemma('broadcast.n.02.broadcast'),
 Lemma('broadcast.n.02.program'),
 Lemma('broadcast.n.02.programme')]

In [130]:
# Gather the name of every lemma across all senses of "good", together with
# the names of those lemmas' antonyms.
synonyms = []
antonyms = []

for syn in wordnet.synsets("good"):
    for lemma in syn.lemmas():
        synonyms.append(lemma.name())
        # Keep every antonym the lemma lists, not just the first one; an
        # empty antonym list simply adds nothing.
        antonyms.extend(antonym.name() for antonym in lemma.antonyms())

# De-duplicate for display.
print(set(synonyms))
print()
print(set(antonyms))

{'effective', 'trade_good', 'respectable', 'upright', 'ripe', 'estimable', 'unspoilt', 'expert', 'dear', 'secure', 'proficient', 'honorable', 'thoroughly', 'right', 'goodness', 'full', 'well', 'adept', 'salutary', 'dependable', 'undecomposed', 'practiced', 'unspoiled', 'in_force', 'serious', 'sound', 'honest', 'beneficial', 'near', 'soundly', 'skilful', 'commodity', 'good', 'in_effect', 'skillful', 'safe', 'just'}

{'ill', 'badness', 'evilness', 'evil', 'bad'}


### Word Definitions and Use of a Word in a Sentence

In [122]:
# Show the definition WordNet stores for the broadcast.n.02 sense
syns[2].definition()

'a radio or television show'

In [123]:
# Show WordNet's example sentences for the broadcast.n.02 sense
syns[2].examples()

['did you see his program last night?']

### Similarity between 2 words

In [6]:
# Wu-Palmer similarity between two word senses: ship vs. boat
word1, word2 = wordnet.synset("ship.n.01"), wordnet.synset("boat.n.01")

# This pair scores about 0.91.
print(word1.wup_similarity(word2))

0.9090909090909091


In [7]:
# Wu-Palmer similarity between two word senses: ship vs. car
word1, word2 = wordnet.synset("ship.n.01"), wordnet.synset("car.n.01")

print(word1.wup_similarity(word2))

0.6956521739130435


In [9]:
# Wu-Palmer similarity between two word senses: ship vs. cactus
word1, word2 = wordnet.synset("ship.n.01"), wordnet.synset("cactus.n.01")

print(word1.wup_similarity(word2))

0.38095238095238093


In [8]:
# Wu-Palmer similarity between two word senses: ship vs. cat
word1, word2 = wordnet.synset("ship.n.01"), wordnet.synset("cat.n.01")

print(word1.wup_similarity(word2))

0.32
