In [1]:
import pandas as pd

In [4]:
emails = pd.read_csv("../../hillary_emails.csv")

In [7]:
def CountsByKeyword(df, col, person, topics):
    """
    Returns a dict of total mention counts per keyword for the given person.
    'By' parameter controls which field you're getting counts by.
    Big return says: return a dictionary via comprehension for lists, or just a dict for one value
    """
    
    if not isinstance(topics, (str, unicode, list)): 
        raise TypeError('\'topics\' parameter must be either str or list') 
    
    person = '(' + person + ')'
    
    return (
        {topic: df[col].loc[
                (df[col].str.contains(person, case = False)) 
                & (df['ExtractedBodyText'].str.contains(topic, case = False))].count()
            for topic in topics} 
        if isinstance(topics, list) 
        else {topics: df[col].loc[
                (df[col].str.contains(person, case = False))
                & (df['ExtractedBodyText'].str.contains(topics, case = False))].count()}
    )

In [9]:
# using the function above
CountsByKeyword(
    emails, 
    col = 'MetadataFrom', 
    person = '.*', 
    topics = 'blumenthal'
)




{'blumenthal': 20}

In [16]:
emails.ExtractedBodyText

0                                                     NaN
1       B6\nThursday, March 3, 2011 9:45 PM\nH: Latest...
2                                                     Thx
3                                                     NaN
4       H <hrod17@clintonemail.com>\nFriday, March 11,...
5       Pis print.\n-•-...-^\nH < hrod17@clintonernail...
6                                                     NaN
7       H <hrod17@clintonemail.corn>\nFriday, March 11...
8                                                     FYI
9       B6\nWednesday, September 12, 2012 6:16 PM\nFwd...
10                                           Fyi\nB6\n— —
11      B6\nWednesday, September 12, 2012 6:16 PM\nFwd...
12                                                    Fyi
13      Anne-Marie Slaughter\nSunday, March 13, 2011 9...
14      _ .....\nFrom Randolph, Lawrence M\nSent: Wedn...
15      I asked to attend your svtc today with Embassy...
16                   Hope. See picture below Kamala sent.
17            

In [18]:
import nltk
import re

In [33]:
# To lowercase - not necessary
emails.CleanedBody = emails.ExtractedBodyText.str.lower()

In [137]:
# Functional parser - removes punctuation and numbers!
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'[a-z]+')
test = tokenizer.tokenize(emails.CleanedBody[1])
test

['b',
 'thursday',
 'march',
 'pm',
 'h',
 'latest',
 'how',
 'syria',
 'is',
 'aiding',
 'qaddafi',
 'and',
 'more',
 'sid',
 'hrc',
 'memo',
 'syria',
 'aiding',
 'libya',
 'docx',
 'hrc',
 'memo',
 'syria',
 'aiding',
 'libya',
 'docx',
 'march',
 'for',
 'hillary']

In [140]:
# Functional! Remove stop words

filtered_words = [word for word in test if word not in stopwords.words('english')]
filtered_words

['b',
 'thursday',
 'march',
 'pm',
 'h',
 'latest',
 'syria',
 'aiding',
 'qaddafi',
 'sid',
 'hrc',
 'memo',
 'syria',
 'aiding',
 'libya',
 'docx',
 'hrc',
 'memo',
 'syria',
 'aiding',
 'libya',
 'docx',
 'march',
 'hillary']

In [143]:
tag_dic = [nltk.pos_tag(word) for word in filtered_words]


KeyboardInterrupt: 

In [142]:
# Broken Stemmer. When working turns hillary into "hillari". Might be a problem...

stemmer = nltk.PorterStemmer()
stemmed_test = []
for word in test[0]:
    stems = stemmer.stem(word)
    stemmed_test.append(stems)
stemmed_test

[u'b']

In [134]:
# Broken count dictionary

count = {} # initialize dictionary
for word in test[0]:
    word = word.lower() # normalize case
    if word not in count: # previously unseen word?
        count[word] = 0 # if so set count to 0
        count[word] += 1 # increment word count
count

{'b': 1}

In [146]:
# import nltk.classify.util
# from nltk.classify import NaiveBayesClassifier
# from nltk.corpus import movie_reviews
 
# def word_feats(words):
#     return dict([(word, True) for word in words])
 
# negids = movie_reviews.fileids('neg')
# posids = movie_reviews.fileids('pos')
 
# negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
# posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
 
# negcutoff = len(negfeats)*3/4
# poscutoff = len(posfeats)*3/4
 
# trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
# testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
# print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))
 
# classifier = NaiveBayesClassifier.train(trainfeats)
# print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
# classifier.show_most_informative_features()

In [176]:
# TextBlob tagger

from textblob import TextBlob

blob = TextBlob(emails.ExtractedBodyText[14])
blob.tags

[('_', u'NN'),
 ('..', u'NN'),
 ('From', u'IN'),
 ('Randolph', u'NNP'),
 ('Lawrence', u'NNP'),
 ('M', u'NNP'),
 ('Sent', u'NNP'),
 ('Wednesday', u'NNP'),
 ('September', u'NNP'),
 ('12', u'CD'),
 ('2012', u'CD'),
 ('04:33', u'CD'),
 ('PM', u'NNPS'),
 ('To', u'TO'),
 ('Mills', u'NNS'),
 ('Cheryl', u'NNP'),
 ('D', u'NNP'),
 ('Subject', u'NNP'),
 ('RE', u'NN'),
 ('Not', u'RB'),
 ('a', u'DT'),
 ('dry', u'JJ'),
 ('eye', u'NN'),
 ('in', u'IN'),
 ('NEA', u'NNP'),
 ('Including', u'NNP'),
 ('mine', u'NN'),
 ('Her', u'PRP$'),
 ('remarks', u'NNS'),
 ('were', u'VBD'),
 ('really', u'RB'),
 ('moving', u'VBG'),
 ('Chriswas', u'NNP'),
 ('an', u'DT'),
 ('amazing', u'JJ'),
 ('man', u'NN'),
 ('Such', u'PDT'),
 ('a', u'DT'),
 ('huge', u'JJ'),
 ('loss', u'NN'),
 ('You', u'PRP'),
 ('know', u'VBP'),
 ('I', u'PRP'),
 ('was', u'VBD'),
 ('in', u'IN'),
 ('Libya', u'NNP'),
 ('before', u'IN'),
 ('coming', u'VBG'),
 ('here', u'RB'),
 ('and', u'CC'),
 ('in', u'IN'),
 ('my', u'PRP$'),
 ('almost', u'RB'),
 ('ten', u'JJ

In [177]:
# Print nouns

blob.noun_phrases   

WordList([u'_ ... ..', 'randolph', 'lawrence', 'september', 'pm', 'mills', u'cheryl d subject', 're', u'dry eye', u'nea including', 'chriswas', u'amazing man', u'huge loss', 'libya', u'funny diplomat', 'made', u'hardest places'])

In [184]:
# Average sentiment across sentences

import numpy as np
sentiment = []
for sentence in blob.sentences:
    sentiment.append(sentence.sentiment.polarity)
print sentiment
np.mean(sentiment)

[-0.06666666666666667, 0.2, 0.6000000000000001, 0.20000000000000004, 0.18, 0.43333333333333335]


0.25777777777777783

In [197]:
type(len(emails.ExtractedBodyText))

int

In [202]:
for x in range(len(emails.ExtractedBodyText)):
    if type(emails.ExtractedBodyText[x] == str):
        emails['tags'][x] = TextBlob(emails.ExtractedBodyText[x]).tags

TypeError: The `text` argument passed to `__init__(text)` must be a string, not <type 'float'>

In [204]:
import string
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re

def preprocess(sentence):
    sentence = sentence.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)
    filtered_words = [w for w in tokens if not w in stopwords.words('english')]
    return " ".join(filtered_words)

emails.testBody = emails.ExtractedBodyText.apply(lambda x: str(x)).apply(lambda x: preprocess(x))

In [205]:
emails.testBody

0                                                     nan
1       b6 thursday march 3 2011 9 45 pm h latest syri...
2                                                     thx
3                                                     nan
4       h hrod17 clintonemail com friday march 11 2011...
5       pis print h hrod17 clintonernailcom wednesday ...
6                                                     nan
7       h hrod17 clintonemail corn friday march 11 201...
8                                                     fyi
9       b6 wednesday september 12 2012 6 16 pm fwd lib...
10                                                 fyi b6
11      b6 wednesday september 12 2012 6 16 pm fwd lib...
12                                                    fyi
13      anne marie slaughter sunday march 13 2011 9 39...
14      _ randolph lawrence m sent wednesday september...
15      asked attend svtc today embassy tripoli first ...
16                           hope see picture kamala sent
17            

In [216]:
tags_test = []
for x in range(10):
    tags_test.append(TextBlob(emails.testBody[x]).tags)

In [217]:
tags_test

[[('nan', u'NN')],
 [('b6', u'NN'),
  ('thursday', u'NN'),
  ('march', u'VBD'),
  ('3', u'CD'),
  ('2011', u'CD'),
  ('9', u'CD'),
  ('45', u'CD'),
  ('pm', u'NN'),
  ('h', u'NN'),
  ('latest', u'JJS'),
  ('syria', u'NN'),
  ('aiding', u'VBG'),
  ('qaddafi', u'JJ'),
  ('sid', u'NN'),
  ('hrc', u'NN'),
  ('memo', u'NN'),
  ('syria', u'NN'),
  ('aiding', u'VBG'),
  ('libya', u'JJ'),
  ('030311', u'CD'),
  ('docx', u'JJ'),
  ('hrc', u'NN'),
  ('memo', u'NN'),
  ('syria', u'NN'),
  ('aiding', u'VBG'),
  ('libya', u'JJ'),
  ('030311', u'CD'),
  ('docx', u'JJ'),
  ('march', u'NN'),
  ('3', u'CD'),
  ('2011', u'CD'),
  ('hillary', u'NN')],
 [('thx', u'NN')],
 [('nan', u'NN')],
 [('h', u'NN'),
  ('hrod17', u'NN'),
  ('clintonemail', u'NN'),
  ('com', u'NN'),
  ('friday', u'JJ'),
  ('march', u'VBZ'),
  ('11', u'CD'),
  ('2011', u'CD'),
  ('1', u'CD'),
  ('36', u'CD'),
  ('pm', u'NN'),
  ('huma', u'NN'),
  ('abedin', u'NN'),
  ('fw', u'NN'),
  ('h', u'NN'),
  ('latest', u'JJS'),
  ('syria', u'NN

In [221]:
body_tags = []
for x in range(len(emails.testBody)):
    body_tags.append(TextBlob(emails.testBody[x]).tags)

KeyboardInterrupt: 

In [219]:
body_tags

[[('nan', u'NN')],
 [('b6', u'NN'),
  ('thursday', u'NN'),
  ('march', u'VBD'),
  ('3', u'CD'),
  ('2011', u'CD'),
  ('9', u'CD'),
  ('45', u'CD'),
  ('pm', u'NN'),
  ('h', u'NN'),
  ('latest', u'JJS'),
  ('syria', u'NN'),
  ('aiding', u'VBG'),
  ('qaddafi', u'JJ'),
  ('sid', u'NN'),
  ('hrc', u'NN'),
  ('memo', u'NN'),
  ('syria', u'NN'),
  ('aiding', u'VBG'),
  ('libya', u'JJ'),
  ('030311', u'CD'),
  ('docx', u'JJ'),
  ('hrc', u'NN'),
  ('memo', u'NN'),
  ('syria', u'NN'),
  ('aiding', u'VBG'),
  ('libya', u'JJ'),
  ('030311', u'CD'),
  ('docx', u'JJ'),
  ('march', u'NN'),
  ('3', u'CD'),
  ('2011', u'CD'),
  ('hillary', u'NN')],
 [('thx', u'NN')],
 [('nan', u'NN')],
 [('h', u'NN'),
  ('hrod17', u'NN'),
  ('clintonemail', u'NN'),
  ('com', u'NN'),
  ('friday', u'JJ'),
  ('march', u'VBZ'),
  ('11', u'CD'),
  ('2011', u'CD'),
  ('1', u'CD'),
  ('36', u'CD'),
  ('pm', u'NN'),
  ('huma', u'NN'),
  ('abedin', u'NN'),
  ('fw', u'NN'),
  ('h', u'NN'),
  ('latest', u'JJS'),
  ('syria', u'NN