In [None]:
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
from os import path
from os.path import exists
import random
from wordcloud import WordCloud, STOPWORDS

import nltk
from nltk.corpus import stopwords
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer

import pycountry
#nltk.download()

Credits: The background image used in this notebook was made by <a href="http://www.flaticon.com/authors/freepik" title="Freepik">Freepik</a> from <a href="http://www.flaticon.com" title="Flaticon">www.flaticon.com</a>.

# Utils

In [None]:
def generate_raw_text(data):
    """Concatenate the string form of every item in ``data``.

    Each item is converted with ``str()`` and followed by a single space, so a
    non-empty input yields a string that ends with a trailing space and an
    empty input yields ``''``.
    """
    return ''.join(str(item) + ' ' for item in data)


def do_stemming_words(stemmer, words):
    """Stem every word and return the stems as one space-separated string.

    Parameters
    ----------
    stemmer : object exposing ``stem(word) -> str``
        For example an NLTK ``PorterStemmer`` or ``SnowballStemmer``.
    words : str or iterable of str
        Either an iterable of tokens, or a whole text. A plain string is split
        on whitespace first; iterating over it directly would hand the stemmer
        one character at a time.

    Returns
    -------
    str
        The stems joined by single spaces (``''`` for empty input).
    """
    if isinstance(words, str):
        words = words.split()
    # Joining with a separator keeps the stems distinct tokens; plain
    # concatenation would fuse the whole text into a single "word".
    return ' '.join(stemmer.stem(word) for word in words)


def grey_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Colour callback returning a random grey as an HSL string.

    All arguments are accepted but not used. The lightness is drawn with
    ``random.randint(40, 60)``; hue and saturation are fixed at 0.
    """
    lightness = random.randint(40, 60)
    return "hsl(0, 0%%, %d%%)" % lightness


def generate_word_cloud(text, img_name='envelope.png', max_words=1000, width=900, height=900, dpi=400, file_name=None):
    """Draw a grey word cloud of ``text`` inside the shape of a mask image.

    Parameters
    ----------
    text : str
        Text from which the cloud is generated.
    img_name : str
        Path of the image opened with PIL and used as the WordCloud mask.
    max_words : int
        Forwarded to ``WordCloud(max_words=...)``.
    width, height : int
        Currently unused; kept so existing callers keep working.
    dpi : int
        Resolution of the matplotlib figure and of the saved file.
    file_name : str or None
        When given, the figure is written to ``./images/<file_name>.png``
        unless that file already exists. When falsy, ``plt.show()`` is called
        instead.
    """
    from os import makedirs

    # Distinct local names: `stopwords` and `path` are also imported at the
    # top of the notebook and should not be shadowed here.
    cloud_stopwords = set(STOPWORDS)

    # The context manager releases the image file handle once the pixel data
    # has been copied into the array.
    with Image.open(img_name) as mask_image:
        mask = np.array(mask_image)

    wc = WordCloud(background_color="white", mask=mask, max_words=max_words, stopwords=cloud_stopwords).generate(text)
    plt.figure(figsize=(9, 9), dpi=dpi)
    plt.axis("off")
    plt.imshow(wc.recolor(color_func=grey_color_func, random_state=3))
    if file_name:
        images_dir = './images/'
        out_path = images_dir + file_name + '.png'
        if not exists(out_path):
            # savefig raises if the target directory is missing.
            makedirs(images_dir, exist_ok=True)
            plt.savefig(out_path, dpi=dpi)
    else:
        plt.show()


def get_country_name(word):
    """Return the name of the first pycountry entry matching ``word``.

    ``word`` is compared as-is with each country's ``alpha_2`` and ``alpha_3``
    codes, and in lowercase with the lowercased ``name`` and, when the entry
    has one, the lowercased ``official_name``. Returns ``None`` when no
    country matches.
    """
    lowered = str.lower(word)
    for country in pycountry.countries:
        # Code comparison is case-sensitive.
        if word in (country.alpha_2, country.alpha_3):
            return country.name
        if lowered == str.lower(country.name):
            return country.name
        # Not every entry carries an official_name attribute.
        if hasattr(country, 'official_name') and lowered == str.lower(country.official_name):
            return country.name
    return None


def count_country_occurrences(text):
    """Count how often each pycountry country is mentioned in ``text``.

    For every country the count is the sum of:

    * whole-word, case-sensitive occurrences of its ``alpha_2`` and
      ``alpha_3`` codes;
    * whole-word, case-insensitive occurrences of its ``name`` and, when the
      entry has one, its ``official_name``.

    Matching is done on word boundaries rather than raw substrings, so that
    e.g. "IN" is not counted inside "INTERNATIONAL" nor "oman" inside "woman".

    Parameters
    ----------
    text : str
        Text to scan.

    Returns
    -------
    collections.Counter
        Maps each ``country.name`` to its number of occurrences (0 included).
    """
    import re

    lower_text = str.lower(text)
    # Frequencies of whole tokens, computed once, used for the ISO codes.
    token_counts = Counter(re.findall(r'\w+', text))

    def count_phrase(phrase):
        # Lookarounds instead of \b so that names starting or ending with a
        # non-word character (e.g. a trailing '.') are still delimited.
        pattern = r'(?<!\w)' + re.escape(str.lower(phrase)) + r'(?!\w)'
        return sum(1 for _ in re.finditer(pattern, lower_text))

    countries = Counter()
    for country in pycountry.countries:
        nb_occurrences = token_counts[country.alpha_2] + token_counts[country.alpha_3]
        nb_occurrences += count_phrase(country.name)
        # NOTE(review): when the official name contains the short name, such a
        # mention is counted by both lookups, as in the original implementation.
        if hasattr(country, 'official_name'):
            nb_occurrences += count_phrase(country.official_name)
        countries[country.name] = nb_occurrences
    return countries


def get_wordnet_tag_type(tag):
    """Map a POS tag to the matching WordNet constant by its prefix.

    Tags starting with 'J', 'N', 'R' or 'V' map to ``wn.ADJ``, ``wn.NOUN``,
    ``wn.ADV`` and ``wn.VERB`` respectively; any other tag yields ``None``.
    """
    prefix_to_attribute = (
        ('J', 'ADJ'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
        ('V', 'VERB'),
    )
    for prefix, attribute in prefix_to_attribute:
        if tag.startswith(prefix):
            return getattr(wn, attribute)
    return None


def retrieve_email_sentiment(email):
    """Label one email as 'Positive', 'Negative' or 'Neutral'.

    The subject and body are concatenated, tokenized and POS-tagged. For each
    token that has at least one SentiWordNet synset, the first synset votes
    for 'Positive', 'Negative' or 'Neutral' depending on which of its positive
    and negative scores is larger. The label with strictly the most votes
    wins; a tie for first place gives 'Neutral'.

    Parameters
    ----------
    email : mapping with 'ExtractedSubject' and 'ExtractedBodyText' entries
        Typically a DataFrame row passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    The same ``email`` object with an added 'Type' entry.
    """
    email_content = str(email['ExtractedSubject']) + ' ' + str(email['ExtractedBodyText'])
    tokens = word_tokenize(email_content)
    types = Counter({'Positive': 0, 'Negative': 0, 'Neutral': 0})
    for word, pos_tag in nltk.pos_tag(tokens):
        tag = get_wordnet_tag_type(pos_tag)
        synset_list = list(swn.senti_synsets(word, pos=tag))
        if not synset_list:
            continue
        # Only the first synset returned for the word is considered.
        first_synset = synset_list[0]
        pos_score = first_synset.pos_score()
        neg_score = first_synset.neg_score()
        if pos_score > neg_score:
            types['Positive'] += 1
        elif pos_score < neg_score:
            types['Negative'] += 1
        else:
            types['Neutral'] += 1

    # most_common orders by vote count. max(types) would compare the *keys*
    # alphabetically and therefore always pick 'Positive'.
    (top_label, top_count), (_, second_count) = types.most_common(2)
    if top_count > second_count:
        email['Type'] = top_label
    else:
        email['Type'] = 'Neutral'

    return email

# Read files

First, we import all data as DataFrames.

In [None]:
# Load Aliases.csv, using its first column as the DataFrame index.
df_aliases = pd.read_csv('hillary-clinton-emails/Aliases.csv', index_col=0)

In [None]:
# Preview the first rows.
df_aliases.head()

In [None]:
# Load EmailReceivers.csv, using its first column as the DataFrame index.
df_email_receivers = pd.read_csv('hillary-clinton-emails/EmailReceivers.csv', index_col=0)

In [None]:
# Preview the first rows.
df_email_receivers.head()

In [None]:
# Load Emails.csv, using its first column as the DataFrame index.
# Later cells read its 'ExtractedSubject' and 'ExtractedBodyText' columns.
df_emails = pd.read_csv('hillary-clinton-emails/Emails.csv', index_col=0)

In [None]:
# Preview the first rows.
df_emails.head()

In [None]:
# Load Persons.csv, using its first column as the DataFrame index.
df_persons = pd.read_csv('hillary-clinton-emails/Persons.csv', index_col=0)

In [None]:
# Preview the first rows.
df_persons.head()

# Handling raw text

In [None]:
emails_content = df_emails['ExtractedSubject'] + ' ' + df_emails['ExtractedBodyText']
raw_text = generate_raw_text(data=emails_content.values)

In [None]:
generate_word_cloud(raw_text)

# Cleaning text

In [None]:
# Split the raw text into tokens with NLTK's word_tokenize, then rebuild a
# space-separated string from the tokens for the word cloud.
tokens = word_tokenize(raw_text)
word_tokenized_text = generate_raw_text(data=tokens)

In [None]:
# With file_name set, the cloud is saved under ./images/ unless the file already exists.
generate_word_cloud(text=word_tokenized_text, file_name='1_word_tokenize')

In [None]:
# PB: The kernel keeps crashing when executing this cell and the next one.

#tokenizer = RegexpTokenizer(r'\w+')
#regexp_tokens = tokenizer.tokenize(raw_text)
#regexp_tokenized_text = generate_raw_text(data=regexp_tokens)

In [None]:
#generate_word_cloud(text=regexp_tokenized_text, max_words=500, dpi=200, file_name='1_regexp_tokenized')

In [None]:
filtered_words = [word for word in tokens if word not in stopwords.words('english')]
filtered_text = generate_raw_text(data=filtered_words)

In [None]:
generate_word_cloud(text=filtered_text, file_name='2_stopwords')

In [None]:
wl = WordNetLemmatizer()
wl_text = wl.lemmatize(filtered_text)

In [None]:
generate_word_cloud(text=wl_text, file_name='3_WordNetLemmatizer')

In [None]:
# Porter stemming of the lemmatised text.
# NOTE(review): wl_text is passed as one string, not as a list of tokens.
ps = PorterStemmer()
ps_stemming_text = do_stemming_words(stemmer=ps, words=wl_text)

In [None]:
generate_word_cloud(text=ps_stemming_text, file_name='4_PorterStemmer')

In [None]:
# Same input stemmed with the English Snowball stemmer, for comparison.
ss = SnowballStemmer("english")
ss_stemming_text = do_stemming_words(stemmer=ss, words=wl_text)

In [None]:
generate_word_cloud(text=ss_stemming_text, file_name='4_SnowballStemmer')

# Question 2

In [None]:
# Count country mentions in the Porter-stemmed text and tabulate them, one row
# per country, sorted from most to least mentioned.
countries_occurrences = count_country_occurrences(ps_stemming_text)
df_countries_occurrences = (
    pd.DataFrame.from_dict(countries_occurrences, orient='index')
    .rename(columns={0: 'Occurrences'})  # from_dict names the single value column 0
    .sort_values('Occurrences', ascending=False)
)
df_countries_occurrences.head(15)

In [None]:
# Bar chart of the 15 countries with the highest occurrence counts.
# Reads df_countries_occurrences (built in the previous cell); the name
# `df_count` is not defined anywhere in the notebook.
most_quoted_countries = df_countries_occurrences.nlargest(15, 'Occurrences')
countries_plot = sns.barplot(x=most_quoted_countries.index, y='Occurrences', data=most_quoted_countries, color='r')
# Country names are long; rotate the tick labels so they do not overlap.
for label in countries_plot.get_xticklabels():
    label.set_rotation(90)
countries_plot.set(ylabel='Occurrences')
countries_plot.set_title('Number of occurrences of 15 most-quoted countries')
# Use the pyplot module imported at the top of the notebook; `sns.plt` is not
# available in current seaborn releases.
plt.show()

In [None]:
df_emails_content = df_emails[['ExtractedSubject', 'ExtractedBodyText']]
df_emails_content.head()

In [None]:
results = df_emails_content.apply(retrieve_email_sentiment, axis=1)

In [None]:
# PB: No negative email...
results[(results['Type'] == 'Positive')]

In [None]:
# TODO: Try with PerceptronTagger
tagger=PerceptronTagger()
for word, pos_tag in tagger.tag(tokens):
# As before...