# Homework 05 - Taming text

## Import

In [None]:
# Data processing
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
import pandas as pd
from sklearn import preprocessing
from gensim import models, corpora
%matplotlib inline

# NLTK
from nltk.corpus import stopwords, subjectivity
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk.data

# UTILS
import pycountry
import random
import re
from PIL import Image
from os import path
from os.path import exists
from collections import Counter
from collections import defaultdict

# Bonus: NetworkX and Community
#import community
#import networkx as nx

# Our code (autoreload enabled - useful in developement mode)
%load_ext autoreload
%autoreload 1
%aimport utils

**IMPORTANT NOTE:** For readability reasons, we decide to store all defined functions in a file called *utils.py*. When functions are used in a cell, a comment mentions it (*# UTILS: use of &lt;name of function 1&gt;[, &lt;name of function 2&gt;, ...]*).

If some components are missing, please uncomment the following line and execute the cell in order to start the download utility of nltk.

In [None]:
# nltk.download()

## Preliminary remarks

This Notebook is based on the <a href="https://en.wikipedia.org/wiki/Hillary_Clinton_email_controversy">Hillary Clinton email controversy</a>. Please note that only a fraction of the published emails are used here. The full list, which was published by U.S. Department of State under the Freedom of Information Act, can be consulted <a href="https://foia.state.gov/Learn/New.aspx">here</a>.

**Credits:** Background used in this Notebook was made by <a href="http://www.flaticon.com/authors/freepik" title="Freepik">Freepik</a> from <a href="http://www.flaticon.com" title="Flaticon">www.flaticon.com</a>.

# Import data

First, we import all data as DataFrames.

In [None]:
# Load the aliases table; the first CSV column is used as the index.
df_aliases = pd.read_csv('hillary-clinton-emails/Aliases.csv', index_col=0)

In [None]:
# Preview the first rows to check the load.
df_aliases.head()

In [None]:
# Load the email/receiver pairs; the first CSV column is used as the index.
df_email_receivers = pd.read_csv('hillary-clinton-emails/EmailReceivers.csv', index_col=0)

In [None]:
# Preview the first rows to check the load.
df_email_receivers.head()

In [None]:
# Load the emails, replace missing values with empty strings and turn every
# newline into a space so each text field fits on a single line.
df_emails = (
    pd.read_csv('hillary-clinton-emails/Emails.csv', index_col=0)
    .fillna('')
    .replace({'\n': ' '}, regex=True)
)

In [None]:
# Preview the first rows to check the load and the cleaning.
df_emails.head()

In [None]:
# Load the persons table; the first CSV column is used as the index.
df_persons = pd.read_csv('hillary-clinton-emails/Persons.csv', index_col=0)

In [None]:
# Preview the first rows to check the load.
df_persons.head()

# Handling raw text

First, we retrieve all the content using two columns: *MetadataSubject* and *ExtractedBodyText*. These columns contain respectively the core title and the body of each email in integrality.

In [None]:
# UTILS: use of generate_raw_text
# Each email contributes its subject followed by its body, separated by a space.
subjects = df_emails['MetadataSubject']
bodies = df_emails['ExtractedBodyText']
emails_content = subjects + ' ' + bodies
raw_text = utils.generate_raw_text(data=emails_content.values)

Some words, which are specific to the study, don't bring anything to our analysis and, on the contrary, pollute it. Thus, we decide to remove them.

In [None]:
# UTILS: use of SPECIFIC_STOP_WORDS
# Delete every whole-word, case-insensitive occurrence of each study-specific word.
for specific_word in utils.SPECIFIC_STOP_WORDS:
    word_pattern = re.compile(r'\b%s\b' % specific_word, flags=re.IGNORECASE)
    raw_text = word_pattern.sub('', raw_text)

Let's display a first word cloud of the raw text, before any processing pipeline.

In [None]:
# UTILS: use of generate_word_cloud
# Word cloud of the unprocessed corpus; serves as the baseline for the later steps.
# NOTE(review): file_name is presumably where utils stores the image - confirm in utils.py.
word_cloud = utils.generate_word_cloud(raw_text, file_name='0_raw_text')
word_cloud.show()

# Q1 - Processing pipeline (cleaning of the text)

**Generate a word cloud based on the raw corpus -- I recommend you to use the Python word_cloud library. With the help of nltk (already available in your Anaconda environment), implement a standard text pre-processing pipeline (e.g., tokenization, stopword removal, stemming, etc.) and generate a new word cloud. Discuss briefly the pros and cons (if any) of the two word clouds you generated.**

First, we tokenize the retrieved text. Here, we focus on each word as a whole, thanks to a RegExp tokenizer.

After the processing, each token will be a candidate for an index entry.

In [None]:
# UTILS: use of REGEX_TOKENIZER, generate_raw_text
# Split the corpus into tokens, then rebuild a text from them for the word cloud.
tokens = utils.REGEX_TOKENIZER.tokenize(raw_text)
word_tokenized_text = utils.generate_raw_text(data=tokens)

Let's visualize if there are differences between word clouds describing, respectively, raw text and tokenized text.

In [None]:
# UTILS: use of generate_word_cloud
# Word cloud after tokenization only.
word_cloud_tokenize = utils.generate_word_cloud(text=word_tokenized_text, file_name='1_word_tokenize')
word_cloud_tokenize.show()

As we can see - and it is not really surprising - there are no significant changes after the tokenization part. Note that using the default tokenizer, we would end up with some words like 'nt' (which come from the decomposition of 'don't' in this case). Using the Regexp tokenizer allows us to avoid such problems.

Now, we want to remove all meaningless words which are present in the emails. Here, we use a set of English words defined as 'stopwords', assuming that all emails were written in such language.

In [None]:
# UTILS: use of generate_raw_text
# Load the stop-word list once and keep it in a set: calling
# stopwords.words('english') inside the comprehension reloads the list and
# scans it linearly for every single token.
english_stopwords = set(stopwords.words('english'))
filtered_words = [word for word in tokens if word not in english_stopwords]
filtered_text = utils.generate_raw_text(data=filtered_words)

In [None]:
# UTILS: use of generate_word_cloud
# Word cloud after stop-word removal.
word_cloud_stopwords = utils.generate_word_cloud(text=filtered_text, file_name='2_stopwords')
word_cloud_stopwords.show()

Note that even if we don't see big changes, the process removes for sure a lot of unwanted words, that may impact our further analysis if we don't remove them, and even if they don't appear as the most common ones.

In [None]:
# Compare the number of tokens before and after stop-word removal.
for label, words in (('Length before filtering: %d', tokens),
                     ('Length after filtering: %d', filtered_words)):
    print(label % len(words))

Tokenization alone has several disadvantages:
* it breaks up hyphenated sequences
* it does not handle letter case
* it raises usability/scalability concerns.

This is why we should use additional methods: lemmatization and stemming.

Now that we tokenized and filtered the content, we may want to apply lemmatization and stemming to obtain the most common words. Indeed, in such text, words are declined in different combinations and we must count all of these combinations as unique ones for the kind of analysis we run here.

**Note: About the difference between stemming and lemmatization**

The goal of both processes is to reduce inflectional forms or to find related forms of a word with a common base form ; however, the two techniques differ in the way they achieve to do it. Also, stemming was commonly implemented with reduction techniques, though this is not universal. Lemmatization, as for it, implies a possibly broader scope of functionality, which may include synonyms.

Let's firstly reduce inflectional (variant) forms to base form of the words.

In [None]:
# UTILS: use of WORDNET_LEMMATIZER, generate_raw_text
# lemmatize() works on ONE word at a time: given the whole corpus as a single
# string it finds no matching entry and returns the text unchanged. Lemmatize
# every filtered token instead, then rebuild the text exactly as in the
# previous steps so wl_text is still a string.
wl_words = [utils.WORDNET_LEMMATIZER.lemmatize(word) for word in filtered_words]
wl_text = utils.generate_raw_text(data=wl_words)

In [None]:
# UTILS: use of generate_word_cloud
# Word cloud after lemmatization.
word_cloud_lemmatizer = utils.generate_word_cloud(text=wl_text, file_name='3_WordNetLemmatizer')
word_cloud_lemmatizer.show()

Now, we use a stemmer to reduce terms to their roots before indexing. Note that here we first use the Porter stemmer, which is the most common.

In [None]:
# UTILS: use of do_stemming_words, PORTER_STEMMER
# Stem the lemmatized text with the Porter stemmer.
# NOTE(review): wl_text is a single string passed as `words` - confirm do_stemming_words expects text.
ps_stemming_text = utils.do_stemming_words(stemmer=utils.PORTER_STEMMER, words=wl_text)

In [None]:
# UTILS: use of generate_word_cloud
# Word cloud after Porter stemming.
word_cloud_porter_stemmer = utils.generate_word_cloud(text=ps_stemming_text, file_name='4_PorterStemmer')
word_cloud_porter_stemmer.show()

Just for curiosity, we can also use the Snowball stemmer to produce the same result. A good explanation was given about the differences between the main stemmer's algorithms <a href="http://stackoverflow.com/questions/10554052/what-are-the-major-differences-and-benefits-of-porter-and-lancaster-stemming-alg">here</a>.

In [None]:
# UTILS: use of do_stemming_words, SNOWBALL_STEMMER
# Same input as the Porter run (wl_text), so both stemmers can be compared side by side.
ss_stemming_text = utils.do_stemming_words(stemmer=utils.SNOWBALL_STEMMER, words=wl_text)

In [None]:
# UTILS: use of generate_word_cloud
# Word cloud after Snowball stemming.
word_cloud_stemmer = utils.generate_word_cloud(text=ss_stemming_text, file_name='4_SnowballStemmer')
word_cloud_stemmer.show()

Here, no big differences are observed between the two processing (respectively apply of Porter stemmer and Snowball stemmer).

# Q2 - Analysis on countries

**Find all the mentions of world countries in the whole corpus, using the pycountry utility (HINT: remember that there will be different surface forms for the same country in the text, e.g., Switzerland, switzerland, CH, etc.) Perform sentiment analysis on every email message using the demo methods in the nltk.sentiment.util module. Aggregate the polarity information of all the emails by country, and plot a histogram (ordered and colored by polarity level) that summarizes the perception of the different countries. Repeat the aggregation + plotting steps using different demo methods from the sentiment analysis module -- can you find substantial differences?**

### Processing data

First, we count all the occurrences of countries in Hillary Clinton's emails.

In [None]:
# UTILS: use of count_countries_occurrences
# Count country mentions in the Porter-stemmed text.
# NOTE(review): the result is later fed to DataFrame.from_dict, so it is presumably a {country: count} dict - confirm in utils.py.
countries_occurrences = utils.count_countries_occurrences(ps_stemming_text)

In [None]:
# One row per country, a single 'Occurrences' column, most-mentioned first.
occurrences_frame = pd.DataFrame.from_dict(countries_occurrences, orient='index')
occurrences_frame.columns = ['Occurrences']
df_countries_occurrences = occurrences_frame.sort_values('Occurrences', ascending=False)
df_countries_occurrences.head(15)

It is not surprising to find at the top of the list some countries like Israel, Libya, Haiti or even United States. Indeed, these countries are directly linked to the ex-Secretary of State!

Let's plot the 30 most quoted countries to have a better observation of what's going on.

In [None]:
# UTILS: use of plot_most_quoted_countries
# Plot the 30 countries with the most mentions.
utils.plot_most_quoted_countries(df_countries_occurrences, 30)

One can remark that the number of mentions per country falls drastically. We need to pay attention to this detail in the next part of the study, as an insufficient number of occurrences could change the meaning of the result!

### Sentimental Study

In this part, we want to attribute some kind of sentiment to each quoted country, according to what was discussed in the emails.

In [None]:
# Take an independent copy of the two text columns: a 'CleanedBodyText' column
# is added to this frame later on, and writing to a bare column slice of
# df_emails raises pandas' SettingWithCopyWarning.
df_emails_content = df_emails[['MetadataSubject', 'ExtractedBodyText']].copy()

We first use <a href="http://sentiwordnet.isti.cnr.it/">SentiWordNet</a> in order to retrieve the sentiment score of each email.

In [None]:
# UTILS: use of retrieve_email_sentiment
# Score every email row by row with the helper's default analyzer.
# NOTE(review): the result is indexed with ['Type'] below, so the helper presumably returns a Series with a 'Type' field - confirm in utils.py.
results = df_emails_content.apply(utils.retrieve_email_sentiment, axis=1)

In [None]:
# Number of emails per sentiment type. Series.value_counts() replaces the
# deprecated top-level pd.value_counts().
results_plot = results['Type'].value_counts().to_frame()
results_plot.plot(kind='bar', title='Number of occurrences for each type of emails (using SentiWordNet)');

As we can notice from the previous graph, the number of positive emails is much greater than the number of neutral and negative ones.

Now let's use <a href="https://github.com/cjhutto/vaderSentiment">Vader sentiment analysis</a> to see if we obtain different results.

*Note: Some detailed examples on the performance of Vader sentiment analysis used in NLTK can be found <a href="http://www.nltk.org/howto/sentiment.html">here</a>.*

In [None]:
# UTILS: use of retrieve_email_sentiment
# Same scoring as above, with 'Vader' forwarded as the helper's extra positional argument.
results_vader = df_emails_content.apply(utils.retrieve_email_sentiment, args=('Vader',), axis=1)

In [None]:
# Number of emails per sentiment type. Series.value_counts() replaces the
# deprecated top-level pd.value_counts().
result_vader_plot = results_vader['Type'].value_counts().to_frame()
result_vader_plot.plot(kind='bar', title='Number of occurrences for each type of emails (using Vader)');

Using Vader sentiment analysis, we obtain a different distribution of the data. This difference can be explained by the fact that algorithm associated with SentiWordNet doesn't take into account the entire sentence and only associates sentiment to each word, individually.

Now we have defined the sentiment of each email, let's define the sentiment associated to each country. Note that in order to avoid interferences, we only search for exact name of a country. Put in other words, we decide to ignore <a href="https://fr.wikipedia.org/wiki/ISO_3166-1_alpha-2">ISO 3166-1 alpha-2</a> and <a href="https://en.wikipedia.org/wiki/ISO_3166-1_alpha-3">ISO 3166-1 alpha-3</a> country codes, as these codes can represent some words or abbreviations in English.

In [None]:
# UTILS: use of get_countries_sentiment
# Aggregate the Vader results per country.
# NOTE(review): the result is later fed to DataFrame.from_dict, so it is presumably a {country: score} dict - confirm in utils.py.
countries_sentiment = utils.get_countries_sentiment(results_vader)

In [None]:
# One row per country, a single 'Sentiment' column, highest score first.
sentiment_frame = pd.DataFrame.from_dict(countries_sentiment, orient='index')
sentiment_frame.columns = ['Sentiment']
df_countries_sentiment = sentiment_frame.sort_values('Sentiment', ascending=False)

In [None]:
# The 10 countries with the highest sentiment score.
df_countries_sentiment.head(10)

In [None]:
# The 10 countries with the lowest sentiment score.
df_countries_sentiment.tail(10)

Here, we display the 10 most-preferred and the 10 least-preferred countries, respectively, according to sentiment analysis. For some of them, there are explanations for the obtained score, even if they are not obvious.

For good scores, and as an example, <a href="https://en.wikipedia.org/wiki/Haiti%E2%80%93United_States_relations">relations between Haiti and USA</a> can explain the obtained score. Also, even if USA was directly mobilized as part of Afghanistan's war, it does not mean that the sentiment toward this country is necessarily negative (quite the opposite!).

Now, regarding negative scores, some recent events may explain them, as for Libya (see <a href="https://en.wikipedia.org/wiki/2012_Benghazi_attack">2012 Benghazi attack</a> or <a href="https://en.wikipedia.org/wiki/Libyan_Civil_War_(2011)">Libyan Civil War of 2011</a>). Some historical reasons also justify the bad sentiment toward some countries like Serbia (see <a href="https://en.wikipedia.org/wiki/Serbia%E2%80%93United_States_relations">relations between Serbia and USA</a> and <a href="https://en.wikipedia.org/wiki/2008_Kosovo_declaration_of_independence">declaration of independence of Kosovo</a>).

### Plotting sentiment and occurrences

We merge occurrences and sentiment in order to have all the information in an unique DataFrame.

In [None]:
# Keep only the countries present in both frames, matched on the index
# (country name) and sorted by it.
countries_data = pd.merge(
    df_countries_occurrences,
    df_countries_sentiment,
    how='inner',
    left_index=True,
    right_index=True,
    sort=True,
)

As sentiment seems to decrease exponentially, we decide to adjust the values using MinMaxScaler. More precisely, we use the log here because of the distribution.

In [None]:
# NOTE: this cell rewrites countries_data['Sentiment'] in place; re-run the
# merge cell above before re-running it, otherwise the transform is applied twice.
sentiment = countries_data['Sentiment']

# Shift by |min| + 10 so the argument of the log is always strictly positive.
log_sentiment = np.log(sentiment + abs(sentiment.min()) + 10)

# Rescale to [-1, 1]. The scaler needs a 2-D (n_samples, 1) array;
# DataFrame.as_matrix() no longer exists in pandas, so reshape the values.
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
countries_data['Sentiment'] = min_max_scaler.fit_transform(log_sentiment.values.reshape(-1, 1)).ravel()

In [None]:
# UTILS: use of plot_countries_by_occurrences_and_sentiment
# NOTE(review): 40 is presumably the number of countries to plot - confirm in utils.py.
utils.plot_countries_by_occurrences_and_sentiment(countries_data, 40)

*Important note: Some results are more difficult to explain and are directly linked with the sentiment analyzers and their bias, but also to the fact that we ignored country codes during the process, while they are used in emails (it is about weighing up the pros and cons here). Also, and among other things, our assumptions and decisions explain the observed differences between different analysis on the same topic (see <a href="https://www.kaggle.com/operdeck/d/kaggle/hillary-clinton-emails/hillary-s-sentiment-about-countries">another example on Kaggle</a>).*

Let's also display the countries according to their score. Here, we display the 20 most-preferred countries (left side) and the 20 less-preferred countries (right side). **Here, colors represent the number of occurrences.**

In [None]:
# UTILS: use of plot_sentiment_by_country
# NOTE(review): the meaning of the second positional argument (None) is not visible here - confirm in utils.py.
utils.plot_sentiment_by_country(countries_data, None, nb_country=20)

We can notice that, except for a few countries (Latvia, Serbia or Libya), bad feeling seems to be linked with few occurrences. Hence, for these countries, we must be careful not to draw any definitive conclusions!

Note that, on the contrary, there seems to be a correlation between good sentiment and number of occurrences for a given country, except for Libya...

# Q3 - Retrieve of the main topics

**Using the models.ldamodel module from the gensim library, run topic modeling over the corpus. Explore different numbers of topics (varying from 5 to 50), and settle for the parameter which returns topics that you consider to be meaningful at first sight.**

First, we create our corpus using all content.

In [None]:
# UTILS: use of create_corpus
# Build the corpus and the id -> word mapping from the raw email bodies (subjects are not included here).
corpus, id2word = utils.create_corpus(df_emails['ExtractedBodyText'])

Then we create our LDA model with different number of topics.

Let's create LDA models with different number of topics and see if we observe significant differences.

In [None]:
# UTILS: use of create_lda_model
# LDA model with 5 topics.
lda_5_topics = utils.create_lda_model(corpus, id2word, 5)
lda_5_topics.print_topics()

In [None]:
# UTILS: use of create_lda_model
# LDA model with 10 topics; print all 10 of them.
lda_10_topics = utils.create_lda_model(corpus, id2word, 10)
lda_10_topics.print_topics(10)

In [None]:
# UTILS: use of create_lda_model
# LDA model with 25 topics; print all 25 of them.
lda_25_topics = utils.create_lda_model(corpus, id2word, 25)
lda_25_topics.print_topics(25)

In [None]:
# UTILS: use of create_lda_model
# LDA model with 50 topics; print all 50 of them.
lda_50_topics = utils.create_lda_model(corpus, id2word, 50)
lda_50_topics.print_topics(50)

### ALTERNATIVES

**<span style="color:red">TODO</span>**

#### First alternative: use of processed text.

In [None]:
# UTILS: use of SENTENCES_DETECTOR, create_corpus
# Split the Porter-stemmed text into sentences and use each sentence as one document.
sentences = utils.SENTENCES_DETECTOR.tokenize(ps_stemming_text.strip())
processed_corpus, processed_id2word = utils.create_corpus(sentences)

In [None]:
# UTILS: use of create_lda_model
# 5-topic LDA model on the processed (sentence-level) corpus.
lda_5_topics_processed = utils.create_lda_model(processed_corpus, processed_id2word, 5)
lda_5_topics_processed.print_topics()

#### Second alternative: regrouping emails by conversations

**IMPORTANT NOTE:**

Here, we clean text. If we want to use raw text and apply only regrouping, replace *CleanedBodyText* by *ExtractedBodyText* in the cells marked as: CELL_1, CELL_2.

In [None]:
# UTILS: use of process_email_content
# Add a cleaned version of each email, computed row by row from the subject/body columns.
df_emails_content['CleanedBodyText'] = df_emails_content.apply(utils.process_email_content, axis=1)

In [None]:
# Split the emails on whether they carry a subject (missing subjects were
# replaced by '' when loading).
has_subject = df_emails_content['MetadataSubject'] != ''
untitled_emails = df_emails_content[~has_subject]
titled_emails = df_emails_content[has_subject]

In [None]:
# CELL_1
# One document per conversation: the cleaned bodies of all emails sharing the
# same subject are joined with single spaces.
aggregated_content = (
    titled_emails
    .groupby('MetadataSubject')['CleanedBodyText']
    .apply(' '.join)
    .tolist()
)

In [None]:
# CELL_2
# NOT OPTIMIZED: the helper is run twice, once per source of documents.
# Conversations (emails grouped by subject) come first, then the emails
# that have no subject.
all_text_array = []
for documents in (aggregated_content, untitled_emails['CleanedBodyText'].tolist()):
    all_text_array.extend(utils.get_text_without_Stop_Word(documents))

In [None]:
# TODO: Will be merged with create_corpus in utils at the end
dictionary_conversations = corpora.Dictionary(all_text_array)

# Invert token -> id into the id -> token mapping expected by the LDA helper.
id2word_conversations = {
    token_id: token
    for token, token_id in dictionary_conversations.token2id.items()
}

# Bag-of-words representation of every document.
corpus_conversations = list(map(dictionary_conversations.doc2bow, all_text_array))

In [None]:
# UTILS: use of create_lda_model
# 5-topic LDA model on the conversation-level corpus.
lda_5_topics_conversations = utils.create_lda_model(corpus_conversations, id2word_conversations, 5)
lda_5_topics_conversations.print_topics()

# Bonus

<b>Build the communication graph (unweighted and undirected) among the different email senders and recipients using the NetworkX library. Find communities in this graph with community.best_partition(G) method from the community detection module. Print the most frequent 20 words used by the email authors of each community. Do these word lists look similar to what you've produced at step 3 with LDA? Can you identify clear discussion topics for each community? Discuss briefly the obtained results.</b>

We first need to build the full graph of all communications. For each email, we add a link between the sender and the receiver (we optionally add the weight of the link).

In [None]:
# Group the receivers by email once, instead of filtering the whole receivers
# table again for every single email.
receivers_by_email = {
    email_id: person_ids.tolist()
    for email_id, person_ids in df_email_receivers.groupby('EmailId')['PersonId']
}

# links[sender][receiver] = number of emails sent from sender to receiver.
links = defaultdict(lambda: defaultdict(int))
nodes = set()
for email in df_emails.itertuples():
    # Missing senders were replaced by '' when loading, so they are falsy here.
    if not email.SenderPersonId:
        continue
    sender = int(email.SenderPersonId)
    nodes.add(sender)
    # email.Index is the email id (first CSV column used as the index).
    for receiver_id in receivers_by_email.get(email.Index, []):
        receiver = int(receiver_id)
        nodes.add(receiver)
        links[sender][receiver] += 1

In [None]:
# A link is one distinct (sender, receiver) pair, whatever its weight.
link_count = sum(len(receivers_of_sender) for receivers_of_sender in links.values())
print('Number of nodes: ' + str(len(nodes)))
print('Number of links: ' + str(link_count))

To build a graphic representation of the graph, we need a simple list of all links. To do so, we flatten the links dictionary built before.

In [None]:
# Flatten the nested dictionary into (sender, receiver) pairs; weights are dropped.
list_links = [
    (sender, receiver)
    for sender, receivers_of_sender in links.items()
    for receiver in receivers_of_sender
]

In [None]:
# Sanity check: should match the number of links printed above.
print('Number of links: ' + str(len(list_links)))

We can now build the first graph with all links and all nodes:

In [None]:
# NOTE: `nx` comes from `import networkx as nx`, which is commented out in the
# import cell; enable that import before running this cell.
G = nx.Graph()
for node in nodes:
    # Label each node with the name of its own person (df_persons is indexed
    # by person id); fall back to an empty name for ids missing from the table.
    name = df_persons.loc[node, 'Name'] if node in df_persons.index else ''
    G.add_node(node, id=node, name=name)
G.add_edges_from(list_links)

In [None]:
# Draw the whole graph on a circular layout.
nx.draw_circular(G, node_size=30, node_color='white')
plt.show()

As we can see, there are a lot of nodes and links, and the plot does not reveal anything meaningful. Let's try the `community` tools to find partitions and, at the same time, fetch the emails for each community.

In [None]:
# Based on the example here: http://perso.crans.org/aynaud/communities/index.html
# NOTE: `community` and `nx` (networkx) are needed here; their imports are
# commented out in the import cell and must be enabled first.
partition = community.best_partition(G)

community_ids = set(partition.values())
size = float(len(community_ids))
pos = nx.spring_layout(G)
communities = {}
for count, communityId in enumerate(community_ids, start=1):

    # Build graph: draw the members of this community with their own grey level
    list_nodes = [node for node, node_community in partition.items() if node_community == communityId]
    nx.draw_networkx_nodes(G, pos, list_nodes, node_size=20, node_color=str(count / size))

    # Build communities with emails: collect every email sent by a member.
    # All pieces are joined with a space so that the last word of one member's
    # emails is never glued to the first word of the next member's emails
    # (which would create bogus words in the counter).
    documents = []
    for personId in list_nodes:
        sent_emails = df_emails.loc[df_emails['SenderPersonId'] == personId]
        documents.extend(
            email.MetadataSubject + ' ' + email.ExtractedBodyText
            for email in sent_emails.itertuples()
        )
    content = ' '.join(documents).lower()
    communities[communityId] = {
        'document': content,
        'nodes': list_nodes,
        'counter': Counter(content.split())
    }

nx.draw_networkx_edges(G, pos, alpha=0.5)
plt.show()

In [None]:
# Number of distinct community ids assigned by the partition.
print('Number of communities: ' + str(len(set(partition.values()))))

We can see, that there is only one real community (with Clinton at the middle for sure) and few others without a lot of links. We can get now most used words in each communities:

In [None]:
# For every community, print its 20 most frequent words with their counts.
for community_info in communities.values():
    top_words = community_info['counter'].most_common(20)
    entries = ['"'+ word + '" (' + str(number) + ')' for word, number in top_words]
    print('Words: ', end='')
    print(''.join(entry + ', ' for entry in entries), end='')
    print('\n-----------------------')