# Homework 05 - Taming text

In [None]:
# data processing
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
import pandas as pd
from sklearn import preprocessing
from gensim import models, corpora

# nltk import

from nltk.corpus import stopwords, subjectivity
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk.data

# utils
import pycountry
import random
from PIL import Image
from os import path
from os.path import exists
from collections import Counter
from collections import defaultdict
%matplotlib inline  


# our code (mark it at autoreload at every cell execution - useful in developement mode)
%load_ext autoreload
%autoreload 1
%aimport utils

In [None]:
#nltk.download()

Credits: Background used in this Notebook was made by <a href="http://www.flaticon.com/authors/freepik" title="Freepik">Freepik</a> from <a href="http://www.flaticon.com" title="Flaticon">www.flaticon.com</a>.

# Read files

First, we import all data as DataFrames.

In [None]:
df_aliases = pd.read_csv('hillary-clinton-emails/Aliases.csv', index_col=0)

In [None]:
df_aliases.head()

In [None]:
df_email_receivers = pd.read_csv('hillary-clinton-emails/EmailReceivers.csv', index_col=0)

In [None]:
df_email_receivers.head()

In [None]:
# Load the e-mails, replace missing values by empty strings and turn every
# line break into a single space.
df_emails = (
    pd.read_csv('hillary-clinton-emails/Emails.csv', index_col=0)
    .fillna('')
    .replace({'\n': ' '}, regex=True)
)

In [None]:
df_emails.head()

In [None]:
df_persons = pd.read_csv('hillary-clinton-emails/Persons.csv', index_col=0)

In [None]:
df_persons.head()

# Handling raw text

We need to extract the text in the body of each mail.

In [None]:
# Concatenate, row by row, the extracted subject and the extracted body text
# (separated by a single space).
emails_content = df_emails['ExtractedSubject'] + ' ' + df_emails['ExtractedBodyText']
# NOTE(review): generate_raw_text presumably merges all values into one string
# (the result is later used with str.replace and a tokenizer) - confirm in utils.
raw_text = utils.generate_raw_text(data=emails_content.values)

> We generate the word cloud without any further modifications. <br/>
> The word cloud below shows the most frequent words used in Hillary's e-mails.

In [None]:
# Some words are artefacts of the e-mail format (forward / reply markers, time
# suffix). They bring nothing to the word cloud, so we remove them.
import re

specific_mail_words = ['Fw', 'Re', 'pm']

# Remove these words only when they are not part of a longer word: a plain
# str.replace() would also cut the letters out of ordinary words
# (e.g. 'Report' -> 'port', 'shipment' -> 'shient').
# Letters are used as boundaries (not \b) so that '3pm' is still cleaned.
specific_mail_words_pattern = re.compile(
    r'(?<![A-Za-z])(?:'
    + '|'.join(re.escape(word) for word in specific_mail_words)
    + r')(?![A-Za-z])'
)
raw_text = specific_mail_words_pattern.sub('', raw_text)

In [None]:
word_cloud = utils.generate_word_cloud(raw_text)

In [None]:
word_cloud.show()

# Apply different processes for cleaning the text

First, we tokenize Hillary's e-mails. <br/>

* A token is an instance of a sequence of characters
* Each such token is now a candidate for an index entry, after further processing

In [None]:
tokens = utils.REGEX_TOKENIZER.tokenize(raw_text)
word_tokenized_text = utils.generate_raw_text(data=tokens)

> Let's visualize the difference with the same wordcloud

In [None]:
word_cloud_tokenize = utils.generate_word_cloud(text=word_tokenized_text, file_name='1_word_tokenize')

In [None]:
word_cloud_tokenize.show()

> Our goal is to remove all meaningless words present in emails.

In [None]:
# stopwords contains "meaningless" words.
# Build the lookup set once: calling stopwords.words() inside the comprehension
# rebuilds the whole stop-word list for every single token and then scans it linearly.
english_stopwords = set(stopwords.words('english'))
# The NLTK list is lower-case, so compare on the lower-cased token to also drop
# capitalised occurrences such as 'The' or 'And'.
filtered_words = [word for word in tokens if word.lower() not in english_stopwords]
filtered_text = utils.generate_raw_text(data=filtered_words)

In [None]:
word_cloud_stopwords = utils.generate_word_cloud(text=filtered_text, file_name='2_stopwords')

In [None]:
word_cloud_stopwords.show()

Tokenization alone has several disadvantages:
* it breaks up hyphenated sequences,
* it does not normalize case (the same word in upper and lower case gives different tokens),
* usability/scalability.

This is why we should also use other methods: lemmatization and stemming.

Let's lemmatize Hillary's e-mails:
>Reduce inflectional/variant forms to base form

In [None]:
wl = WordNetLemmatizer()
# WordNetLemmatizer.lemmatize() expects ONE word. Given the whole text, nothing is
# found in WordNet and the text comes back unchanged, so we lemmatize token by
# token and rebuild the text afterwards.
lemmatized_words = [wl.lemmatize(word) for word in filtered_words]
wl_text = utils.generate_raw_text(data=lemmatized_words)

In [None]:
word_cloud_lemmatizer = utils.generate_word_cloud(text=wl_text, file_name='3_WordNetLemmatizer')

In [None]:
word_cloud_lemmatizer.show()

Let's stem Hillary's e-mails:
>Reduce terms to their “roots” before indexing

In [None]:
ps = PorterStemmer()
# NOTE(review): wl_text is a whole text, not a list of words - confirm that
# utils.do_stemming_words splits it into words itself before stemming.
ps_stemming_text = utils.do_stemming_words(stemmer=ps, words=wl_text)

In [None]:
word_cloud_porter_stemmer = utils.generate_word_cloud(text=ps_stemming_text, file_name='4_PorterStemmer')

In [None]:
word_cloud_porter_stemmer.show()

<b> Difference between Stemming and lemmatization: </b> <br/>
<p>The goal of both processes is to reduce inflectional forms or to find related forms of a word to a common base form, but the two techniques differ : </p>

>Both stemming and lemmatization allow queries to match different forms of words. Stemming was commonly implemented with reduction techniques, though this is not universal. Lemmatization implies a possibly broader scope of functionality, which may include synonyms.

In [None]:
ss = SnowballStemmer("english")
# NOTE(review): wl_text is a whole text, not a list of words - confirm that
# utils.do_stemming_words splits it into words itself before stemming.
ss_stemming_text = utils.do_stemming_words(stemmer=ss, words=wl_text)

In [None]:
word_cloud_stemmer = utils.generate_word_cloud(text=ss_stemming_text, file_name='4_SnowballStemmer')

In [None]:
word_cloud_stemmer.show()

# Question 2

### Processing data

> First, we need to compute the number of occurrences of each country mentioned in Hillary's e-mails.

In [None]:
countries_occurrences = utils.count_countries_occurrences(ps_stemming_text)

In [None]:
# Build a one-column frame from the occurrences dict (keys become the index)
# and order it by decreasing number of occurrences.
df_countries_occurrences = pd.DataFrame.from_dict(countries_occurrences, orient='index')
df_countries_occurrences.columns = ['Occurrences']
df_countries_occurrences = df_countries_occurrences.sort_values(
    'Occurrences', ascending=False
)
df_countries_occurrences.head(15)

In [None]:
utils.plot_most_quoted_countries(df_countries_occurrences,30)    

As we can see in the previous graph, the number of occurrences decreases quickly. <br/>
We need to keep this in mind for the sentiment study, where an insufficient number of occurrences could distort the result.

In [None]:
df_emails_content = df_emails[['ExtractedSubject', 'ExtractedBodyText']]

### Sentiment study

In [None]:
results = df_emails_content.apply(utils.retrieve_email_sentiment, axis=1)

In [None]:
# Number of e-mails per sentiment type. Series.value_counts() replaces the
# deprecated top-level pd.value_counts().
results_plot = results['Type'].value_counts().to_frame()
# The types are categories: draw one bar per type rather than a line joining them.
results_plot.plot(kind='bar')

As we can see in the previous graph, more e-mails are classified as Positive than as Neutral or Negative.

In [None]:
results_vader = df_emails_content.apply(utils.retrieve_email_sentiment, args=('Vader',), axis=1)

In [None]:
# Number of e-mails per sentiment type according to Vader. Series.value_counts()
# replaces the deprecated top-level pd.value_counts().
result_vader_plot = results_vader['Type'].value_counts().to_frame()
# The types are categories: draw one bar per type rather than a line joining them.
result_vader_plot.plot(kind='bar')

Using Vader we get a different distribution of the data. The difference comes from the fact that the senti_synsets function does not take the entire sentence into account and checks the sentiment of individual words only.

In [None]:
countries_sentiment = utils.get_countries_sentiment(results_vader)

In [None]:
# Build a one-column frame from the sentiment dict (keys become the index)
# and order it by decreasing sentiment.
df_countries_sentiment = pd.DataFrame.from_dict(countries_sentiment, orient='index')
df_countries_sentiment.columns = ['Sentiment']
df_countries_sentiment = df_countries_sentiment.sort_values(
    'Sentiment', ascending=False
)
df_countries_sentiment.head(20)

### Plot the two variables together: occurrences / sentiment

In [None]:
countries_data = pd.merge(df_countries_occurrences, df_countries_sentiment, how='inner', left_index=True, right_index=True, sort=True)

In [None]:
# The sentiment attached to Haiti is much larger than for the other countries, so
# we compress the values with a logarithm and rescale them to [-1, 1] to obtain
# a plot that is easy to read.
min_max_scaler = preprocessing.MinMaxScaler((-1, 1))
# Shift by |min| + 10 so that every value is strictly positive before the log.
sentiment_shift = abs(countries_data['Sentiment'].min()) + 10
countries_data['Sentiment'] = np.log(countries_data['Sentiment'] + sentiment_shift)
# DataFrame.as_matrix() was removed in pandas 1.0; .values works in every version.
# fit_transform returns an (n, 1) array, which is flattened back into one column.
countries_data['Sentiment'] = min_max_scaler.fit_transform(
    countries_data[['Sentiment']].values
).ravel()

In [None]:
utils.plot_most_occurence_contry(countries_data,40)

In [None]:
# NOTE: the separator line is only placed correctly for nb_contry=20.
# Among the "nb_contry" selected countries, the line separates the ones with the
# best sentiment (top) from the ones with the worst sentiment (flop).
utils.plot_sentiment_by_contry(countries_data,None,nb_contry=20)

From the graph we can see that, except for a few countries (Latvia, Serbia, Libya), a bad feeling is associated with few occurrences.
Conversely, most of the good feelings are associated with many occurrences.

# Question 3

In [None]:
sentences = utils.SENTENCES_DETECTOR.tokenize(ps_stemming_text.strip())

In [None]:
# References:
# https://radimrehurek.com/gensim/models/ldamodel.html
# http://christop.club/2014/05/06/using-gensim-for-lda/
# http://stackoverflow.com/questions/15016025/how-to-print-the-lda-topics-models-from-gensim-python

# One document per sentence: lower-cased, then split on whitespace.
all_text_array = [sentence.lower().split() for sentence in sentences]
dictionary = corpora.Dictionary(all_text_array)

# Invert the token -> id mapping into the id -> token mapping given to the model.
id2word = {token_id: token for token, token_id in dictionary.token2id.items()}

# Bag-of-words representation of every document.
corpus = [dictionary.doc2bow(document) for document in all_text_array]
lda = models.LdaModel(corpus, id2word=id2word, num_topics=50)

In [None]:
lda.print_topics(10)

# Bonus

In [None]:
import community
import networkx as nx
import matplotlib.pyplot as plt

# Demo graph. karate_graph() as defined in the networkx example would be better:
# an Erdos-Renyi graph has no true community structure.
G = nx.erdos_renyi_graph(30, 0.05)

# First compute the best partition (maps each node to its community).
partition = community.best_partition(G)

# Drawing: one call per community, each with its own colour value.
communities = set(partition.values())
size = float(len(communities))
pos = nx.spring_layout(G)
for count, com in enumerate(communities, start=1):
    list_nodes = [node for node, node_com in partition.items() if node_com == com]
    nx.draw_networkx_nodes(G, pos, list_nodes, node_size=20,
                           node_color=str(count / size))

nx.draw_networkx_edges(G, pos, alpha=0.5)
plt.show()


In [None]:
# Index the receivers by e-mail id once, instead of filtering the whole receivers
# frame again for every e-mail.
receivers_by_email = defaultdict(list)
for email_id, person_id in zip(df_email_receivers['EmailId'], df_email_receivers['PersonId']):
    receivers_by_email[email_id].append(person_id)

# links[sender][receiver] = number of e-mails sent by `sender` to `receiver`
links = defaultdict(lambda: defaultdict(int))
# ids of every person seen as a sender (with a known id) or as one of their receivers
nodes = set()
for email in df_emails.itertuples():
    sender_id = email.SenderPersonId
    # Missing sender ids were replaced by '' when loading: skip those e-mails.
    if not sender_id:
        continue
    sender = int(sender_id)
    nodes.add(sender)
    # .get() keeps the index untouched for e-mails without any listed receiver.
    for receiver_id in receivers_by_email.get(email.Index, ()):
        receiver = int(receiver_id)
        nodes.add(receiver)
        links[sender][receiver] += 1

In [None]:
# Flatten the nested sender -> receiver mapping into (sender, receiver) pairs;
# the counts stored in `links` are not carried over.
list_links = [
    (sender, receiver)
    for sender, receivers in links.items()
    for receiver in receivers
]

In [None]:
# Undirected graph: one node per collected person id, one edge per
# (sender, receiver) pair.
G = nx.Graph()
G.add_nodes_from(list(nodes))
G.add_edges_from(list_links)

In [None]:
nx.draw_circular(G, node_size=20)
plt.show()