In [1]:
import pandas as pd
import numpy as np
import community
import networkx as nx
import matplotlib.pyplot as plt
import string
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from plotly_helpers import *
import credentials
import collections
from IPython.display import HTML

#nltk.download()
py.sign_in(credentials.PLOTLY_USERNAME, credentials.PLOTLY_API_KEY)

We first read the different CSV files that we load into dataframes.

In [2]:
df_emails = pd.read_csv('hillary-clinton-emails/emails.csv')
df_aliases = pd.read_csv('hillary-clinton-emails/Aliases.csv')
df_emails_receivers = pd.read_csv('hillary-clinton-emails/EmailReceivers.csv')

Then we construct the list of edges among the different senders and receivers for each email sent. That is, we get the sender person's id for each email (node from) and we look into the receivers' id dataframe for the corresponding receivers (nodes to). If the sender id is unknown, we label it as -1.

In [3]:
edges = []
for index, row in df_emails.iterrows():
    emailId, senderId = row[['Id', 'SenderPersonId']]
    
    receiversId = []
    for index, row in df_emails_receivers[df_emails_receivers['EmailId'] == emailId].iterrows():
        receiversId.append(df_aliases[df_aliases['PersonId'] == row['PersonId']]['PersonId'].iloc[0])
    if(np.isnan(senderId)):
        senderId = -1
        
    edges.append(((int)(senderId), receiversId))

Once this list of edges is built, we create the corresponding graph. Then we label each node with its corresponding name by looking in the aliases dataframe and we plot the network graph (the graph is interactive, you can zoom on it to see it in details).

In [4]:
G = nx.Graph()

for sender, receivers in edges:
    for receiver in receivers:
        G.add_edge(sender, receiver)

pos = nx.fruchterman_reingold_layout(G) 
labels = []
for k in pos.keys():
    if(k == -1):
        labels.append('Unknown')
    else:
        labels.append(df_aliases[df_aliases['PersonId'] == k].Alias.values[0])
        
py.iplot(plot_graph('Emails connection network', G, pos, labels))

We then use the community module, to partition the graph into communities.

In [5]:
partition = community.best_partition(G)
values = [partition.get(node) for node in G.nodes()]

print("Number of communities:", max(values))

Number of communities: 17


There is 17 communities, so we can use a list of common colors to attribute a different one for each of them, and print the corresponding community graph (the graph is interactive, you can zoom on it to see it in details).

In [6]:
colors_hex = [
    '#FFB300', # Vivid Yellow
    '#803E75', # Strong Purple
    '#FF6800', # Vivid Orange
    '#A6BDD7', # Very Light Blue
    '#C10020', # Vivid Red
    '#CEA262', # Grayish Yellow
    '#817066', # Medium Gray

    # The following don't work well for people with defective color vision
    '#007D34', # Vivid Green
    '#F6768E', # Strong Purplish Pink
    '#00538A', # Strong Blue
    '#FF7A5C', # Strong Yellowish Pink
    '#53377A', # Strong Violet
    '#FF8E00', # Vivid Orange Yellow
    '#B32851', # Strong Purplish Red
    '#F4C800', # Vivid Greenish Yellow
    '#7F180D', # Strong Reddish Brown
    '#93AA00', # Vivid Yellowish Green
    '#593315', # Deep Yellowish Brown
    '#F13A13', # Vivid Reddish Orange
    '#232C16', # Dark Olive Green
    ]
colors = [colors_hex[i] for i in values]
py.iplot(plot_graph('Emails community network', G, pos, labels, node_color=colors))

Once the communities are set, we build a mapping that maps the id of each community to the ids of each person belonging in this community.

In [7]:
people_per_community = {}

for key, value in sorted(partition.items()):
    people_per_community.setdefault(value, []).append(key)

Now we fetch the common words used by the senders in each community. The fetch_words method is responsible to get all the subjects and body texts written by each sender.

Once we have that, we use the a dictionnary to remove stop words and punctuation. We then print the 20 most common words used by senders in each community. 

In [8]:
def fetch_words(people):
    emails = []
    for people_id in people:
        rows = df_emails[df_emails['SenderPersonId'] == people_id]
        if(len(rows) > 0):
            for val in rows.ExtractedSubject.values:
                emails.append(str(val))
            for val in rows.ExtractedBodyText.values:
                emails.append(str(val))
    return ' '.join(emails)

stop = set(stopwords.words('english'))
stop.update(string.punctuation) #Remove ponctuation
stop.update(['nan', '\'s', '--', '``', 'w', 'fw', 'pls', '\'\'', '-', '—'])

out = ''
for community, people in people_per_community.items():
    community_color = colors_hex[community]
    words = fetch_words(people)
    body_tokenized = [i for i in word_tokenize(words.lower()) if i not in stop]
    if (len(body_tokenized) == 0):
        res = 'No emails sent in this community'
    else:
        res = str(collections.Counter(body_tokenized).most_common()[:20])
    out += '<p style=\"color:' + community_color + '\"> Community ' + community_color + ' : </p> <p>' + res + '</p><br/>'
    
HTML(out)