In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD 
from sklearn.preprocessing import normalize 

In [2]:
emails_dfs=pd.read_csv('./emails.csv')
emails_dfs.shape




(517401, 2)

In [4]:
emails_df=emails_dfs.head(7100)
emails_df

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...
5,allen-p/_sent_mail/1002.,Message-ID: <30965995.1075863688265.JavaMail.e...
6,allen-p/_sent_mail/1003.,Message-ID: <16254169.1075863688286.JavaMail.e...
7,allen-p/_sent_mail/1004.,Message-ID: <17189699.1075863688308.JavaMail.e...
8,allen-p/_sent_mail/101.,Message-ID: <20641191.1075855687472.JavaMail.e...
9,allen-p/_sent_mail/102.,Message-ID: <30795301.1075855687494.JavaMail.e...


Arranger le data set 

In [5]:
def parse_raw_message(raw_message):
    lines = raw_message.split('\n')
    email = {}
    message = ''
    keys_to_extract = ['date','x-from', 'x-to','subject']
    for line in lines:
        if ':' not in line:
            message += line.strip()
            email['body'] = message
        else:
            pairs = line.split(':')
            key = pairs[0].lower()
            val = pairs[1].strip()
            if key in keys_to_extract:
                email[key] = val
    return email

In [6]:
def parse_into_emails(messages):
    emails = [parse_raw_message(message) for message in messages]
    return {
        'date':[email['date'] for email in emails],
        'sender' : [email['x-from'] for email in emails],
        'receiver': [email['x-to'] for email in emails], 
        'subject':[email['subject'] for email in emails],
        'body': [email['body'] for email in emails]
        
        
    }


In [7]:
emails = [parse_raw_message(message) for message in emails_df.message]
for email in emails:
    print (email)

{'date': 'Mon, 14 May 2001 16', 'subject': '', 'x-from': 'Phillip K Allen', 'x-to': 'Tim Belden <Tim Belden/Enron@EnronXGate>', 'body': 'Here is our forecast'}
{'date': 'Fri, 4 May 2001 13', 'subject': 'Re', 'x-from': 'Phillip K Allen', 'x-to': 'John J Lavorato <John J Lavorato/ENRON@enronXgate@ENRON>', 'body': "Traveling to have a business meeting takes the fun out of the trip.  Especially if you have to prepare a presentation.  I would suggest holding the business plan meetings here then take a trip without any formal business meetings.  I would even try and get some honest opinions on whether a trip is even desired or necessary.As far as the business meetings, I think it would be more productive to try and stimulate discussions across the different groups about what is working and what is not.  Too often the presenter speaks and the others are quiet just waiting for their turn.   The meetings might be better if held in a round table discussion format.My suggestion for where to go is

{'date': 'Wed, 6 Sep 2000 08', 'subject': 'utilities roll', 'x-from': 'Phillip K Allen', 'x-to': 'pallen70@hotmail.com', 'body': '---------------------- Forwarded by Phillip K Allen/HOU/ECT on 09/06/2000_________________________________________________________________________Share information about yourself, create your own public profile at- utility.xls- utility.xls'}
{'date': 'Wed, 6 Sep 2000 07', 'subject': 'TIME SENSITIVE', 'x-from': 'Phillip K Allen', 'x-to': 'Ina Rangel', 'body': "---------------------- Forwarded by Phillip K Allen/HOU/ECT on 09/06/2000Executive Impact & Influence Program* IMMEDIATE ACTION REQUIRED - Do Not Delete *As part of the Executive Impact and Influence Program, each participantis asked to gather input on the participant's own management styles andpractices as experienced by their immediate manager, each direct report,and up to eight peers/colleagues.You have been requested to provide feedback for a participant attendingthe next program.  Your input (i.e.,

{'date': 'Thu, 12 Apr 2001 03', 'subject': '', 'x-from': 'Phillip K Allen', 'x-to': 'John J Lavorato', 'body': "---------------------- Forwarded by Phillip K Allen/HOU/ECT on 04/12/2001Heizenrader/PDX/ECT@ECTHere is a simplistic spreadsheet.  I didn't drop in the new generation yet,but even without the new plants it looks like Q3 is no worse than last year.Can you take a look and get back to me with the bullish case?thanks,Phillip"}
{'date': 'Thu, 12 Apr 2001 03', 'subject': '', 'x-from': 'Phillip K Allen', 'x-to': 'Jeff Richter, Tim Belden, Tim Heizenrader', 'body': "Here is a simplistic spreadsheet.  I didn't drop in the new generation yet,but even without the new plants it looks like Q3 is no worse than last year.Can you take a look and get back to me with the bullish case?thanks,Phillip"}
{'date': 'Thu, 12 Apr 2001 02', 'subject': 'Re', 'x-from': 'Phillip K Allen', 'x-to': '"Jeff Smith" <jsmith@austintx.com> @ ENRON', 'body': 'I will try and get my dad to take the appraiser into a 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [8]:
email_df = pd.DataFrame(parse_into_emails(emails_df.message))

In [9]:
email_df

Unnamed: 0,date,sender,receiver,subject,body
0,"Mon, 14 May 2001 16",Phillip K Allen,Tim Belden <Tim Belden/Enron@EnronXGate>,,Here is our forecast
1,"Fri, 4 May 2001 13",Phillip K Allen,John J Lavorato <John J Lavorato/ENRON@enronXg...,Re,Traveling to have a business meeting takes the...
2,"Wed, 18 Oct 2000 03",Phillip K Allen,Leah Van Arsdall,Re,test successful. way to go!!!
3,"Mon, 23 Oct 2000 06",Phillip K Allen,Randall L Gay,,"Randy,Can you send me a schedule of the salary..."
4,"Thu, 31 Aug 2000 05",Phillip K Allen,Greg Piper,Re,
5,"Thu, 31 Aug 2000 04",Phillip K Allen,Greg Piper,Re,"Greg,How about either next Tuesday or Thursday..."
6,"Tue, 22 Aug 2000 07",Phillip K Allen,"david.l.johnson@enron.com, John Shafer",,Phillip Allen (pallen@enron.com)Mike Grigsby (...
7,"Fri, 14 Jul 2000 06",Phillip K Allen,Joyce Teixeira,Re,
8,"Tue, 17 Oct 2000 02",Phillip K Allen,Mark Scott,Re,I don't think these are required by the ISP2. ...
9,"Mon, 16 Oct 2000 06",Phillip K Allen,zimam@enron.com,FW,---------------------- Forwarded by Phillip K ...


In [10]:
# Drop emails with empty body, to or from_ columns. 
email_df.drop(email_df.query("body == ''").index, inplace=True)

In [11]:
email_df

Unnamed: 0,date,sender,receiver,subject,body
0,"Mon, 14 May 2001 16",Phillip K Allen,Tim Belden <Tim Belden/Enron@EnronXGate>,,Here is our forecast
1,"Fri, 4 May 2001 13",Phillip K Allen,John J Lavorato <John J Lavorato/ENRON@enronXg...,Re,Traveling to have a business meeting takes the...
2,"Wed, 18 Oct 2000 03",Phillip K Allen,Leah Van Arsdall,Re,test successful. way to go!!!
3,"Mon, 23 Oct 2000 06",Phillip K Allen,Randall L Gay,,"Randy,Can you send me a schedule of the salary..."
5,"Thu, 31 Aug 2000 04",Phillip K Allen,Greg Piper,Re,"Greg,How about either next Tuesday or Thursday..."
6,"Tue, 22 Aug 2000 07",Phillip K Allen,"david.l.johnson@enron.com, John Shafer",,Phillip Allen (pallen@enron.com)Mike Grigsby (...
8,"Tue, 17 Oct 2000 02",Phillip K Allen,Mark Scott,Re,I don't think these are required by the ISP2. ...
9,"Mon, 16 Oct 2000 06",Phillip K Allen,zimam@enron.com,FW,---------------------- Forwarded by Phillip K ...
10,"Mon, 16 Oct 2000 06",Phillip K Allen,"""Buckner, Buck"" <buck.buckner@honeywell.com> @...",Re,"Mr. Buckner,For delivered gas behind San Diego..."
11,"Fri, 13 Oct 2000 06",Phillip K Allen,stagecoachmama@hotmail.com,,"Lucy,Open them and save in the rentroll folder..."


In [12]:
email_df.shape

(6985, 5)

#visualisation de data

In [13]:
print (len(email_df.receiver.unique()))

1195


In [14]:
print (len(email_df.sender.unique()))

566


supprimer from body forwarded et Forwarded

In [15]:
stopwords = ENGLISH_STOP_WORDS.union(['forwarded', 'Forwarded'])

#analyse the body text  avec TF-IDF

In [21]:
# max_df=0.5 means "ignore all terms that appear in more then 50% of the body"
# min_df=2 means "ignore all terms that appear in less then 2 body"
vect = TfidfVectorizer(analyzer='word', stop_words=stopwords, max_df=0.4, min_df=2)

In [22]:
X = vect.fit_transform(email_df.body)
features = vect.get_feature_names()

#Clustering avec k-means

In [23]:
#trouver top 10 mot clé pour un body 
def top_tfidf_feats(row, features, top_n=20):
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats, columns=['features', 'score'])
    return df

def top_feats_in_body(X, features, row_id, top_n=25):
    row = np.squeeze(X[row_id].toarray())
    return top_tfidf_feats(row, features, top_n)
#exemple pour l'email 1
print (top_feats_in_body(X, features, 1, 10))

     features     score
0    meetings  0.399147
1        trip  0.320652
2         ski  0.289291
3    business  0.278236
4       takes  0.204408
5         try  0.157286
6   stimulate  0.156408
7   presenter  0.153417
8  productive  0.148540
9      speaks  0.144646


In [24]:
#pour les body de tous les emails top 10 mots clé
def top_mean_feats(X, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    if grp_ids:
        D = X[grp_ids].toarray()
    else:
        D = X.toarray()

    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)
print(top_mean_feats(X, features, None, 0.1, 10) )

   features     score
0   message  0.019519
1  original  0.016937
2   phillip  0.015097
3     image  0.013062
4       gas  0.011257
5      john  0.010279
6     allen  0.009441
7      corp  0.008207
8        09  0.007358
9     today  0.006293


In [20]:
#les mots enron thank thanks sont inutiles 
stopwords = ENGLISH_STOP_WORDS.union(['forwarded', 'Forwarded','enron','thank','thanks','com','na','03','email','000','etc','hou','ect'])

In [63]:
n_clusters = 3
clf = KMeans(n_clusters=n_clusters, 
            max_iter=100, 
            init='k-means++', 
            n_init=1)
labels = clf.fit_predict(X)

In [None]:
# Let's plot this with matplotlib to visualize it.
# First we need to make 2D coordinates from the sparse matrix.
X_dense = X.todense()
pca = PCA(n_components=2).fit(X_dense)
coords = pca.transform(X_dense)

# Lets plot it again, but this time we add some color to it.
# This array needs to be at least the length of the n_clusters.
label_colors = ["#2AB0E9", "#2BAF74", "#D7665E", "#CCCCCC", 
                "#D2CA0D", "#522A64", "#A3DB05", "#FC6514"]
colors = [label_colors[i] for i in labels]

#plt.scatter(coords[:, 0], coords[:, 1], c=colors)
# Plot the cluster centers
centroids = clf.cluster_centers_
centroid_coords = pca.transform(centroids)
# plt.scatter(centroid_coords[:, 0], centroid_coords[:, 1], marker='X', s=200, linewidths=2, c='#444d60')
# plt.show()

#Use this to print the top terms per cluster with matplotlib.
plot_tfidf_classfeats_h(top_feats_per_cluster(X, labels, features, 0.1, 25))