In [15]:
import random
import numpy as np
import operator
import pandas as pd
from collections import Counter
import sys
import pickle
import os
import matplotlib.pyplot as plt
%matplotlib inline


### Useful fns 

In [16]:
def save_callback(df, mail_id, sender, receiver, mid_best_cosine, likelihood, p1, p2, p3 ):
    """Append one result row to ``df`` in place.

    The values are stored in the order
    (mail_id, sender, receiver, mid_best_cosine, likelihood, p1, p2, p3),
    so ``df`` must have exactly eight columns. Nothing is returned.
    """
    new_row = [mail_id, sender, receiver, mid_best_cosine, likelihood, p1, p2, p3]
    # The row is written under the temporary label -1; shifting every label
    # up by one afterwards frees -1 again for the next call.
    df.loc[-1] = new_row
    df.index = df.index + 1
    
def is_csr_matrix_only_zeroes(my_csr_matrix):
    """Return True when ``my_csr_matrix`` contains no non-zero entry."""
    # nonzero() returns (row_indices, col_indices); either one is empty
    # exactly when the matrix holds no non-zero value.
    row_indices = my_csr_matrix.nonzero()[0]
    return len(row_indices) == 0

def apk(actual, predicted, k=10):
    """
    Average precision at k for a single query.

    Parameters
    ----------
    actual : list
             The relevant elements (order is irrelevant)
    predicted : list
                The ranked predictions, best first (order matters)
    k : int, optional
        Only the first k predictions are scored
    Returns
    -------
    score : double
            Sum of the precision values at each rank where a new relevant
            element appears, divided by min(len(actual), k).
            0.0 when ``actual`` is empty.
    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    # Nothing to find: the score is defined as zero.
    if not actual:
        return 0.0

    precision_sum = 0.0
    hits = 0.0
    for rank, candidate in enumerate(top_k, start=1):
        # A repeated prediction is only credited the first time it appears.
        if candidate in actual and candidate not in top_k[:rank - 1]:
            hits += 1.0
            precision_sum += hits / rank

    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Mean average precision at k over several queries.

    Parameters
    ----------
    actual : list
             One list of relevant elements per query
             (order inside each list is irrelevant)
    predicted : list
                One ranked list of predictions per query
                (order inside each list matters)
    k : int, optional
        Only the first k predictions of each query are scored
    Returns
    -------
    score : double
            The mean of apk() over the paired (actual, predicted) lists
    """
    per_query_scores = []
    for truth, ranking in zip(actual, predicted):
        per_query_scores.append(apk(truth, ranking, k))
    return np.mean(per_query_scores)

def read_mail(mid, pre_processed = 'pre_processed'):
    """Print one column of the mail whose id is ``mid``.

    The mail is searched in df_cv, then df_train, then df_test; the first
    frame containing the id wins. ``pre_processed`` is the name of the column
    to print. A fallback message is printed when no frame knows the id.
    Nothing is returned.
    """
    mail_id = int(mid)
    hits = df_cv[df_cv['mid'] == mail_id]
    if hits.empty:
        hits = df_train[df_train['mid'] == mail_id]
    if hits.empty:
        hits = df_test[df_test['mid'] == mail_id]

    if hits.empty:
        out = "Mail non trouvé... "
    else:
        # Several rows may share the id; only the first one is shown.
        out = hits[pre_processed].values[0]
    print (out)

In [17]:
# mail 110806
# # 93999    1.0
# read_mail(110806, 'body')
# read_mail(181457, 'body')

In [18]:
# Sanity checks for mapk().
# Every predicted item is relevant here, whatever the random order.
# This value is not echoed: only the last expression of the cell is displayed.
mapk([list(np.arange(10))], [list( np.random.permutation(10))], k=10)
# Only 'b' is relevant and it is ranked first: AP = (1/1) / min(2, 10) = 0.5.
mapk([['a', 'b']], [['b', 'c', 'd']], k=10)

0.5

### Loading data and graphs

In [19]:
# Folder holding the raw competition files (relative to the notebook).
path_to_data = "Data/"

##########################
# load some of the files #
##########################

# Raw file read from path_to_data; the address-book cell reads it as
# (sender, space-separated mail ids) rows.
training = pd.read_csv(path_to_data + 'training_set.csv', sep=',', header=0)

# Processed mail metadata, read from the working directory (not path_to_data):
# that is where process_all_body_column() writes it.
training_info = pd.read_csv('training_info_processed.csv', sep=',', header=0)

# Same processed layout for the test mails.
test_info = pd.read_csv('test_info_processed.csv', sep=',', header=0)

# Raw test file read from path_to_data.
# NOTE(review): presumably the same (sender, mail ids) layout as `training` — confirm.
test = pd.read_csv(path_to_data + 'test_set.csv', sep=',', header=0)

In [20]:
len(test_info)

2362

Notes :
Careful: there are duplicate 'mid' values in training_info and test_info.

In [21]:
################################
# create some handy structures #                    
################################
                            
# Convert the training set to a dictionary: sender -> list of mail ids (strings).
# Each row of `training` is read as (sender, space-separated mail ids).
emails_ids_per_sender = {}
for index, series in training.iterrows():
    row = series.tolist()
    sender = row[0]
    ids = row[1].split(' ')
    emails_ids_per_sender[sender] = ids

# all unique sender names
all_senders = emails_ids_per_sender.keys()

# Address book per sender: list of (recipient, count) pairs, most frequent first.
address_books = {}

# The three structures below are slow to build, so they are cached on disk.
# NOTE: the cache is never invalidated; delete the .pkl files after changing
# the training data. pickle.load can execute arbitrary code, so only load
# files that were written by this notebook.
cache_files = ('all_users.pkl', 'address_books.pkl', 'all_recs.pkl')

if all(os.path.isfile(cache_file) for cache_file in cache_files):
    with open('all_recs.pkl', 'rb') as fh:
        all_recs = pickle.load(fh)
    with open('all_users.pkl', 'rb') as fh:
        all_users = pickle.load(fh)
    with open('address_books.pkl', 'rb') as fh:
        address_books = pickle.load(fh)
else:
    for i, (sender, ids) in enumerate(emails_ids_per_sender.items()):
        recs_temp = []
        for my_id in ids:
            # 'mid' may be duplicated in training_info; the first match is used.
            recipients = training_info[training_info['mid']==int(my_id)]['recipients'].tolist()
            recipients = recipients[0].split(' ')
            # keep only legitimate email addresses
            recs_temp.extend(rec for rec in recipients if '@' in rec)

        # recipient counts, ordered by decreasing frequency
        address_books[sender] = Counter(recs_temp).most_common()

        # coarse progress indicator
        if i % 10 == 0:
            print (i)

    # save all unique recipient names
    all_recs = list(set([elt[0] for sublist in address_books.values() for elt in sublist]))

    # save all unique user names (senders and recipients)
    all_users = []
    all_users.extend(all_senders)
    all_users.extend(all_recs)
    all_users = list(set(all_users))

    # Context managers guarantee the files are flushed and closed.
    with open('all_recs.pkl', 'wb') as fh:
        pickle.dump(all_recs, fh)
    with open('all_users.pkl', 'wb') as fh:
        pickle.dump(all_users, fh)
    with open('address_books.pkl', 'wb') as fh:
        pickle.dump(address_books, fh)

In [22]:
# Construct the communication graphs of senders/receivers:
#   DG - directed graph, one edge per (sender, recipient) pair whose 'weight'
#        counts the mails sent along it;
#   MG - directed multigraph, one edge per delivered mail, carrying the mail
#        id in its 'email' attribute.

import networkx as nx
import pdb

# Cache files. The same names are used for the existence check, the load and
# the dump: the check previously looked for '*.text' while the dump wrote
# '*.txt', so the graphs were rebuilt on every run.
DG_path = 'DG.txt'
MG_path = 'MG.txt'

# check if it already exists
if os.path.isfile(DG_path) and os.path.isfile(MG_path):
    # Pickles must be opened in binary mode on Python 3.
    # NOTE: pickle.load can execute arbitrary code; only load files written
    # by this notebook. The cache is never invalidated automatically.
    with open(DG_path, 'rb') as fh:
        DG = pickle.load(fh)
    with open(MG_path, 'rb') as fh:
        MG = pickle.load(fh)

else:
    DG = nx.DiGraph()
    MG = nx.MultiDiGraph()

    for sender, ids in emails_ids_per_sender.items():
        DG.add_node(sender)
        MG.add_node(sender)
        for my_id in ids:
            # 'mid' may be duplicated in training_info; the first match is used.
            recipients = training_info[training_info['mid']==int(my_id)]['recipients'].tolist()
            recipients = recipients[0].split(' ')
            # keep only legitimate email addresses
            recipients = [rec for rec in recipients if "@" in rec]

            DG.add_nodes_from(recipients)
            MG.add_nodes_from(recipients)

            for recipient in recipients:
                MG.add_edge(sender, recipient, email = my_id)
                if DG.has_edge(sender, recipient):
                    # we added this one before, just increase the weight by one
                    DG[sender][recipient]['weight'] += 1
                else:
                    # new edge. add with weight=1
                    DG.add_edge(sender, recipient, weight = 1)

    # saving graphs
    print("Saving DG and MG")
    with open(DG_path, 'wb') as fh:
        pickle.dump(DG, fh)
    with open(MG_path, 'wb') as fh:
        pickle.dump(MG, fh)

Saving DG and MG


### NLTK pre-processing

In [23]:
# Corpus-wide token frequencies, counted with collections.Counter.
# Each 'pre_processed' cell is split on commas; joining the cells with the
# same separator lets a single split recover every token of the corpus.
big_string = training_info['pre_processed'].str.cat(sep=',')
fdistr = Counter(big_string.split(","))

In [40]:
def process_all_body_column(training_info, test=False):
    """Add an 'NNP' column to a mail frame and write the frame to CSV.

    'NNP' holds pre_process_to_string() applied to each 'body' value.
    The frame is modified in place, then written (index included) to
    "test_info_processed.csv" when ``test`` is true, otherwise to
    "training_info_processed.csv". Nothing is returned.
    """
    training_info['NNP'] = training_info['body'].apply(pre_process_to_string)
    destination = "test_info_processed.csv" if test else "training_info_processed.csv"
    training_info.to_csv(destination)

In [226]:
'''Mail steming'''
import re
import nltk
import string

nltk.download('punkt') # for tokenization
nltk.download('maxent_treebank_pos_tagger') # for POS tagging
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from nameparser.parser import HumanName


[nltk_data] Downloading package punkt to /home/benlet/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_treebank_pos_tagger to
[nltk_data]     /home/benlet/nltk_data...
[nltk_data]   Package maxent_treebank_pos_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/benlet/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/benlet/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [26]:
# Shared text-processing resources used by the pre-processing helpers.
# Characters treated as punctuation.
punct = string.punctuation
# Porter stemmer applied to the kept tokens.
stemmer = nltk.stem.PorterStemmer()
# English stopwords, as a set for fast membership tests.
stpwds = set(nltk.corpus.stopwords.words("english"))

In [206]:
def parse_mails_address_to_names(mail_address):
    """Turn the local part of a mail address into a comma-separated string.

    Everything from the first "@" (or space) onwards is dropped, then every
    punctuation character of the remaining part is replaced by a comma,
    e.g. 'amber.keenan@enron.com' -> 'amber,keenan'.
    """
    # "@" becomes a space so that a single split isolates the local part.
    spaced = "".join(" " if char == "@" else char for char in mail_address)
    local_part = spaced.split(" ")[0]
    return "".join("," if char in punct else char for char in local_part)

In [229]:
parse_mails_address_to_names('amber.keenan@enron.com').split(",")

['amber', 'keenan']

In [65]:
def pre_process(content):
    """Extract the stemmed, lower-cased proper nouns of a mail body.

    Parameters
    ----------
    content : str
        Raw text of the mail.

    Returns
    -------
    list of str
        Tokens tagged NNP or NNPS by nltk.pos_tag, lower-cased and
        Porter-stemmed, in order of appearance.
    """
    # Collapse every run of whitespace (newlines, tabs, ...) into one space.
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    content = re.sub(r"\s+", " ", content)

    # The text is NOT lower-cased before tagging.
    # NOTE(review): presumably because the tagger relies on capitalisation to
    # spot proper nouns — confirm. Lower-casing is done after tagging.

    # Replace every punctuation character (dashes included) by a space.
    # No punctuation is left afterwards, so no separate removal pass is needed.
    content = "".join(letter if letter not in punct else " " for letter in content)

    # Remove extra white space
    content = re.sub(" +", " ", content)
    # Remove leading and trailing white space
    content = content.strip()

    # Tokenize and remove stopwords. The comparison is case-sensitive.
    tokens_keep = [word for word in content.split() if word not in stpwds]
    # POS-tag
    tagged_tokens = nltk.pos_tag(tokens_keep)
    # Keep only proper nouns (singular NNP and plural NNPS)
    tokens_keep = [token for token, tag in tagged_tokens if tag in ("NNP", "NNPS")]

    # Convert to lower case
    tokens_keep = [token.lower() for token in tokens_keep]
    # Apply Porter stemmer
    tokens_keep = [stemmer.stem(token) for token in tokens_keep]
    return tokens_keep

def pre_process_to_string(content):
    """Return the tokens produced by pre_process() joined with commas."""
    tokens = pre_process(content)
    return ",".join(tokens)

In [66]:
# Adds the 'NNP' column to both frames in place and overwrites
# training_info_processed.csv / test_info_processed.csv in the working directory.
process_all_body_column(training_info)
process_all_body_column(test_info, test = True)

In [227]:
pre_process_to_string("Kevin/Bob: Here is a quick rundown on the Cons Legal has been assessing the risks of doing")

'kevin,bob,con,legal'

### Creating TF-IDF features

In [29]:
# Replace missing values by empty strings, in place, in both frames.
training_info.fillna("", inplace = True)
test_info.fillna("", inplace = True)
# Row counts, used below to split the stacked TF-IDF matrix back apart.
nb_training = training_info.shape[0]
nb_test = test_info.shape[0]
# Training mails first, test mails after: rows [0, nb_training) of anything
# built from all_mails belong to the training set.
all_mails = pd.concat([training_info['pre_processed'], test_info['pre_processed']])

In [30]:
'''TF-IDF on emails'''
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a single TF-IDF vocabulary on training and test mails together so that
# both sets share the same feature space.
vectorizer = TfidfVectorizer()
X_all_mails = vectorizer.fit_transform(all_mails)
# (number of mails, vocabulary size)
X_all_mails.shape

(45975, 118013)

In [83]:
# Reproducible random split of the training mails: 99% train / 1% CV.
np.random.seed(9001)
# Shuffled positional row indices of training_info.
mask = np.random.permutation(nb_training)
train_id = mask[:int(0.99*nb_training)]
cv_id = mask[int(0.99*nb_training):]
print (mask)

[33163 23917 32382 ..., 32286 21413 43065]


In [84]:

# Slice the stacked TF-IDF matrix with the same positional indices used for
# the frames, so that row i of X_mails_* describes row i of df_*.
X_mails_train = X_all_mails[train_id, :]
X_mails_cv = X_all_mails[cv_id, :]
# Test mails were stacked after the nb_training training rows.
X_mails_test = X_all_mails[nb_training:, :]
# reset_index() renumbers rows from 0 and keeps the old index as a column.
df_train = training_info.iloc[train_id].reset_index()
df_cv = training_info.iloc[cv_id].reset_index()
df_test = test_info.reset_index()
# Consistency check: one matrix row per CV mail.
len(df_cv) == X_mails_cv.shape[0]

True

In [33]:
from sklearn.metrics.pairwise import cosine_similarity
def cosine_df(df1, df2, idx1, idx2):
    """Cosine similarity between row ``idx1`` of ``df1`` and row ``idx2`` of ``df2``.

    Rows are selected by label with ``.loc``.
    """
    # cosine_similarity expects 2-D inputs, hence atleast_2d on each row.
    left = np.atleast_2d(df1.loc[idx1].values)
    right = np.atleast_2d(df2.loc[idx2].values)
    similarity = cosine_similarity(left, right)
    return similarity[0][0]

def cosine_sparse(X1, X2, idx1, idx2):
    """Cosine similarity between row ``idx1`` of ``X1`` and row ``idx2`` of ``X2``.

    Returns 0 without calling cosine_similarity when either row has no
    non-zero entry.
    """
    left_is_empty = is_csr_matrix_only_zeroes(X1[idx1])
    if left_is_empty or is_csr_matrix_only_zeroes(X2[idx2]):
        return 0
    return cosine_similarity(X1[idx1], X2[idx2])[0][0]

### Computing likelihoods 

In [34]:
'''Sender Likelihood'''
# Bound as the default `k` of p_fred_S_sachant_R when that function is defined.
k = 10
# Example (sender, receiver) pair used by the demonstration cells below.
sender = 'sylvia.hu@enron.com'
receiver = 'britt.davis@enron.com'

def total_incoming_mails(receiver_):
    """Return the summed 'weight' of every edge of DG that points to ``receiver_``."""
    total = 0
    for sender_ in DG.predecessors(receiver_):
        total += DG[sender_][receiver_]['weight']
    return total

# Dictionary: recipient -> total number of incoming mails, precomputed once
# so that the probability functions below are plain dictionary look-ups.
# Only the recipients listed in all_recs get an entry.
dict_incoming_mails = {}
for recipient in all_recs:
    dict_incoming_mails[recipient] = total_incoming_mails(recipient)
    

def p_fred_S_sachant_R(DG, sender, receiver, k=k):
    """Frequency estimate of P(sender | receiver).

    Weight of the edge sender -> receiver in ``DG`` divided by the total
    number of mails received by ``receiver`` (from dict_incoming_mails).
    ``k`` is accepted but not used.
    """
    mails_from_sender = DG[sender][receiver]['weight']
    mails_to_receiver = dict_incoming_mails[receiver]
    return mails_from_sender / mails_to_receiver
    
# co-occurence



In [35]:
p_fred_S_sachant_R(DG=DG, sender=sender, receiver=receiver, k=k)

0.4117647058823529

In [36]:
'''Recipient Likelihood'''

# number of emails received by Receiver / Total # of emails sent

def total_mail_sent(DG=DG):
    """Return the sum of the 'weight' attribute over every edge of ``DG``.

    Edges without a 'weight' attribute count as 1. An empty graph gives 0.

    ``DG.edges(data=..., default=...)`` replaces ``DG.edges_iter``, which was
    removed in networkx 2.0. The weights are summed directly rather than
    going through a string array and the removed ``np.int`` alias.
    """
    total = sum(int(weight) for _, _, weight in DG.edges(data='weight', default=1))
    # Returned as a NumPy integer, as the previous np.sum-based version did.
    return np.int64(total)

Total_emails_sent = total_mail_sent()

def p_R(DG, receiver):
    """Prior probability of ``receiver``.

    Mails received by ``receiver`` (from dict_incoming_mails) divided by
    Total_emails_sent. ``DG`` is accepted but not used.
    """
    received = dict_incoming_mails[receiver]
    return received / Total_emails_sent


In [37]:
%%time
p_R(DG=DG, receiver= receiver)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 18.8 µs


6.2967627231646784e-05

In [38]:
'''Email likelihood with TF_IDF'''


def p_e_sachant_r_s_tfidf(email_id, sender, receiver, cv_or_test= 'cv'):
    """Score how well a mail matches the sender -> receiver history.

    The score is the highest TF-IDF cosine similarity between the mail
    ``email_id`` and the past mails from ``sender`` to ``receiver`` found in
    MG, ignoring past mails that belong to the CV set.

    Parameters
    ----------
    email_id : id of the mail to score; looked up in df_test when
        ``cv_or_test == 'test'``, in df_cv otherwise.
    sender, receiver : nodes of MG.
    cv_or_test : 'test' to score a test mail, anything else for a CV mail.

    Returns
    -------
    (score, best_mid)
        score is 1e-15 and best_mid is None when there is no usable history;
        score is 1e-16 when the best similarity is exactly 0;
        otherwise score is the best similarity and best_mid the id of the
        past mail that reached it.
    """
    out = 1e-15
    best_cosine = None

    edge_data = MG.get_edge_data(sender, receiver)
    if not edge_data:
        return out, best_cosine

    # Every edge of the multigraph carries its attributes as a dict; flatten
    # them into the list of mails between sender and receiver.
    mail_list = [value for attributes in edge_data.values() for value in attributes.values()]
    # CV mails must not serve as history: drop them.
    mail_list = [mid for mid in mail_list if df_cv[df_cv['mid'] == int(mid)].empty]
    if not mail_list:
        return out, best_cosine

    # The mail being scored does not change inside the loop: locate its row
    # once instead of once per past mail.
    if cv_or_test == 'test':
        X_query = X_mails_test
        idx_query = df_test[df_test['mid'] == int(email_id)].index.values[0]
    else:
        X_query = X_mails_cv
        idx_query = df_cv[df_cv['mid'] == int(email_id)].index.values[0]

    mail_tf_idf_scores = []
    for mid in mail_list:
        idx_train = df_train[df_train['mid'] == int(mid)].index.values[0]
        mail_tf_idf_scores.append(cosine_sparse(X_mails_train, X_query, idx_train, idx_query))

    mail_tf_idf_scores = np.array(mail_tf_idf_scores)
    best_position = np.argmax(mail_tf_idf_scores)
    out = mail_tf_idf_scores[best_position]
    best_cosine = mail_list[best_position]
    # A zero similarity must stay strictly positive: f() takes its logarithm.
    if out == 0:
        out = 1e-16
    return out , best_cosine
    

In [39]:
%%time 
# p_e_sachant_r_s_tfidf(email_id, sender, receiver)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.25 µs


## Running Model

### Running the probabilistic model on CV Set 

In [24]:
# import time
# df_proba_cv =pd.DataFrame(columns=["mail_id","sender", "receiver","mid_best_cosine", "likelihood",'P(E/R,S)', 'P(S/E)', 'P(R)'])
# start_time = time.time()
# print("Starting")
# for index, row in df_cv.iterrows():
#     mail_id = df_cv.loc[index]['mid']
#     sender = training[training['mids'].str.contains(str(mail_id))]['sender'].values[0]
    
#     receiver_count = 0
#     for (receiver, _) in address_books[sender][:250]:
        

#         a2 = p_fred_S_sachant_R(DG, sender, receiver)
#         a3 = p_R(DG, receiver)
#         a1, mid_best_cosine = p_e_sachant_r_s_tfidf(mail_id,sender,receiver)
#         out = a1 * a2 * a3 
#         save_callback( df_proba_cv, mail_id, sender, receiver, mid_best_cosine, out, a1, a2, a3)
#     receiver_count += 1
#     print ("CV Mail %d over %d calculated in %d min" % (index, len(df_cv), (time.time() - start_time)/60 ))

# df_proba_cv.to_csv("df_proba_cv.csv", sep=',')

In [85]:
df_proba_cv = pd.read_csv("df_proba_cv.csv", delimiter=',')

In [157]:
top_receivers = 10
def f(weights = [1,1,1]):
    """Objective for the weight search: minus MAP@10 on the first 20 CV mails.

    For each mail, every candidate receiver stored in df_proba_cv is scored by
    the weighted negative log-likelihood

        w1 * -log P(E/R,S) + w2 * -log P(S/E) + w3 * -log P(R)

    and the ``top_receivers`` candidates with the lowest score are predicted.

    Parameters
    ----------
    weights : sequence of 3 numbers
        One weight per probability column, in the order above.

    Returns
    -------
    float
        The negated mean average precision at 10, so that minimising this
        function maximises the MAP.
    """
    factor_columns = ['P(E/R,S)', 'P(S/E)', 'P(R)']
    factor_weights = np.asarray(weights, dtype=float)

    mapk_predicted = []
    mapk_true = []
    for index, row in df_cv[:20].iterrows():
        # Positional access, as before.
        # NOTE(review): column 3 is presumably 'mid' — confirm and use the label.
        mid_cv = row.iloc[3]

        candidates = df_proba_cv[df_proba_cv['mail_id'] == int(mid_cv)][factor_columns]
        # Vectorised weighting: the weights broadcast over the three columns.
        # A row-wise apply returning arrays no longer yields a DataFrame on
        # recent pandas, which breaks the .sum(axis=1) below.
        loglikelihood = (- np.log(candidates)) * factor_weights

        loglikelihood['likelihood'] = loglikelihood.sum(axis = 1)
        # Lowest negative log-likelihood first, i.e. most likely first.
        loglikelihood = loglikelihood.sort_values(by='likelihood', ascending = True)
        receiver_list_predicted = df_proba_cv.loc[loglikelihood.index][:top_receivers]['receiver'].values

        receiver_list_true = training_info[training_info['mid'] ==mid_cv]['recipients'].values[0].split(" ")

        mapk_predicted.append(receiver_list_predicted)
        mapk_true.append(receiver_list_true)

    return -mapk(mapk_true, mapk_predicted, k=10)
            

In [161]:
from scipy.optimize import minimize

# Search for the weights that minimise f, i.e. maximise MAP@10, starting from
# equal weights. Nelder-Mead is a derivative-free method.
weights_ini  = [1, 1, 1]
best_weights = minimize(f, weights_ini, method='Nelder-Mead')

In [162]:
print(- f(best_weights['x']))
print (best_weights['x'])

0.546458333333
[ 1.16822131  0.81410608  0.83203018]


In [160]:
- f([1, 1, 1])

0.50493055555555555

## On Test Set

In [80]:
# import time
# df_proba_test =pd.DataFrame(columns=["mail_id","sender", "receiver","mid_best_cosine", "likelihood",'P(E/R,S)', 'P(S/E)', 'P(R)'])
# start_time = time.time()
# print("Starting")
# for index, row in test.iterrows():
#     name_ids = row.tolist()
#     sender = name_ids[0]
#     # get IDs of the emails for which recipient prediction is needed
#     mids_predict = name_ids[1].split(' ')
#     mids_predict = [int(my_id) for my_id in mids_predict]
    
#     receiver_count = 0
#     for (receiver, _) in address_books[sender][:250]:
#         for mail_id in mids_predict:
            
#             a2 = p_fred_S_sachant_R(DG, sender, receiver)
#             a3 = p_R(DG, receiver)
#             a1, mid_best_cosine = p_e_sachant_r_s_tfidf(mail_id,sender,receiver, 'test')
#             out = a1 * a2 * a3 
#             save_callback( df_proba_test, mail_id, sender, receiver, mid_best_cosine, out, a1, a2, a3)
#         receiver_count += 1
#     print ("Test Sender %d over %d calculated in %d min" % (index, len(df_test), (time.time() - start_time)/60 ))

# df_proba_test.to_csv("df_proba_test.csv", sep=',')

In [163]:
df_proba = pd.read_csv('df_proba_test.csv')

In [164]:
# from IPython.display import Audio
# sound_file = 'reveil.mp3'
# Audio(url=sound_file, autoplay=True)

In [165]:
def multiplylikelihood(weights = [1,1,1]):
    """Recompute the 'likelihood' column of the global df_proba in place.

    Each of the three probability columns is raised to its weight and the
    results are multiplied row by row. Nothing is returned.
    """
    factor_columns = ['P(E/R,S)', 'P(S/E)', 'P(R)']
    weighted_factors = df_proba[factor_columns].pow(weights)
    df_proba['likelihood'] = weighted_factors.prod(axis = 1)
    
    

In [167]:
multiplylikelihood(weights = [ 1.16,  0.81, 0.83])

In [168]:
# Drop the candidate rows in which the receiver is the sender itself.
df_proba = df_proba.drop(df_proba[(df_proba['receiver'] == df_proba['sender']) ].index)

In [76]:
# df_proba[df_proba['mail_id'] == 298389.0].sort_values(['likelihood'], ascending = False)

In [77]:
# df_proba[(df_proba['mail_id'] == 298389)  \
# #         & (df_proba['sender'] == 'karen.buckley@enron.com') \
# #         & (df_proba['receiver'] == 'c..aucoin@enron.com')  \
# #         & (df_proba['likelihood'] == 0) 
#         ]


In [169]:
submission = pd.read_csv(path_to_data + 'predictions_random.txt')
# df_proba[df_proba['mail_id'] == 298389.0].sort_values(['likelihood'], ascending = False)

In [170]:
# df_proba[df_proba['mail_id'] == mail_id].sort_values(['likelihood'], ascending = False)[:10]['receiver'].values

In [171]:
# Fill the submission template: for every mail, keep the 10 candidate
# receivers with the highest likelihood, joined by single spaces.
for index, row in submission.iterrows():
    # NOTE(review): positional access; the first column is presumably the mail id — confirm.
    mail_id = row[0]
    # Candidates for this mail, most likely first, truncated to 10.
    receivers_id = df_proba[df_proba['mail_id'] == mail_id].\
        sort_values(['likelihood'], ascending = False)[:10]['receiver'].values
    receiver_list = " ".join(receivers_id)
    
    submission.loc[index, 'recipients'] = receiver_list
# Written without the index column.
submission.to_csv('submission_test.txt', index=False)

In [71]:
def top_tfidf_feats(row, features, top_n=25):
    """Return the ``top_n`` largest values of ``row`` with their feature names.

    The result is a DataFrame with the columns 'feature' and 'tfidf',
    ordered by decreasing value.
    """
    # argsort is ascending; reverse it to walk from the largest value down.
    ranked_positions = np.argsort(row)[::-1]
    pairs = []
    for position in ranked_positions[:top_n]:
        pairs.append((features[position], row[position]))
    table = pd.DataFrame(pairs)
    table.columns = ['feature', 'tfidf']
    return table

In [72]:
def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    """Return the top TF-IDF features of one document (row ``row_id`` of ``Xtr``)."""
    # Densify the single sparse row and drop its leading dimension.
    dense_row = Xtr[row_id].toarray()
    return top_tfidf_feats(np.squeeze(dense_row), features, top_n)

In [170]:
top_feats_in_doc(X_mails_cv, vectorizer.get_feature_names(), 300, 1)

Unnamed: 0,feature,tfidf
0,z3,0.433911
