In [33]:
import random
import numpy as np
import operator
import pandas as pd
from collections import Counter
import sys
import pickle
import os
import matplotlib.pyplot as plt
%matplotlib inline

### Useful fns 

In [34]:
def save_callback(df, mail_id, sender, receiver, mid_best_cosine, likelihood, p1, p2, p3 ):
    """Store one scored (mail, receiver) pair as a new row of ``df``, in place.

    The row is first written under the temporary label -1, then every index
    label is shifted up by one. As a result the most recently saved row always
    carries label 0 while sitting at the bottom of the frame.

    ``df`` must have exactly eight columns; they are filled in this order:
    mail_id, sender, receiver, mid_best_cosine, likelihood, p1, p2, p3.
    Nothing is returned.
    """
    new_row = [mail_id, sender, receiver, mid_best_cosine, likelihood, p1, p2, p3]
    df.loc[-1] = new_row
    df.index = df.index + 1
    
def is_csr_matrix_only_zeroes(my_csr_matrix):
    """Tell whether ``my_csr_matrix`` holds no non-zero entry at all.

    Returns True when ``nonzero()`` reports no row index, False otherwise.
    """
    nonzero_rows = my_csr_matrix.nonzero()[0]
    return not len(nonzero_rows)

def apk(actual, predicted, k=10):
    """
    Average precision at k between two lists of items.

    Only the first ``k`` entries of ``predicted`` are scored. A prediction is
    a hit when it belongs to ``actual`` and was not already proposed at an
    earlier rank; each hit adds (number of hits so far) / (its 1-based rank).
    The sum is divided by min(len(actual), k). Returns 0.0 when ``actual`` is
    empty.
    """
    if len(predicted) > k:
        predicted = predicted[:k]

    hits = 0.0
    precision_sum = 0.0
    for rank, candidate in enumerate(predicted, start=1):
        if candidate not in actual:
            continue
        # a repeated prediction must not be rewarded twice
        if candidate in predicted[:rank - 1]:
            continue
        hits += 1.0
        precision_sum += hits / rank

    if not actual:
        return 0.0

    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Mean average precision at k.

    ``actual`` and ``predicted`` are parallel lists of lists; they are walked
    pairwise, ``apk`` is evaluated on each pair with the same ``k``, and the
    mean of those scores is returned.
    """
    per_query_scores = []
    for truth, guess in zip(actual, predicted):
        per_query_scores.append(apk(truth, guess, k))
    return np.mean(per_query_scores)

def read_mail(mid, col_id = 'pre_processed'):
    """Fetch one column of a mail, looked up by mail id.

    The id is searched in df_cv, then df_train, then df_test (module-level
    frames); the value of column ``col_id`` from the first matching row is
    returned. When the id is in none of them, the placeholder string
    "Mail non trouvé... " (French for "mail not found") is returned instead.
    """
    hits = df_cv[df_cv['mid'] == int(mid)]
    if not hits.empty:
        return hits[col_id].values[0]

    hits = df_train[df_train['mid'] == int(mid)]
    if not hits.empty:
        return hits[col_id].values[0]

    hits = df_test[df_test['mid'] == int(mid)]
    if not hits.empty:
        return hits[col_id].values[0]

    return "Mail non trouvé... "

### Loading data and graphs

In [35]:
# Folder holding the raw input files
path_to_data = "Data/"

##########################
# load some of the files #
##########################

# Raw tables read from the data folder; each row is unpacked by later cells
# as a sender followed by a space-separated string of mail ids
training = pd.read_csv(os.path.join(path_to_data, 'training_set.csv'), sep=',', header=0)
test = pd.read_csv(os.path.join(path_to_data, 'test_set.csv'), sep=',', header=0)

# Per-mail tables, read from the working directory (these are the file names
# that process_all_body_column writes)
training_info = pd.read_csv('training_info_processed.csv', sep=',', header=0)
test_info = pd.read_csv('test_info_processed.csv', sep=',', header=0)

In [36]:
################################
# create some handy structures #
################################

# convert training set to dictionary: sender -> list of mail ids (kept as strings)
emails_ids_per_sender = {}
for index, series in training.iterrows():
    row = series.tolist()
    sender = row[0]
    ids = row[1:][0].split(' ')
    emails_ids_per_sender[sender] = ids

# save all unique sender names
all_senders = emails_ids_per_sender.keys()

# create address book with frequency information for each user:
# sender -> list of (recipient, number of mails), most frequent first
address_books = {}
i = 0

if os.path.isfile('all_users.pkl') and os.path.isfile('address_books.pkl') and os.path.isfile('all_recs.pkl'):
    # Reload the cached structures, so as to avoid the computation each time
    # the notebook is opened.
    # NOTE(review): pickle.load can execute arbitrary code; only load cache
    # files that this notebook wrote itself.
    with open('all_recs.pkl', 'rb') as cache_file:
        all_recs = pickle.load(cache_file)
    with open('all_users.pkl', 'rb') as cache_file:
        all_users = pickle.load(cache_file)
    with open('address_books.pkl', 'rb') as cache_file:
        address_books = pickle.load(cache_file)
else:
    for sender, ids in emails_ids_per_sender.items():
        # every recipient of every mail of this sender, repetitions included
        recs_temp = []
        for my_id in ids:
            recipients = training_info[training_info['mid']==int(my_id)]['recipients'].tolist()
            recipients = recipients[0].split(' ')
            # keep only legitimate email addresses
            recs_temp.extend(rec for rec in recipients if '@' in rec)

        # compute recipient counts
        rec_occ = dict(Counter(recs_temp))
        # order by frequency
        sorted_rec_occ = sorted(rec_occ.items(), key=operator.itemgetter(1), reverse = True)
        # save
        address_books[sender] = sorted_rec_occ

        # progress indicator: one line every 10 senders
        if i % 10 == 0:
            print (i)
        i += 1

    # save all unique recipient names
    all_recs = list(set([elt[0] for sublist in address_books.values() for elt in sublist]))

    # save all unique user names
    all_users = []
    all_users.extend(all_senders)
    all_users.extend(all_recs)
    all_users = list(set(all_users))

    # cache the three structures for the next session; the handles are closed
    # explicitly so the files are fully flushed
    with open('all_recs.pkl', 'wb') as cache_file:
        pickle.dump(all_recs, cache_file)
    with open('all_users.pkl', 'wb') as cache_file:
        pickle.dump(all_users, cache_file)
    with open('address_books.pkl', 'wb') as cache_file:
        pickle.dump(address_books, cache_file)

In [38]:
# Construct the communication graph of senders/receivers

import networkx as nx
import pdb

# The same two paths are used for the cache check, the load and the save.
# Previously the check looked for '*.text' while the save wrote '*.txt', so the
# cache was never found.
DG_path = 'DG.txt' # communication graph: one weighted edge per (sender, recipient) pair
MG_path = 'MG.txt' # multigraph: one edge per (sender, recipient, mail), carrying the mail id

if os.path.isfile(DG_path) and os.path.isfile(MG_path):
    # Reload the cached graphs, so as to avoid the computation each time the
    # notebook is opened. Pickles must be opened in binary mode.
    # NOTE(review): pickle.load can execute arbitrary code; only load cache
    # files that this notebook wrote itself.
    with open(DG_path, 'rb') as graph_file:
        DG = pickle.load(graph_file)
    with open(MG_path, 'rb') as graph_file:
        MG = pickle.load(graph_file)

else:
    DG = nx.DiGraph()
    MG = nx.MultiDiGraph()

    for sender, ids in emails_ids_per_sender.items():
        DG.add_node(sender)
        MG.add_node(sender)
        for my_id in ids:
            recipients = training_info[training_info['mid']==int(my_id)]['recipients'].tolist()
            recipients = recipients[0].split(' ')
            # keep only legitimate email addresses
            recipients = [rec for rec in recipients if "@" in rec]

            DG.add_nodes_from(recipients)
            MG.add_nodes_from(recipients)

            for recipient in recipients:
                # MG keeps one parallel edge per mail, tagged with its id
                MG.add_edge(sender, recipient, email = my_id)
                if DG.has_edge(sender, recipient):
                    # we added this one before, just increase the weight by one
                    DG[sender][recipient]['weight'] += 1
                else:
                    # new edge. add with weight=1
                    DG.add_edge(sender, recipient, weight = 1)

    # saving graphs
    print("Saving DG and MG")
    with open(DG_path, 'wb') as graph_file:
        pickle.dump(DG, graph_file)
    with open(MG_path, 'wb') as graph_file:
        pickle.dump(MG, graph_file)

Saving DG and MG


### Pre-processing mail bodies with NLTK

In [43]:
# # # from nltk import FreqDist
# big_string = training_info['pre_processed'].str.cat(sep=',')
# fdistr = Counter(big_string.split(","))

In [44]:
'''Mail steming imports'''
import re
import nltk
import string

nltk.download('punkt') # for tokenization
nltk.download('maxent_treebank_pos_tagger') # for POS tagging
nltk.download('stopwords') # stopwords
nltk.download('averaged_perceptron_tagger') # a POS tagger model, not a stemmer (presumably the one nltk.pos_tag loads - TODO confirm)

[nltk_data] Downloading package punkt to /home/benlet/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_treebank_pos_tagger to
[nltk_data]     /home/benlet/nltk_data...
[nltk_data]   Package maxent_treebank_pos_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/benlet/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/benlet/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [45]:
# Characters that the pre-processing functions replace by a space
punct = string.punctuation
# Porter stemmer applied to the tokens kept by pre_process
stemmer = nltk.stem.PorterStemmer()
# English stop words, stored as a set for the membership tests in pre_process
stpwds = set(nltk.corpus.stopwords.words("english"))

In [52]:
def pre_process(content):
    """Turn a raw mail body into a list of stemmed noun / adjective tokens.

    Steps, in order: collapse whitespace, lower-case, replace every character
    of ``punct`` by a space, drop the stop words in ``stpwds``, POS-tag what is
    left, keep nouns and adjectives only, and stem them with ``stemmer``.

    content : the mail body, as a string.
    Returns the list of stemmed tokens (possibly empty).

    Note: a later cell of this notebook defines another ``pre_process``; once
    that cell has run, this version is shadowed.
    """
    # Remove formatting: any run of whitespace (newline, tab, ...) becomes one space
    content = re.sub(r"\s+", " ", content)
    # Convert to lower case
    content = content.lower()
    # Replace punctuation by space. Dashes belong to `punct`, so hyphenated
    # words are split in two. One pass is enough: no punctuation character can
    # be left afterwards.
    content = "".join(letter if letter not in punct else " " for letter in content)
    # Remove extra white space
    content = re.sub(" +", " ", content)
    # Remove leading and trailing white space
    content = content.strip()
    # Tokenize and stopword removal
    tokens_keep = [word for word in content.split() if word not in stpwds]
    # POS-tag
    tagged_tokens = nltk.pos_tag(tokens_keep)
    # Keep only nouns and adjectives
    tokens_keep = [pair[0] for pair in tagged_tokens if (pair[1] in ["NN","NNS","NNP","NNPS","JJ","JJS","JJR"])]
    # Apply Porter stemmer
    tokens_keep = [stemmer.stem(token) for token in tokens_keep]
    return tokens_keep

def pre_process_to_string(content):
    """Run ``pre_process`` on ``content`` and join the resulting tokens with commas."""
    tokens = pre_process(content)
    return ",".join(tokens)

In [53]:
def process_all_body_column(df, new_col_name, test=False):
    '''Pre-process every mail body of ``df`` and persist the result.

    Column ``new_col_name`` is created (or overwritten) on ``df`` with the
    output of ``pre_process_to_string`` applied to each value of the 'body'
    column. The whole frame is then written to "test_info_processed.csv" when
    ``test`` is truthy, and to "training_info_processed.csv" otherwise.

    ``df`` is modified in place; nothing is returned.
    '''
    df[new_col_name] = df['body'].apply(pre_process_to_string)
    target_csv = "test_info_processed.csv" if test else "training_info_processed.csv"
    df.to_csv(target_csv)

In [17]:
'''Launch pre-processing'''
# process_all_body_column(training_info, 'pre_processed')
# process_all_body_column(test_info, 'pre_processed', test = True)

### First names features

In [54]:
def pre_process(content, naming= False):
    '''From a string, pre-process it to a list of proper-noun tokens.

    Steps, in order: collapse whitespace, lower-case, replace every character
    of ``punct`` by a space, drop the stop words in ``stpwds``, POS-tag what is
    left and keep only the tokens tagged NNP or NNPS.

    content : the text to process, as a string.
    naming  : unused; kept so that existing calls stay valid.
    Returns the list of kept tokens, lower-cased (possibly empty).

    This definition replaces the earlier ``pre_process`` of this notebook
    (which kept nouns and adjectives and stemmed them).

    NOTE(review): the text is lower-cased before POS tagging, so the tagger
    sees no capitalisation when NNP/NNPS are selected - confirm this is intended.
    '''
    # Collapse any run of whitespace (newline, tab, ...) into one space
    content = re.sub(r"\s+", " ", content)

    # Convert to lower case
    content = content.lower()

    # Replace punctuation by space. Dashes belong to `punct`, so hyphenated
    # words are split in two. One pass is enough: no punctuation character can
    # be left afterwards.
    content = "".join(letter if letter not in punct else " " for letter in content)

    # Remove extra white space
    content = re.sub(" +", " ", content)
    # Remove leading and trailing white space
    content = content.strip()
    # Tokenize and stopword removal
    tokens_keep = [word for word in content.split() if word not in stpwds]
    # POS-tag
    tagged_tokens = nltk.pos_tag(tokens_keep)
    # Keep only proper nouns
    tokens_keep = [pair[0] for pair in tagged_tokens if (pair[1] in ["NNP","NNPS"])]

    # Lower-case the kept tokens
    tokens_keep = [token.lower() for token in tokens_keep]
    return tokens_keep

def pre_process_to_string(content):
    """Run ``pre_process`` on ``content`` and join the resulting tokens with commas.

    Same body as the earlier definition of this name in the notebook.
    """
    tokens = pre_process(content)
    return ",".join(tokens)

In [55]:
'''Launch pre-processing for names'''
# process_all_body_column(training_info, 'NNP')
# process_all_body_column(test_info,'NNP', test = True)

'Launch pre-processing for names'

In [57]:
def parse_mails_address_to_names(mail_address):
    '''Extract the name part of an e-mail address as a comma-separated string.

    Keeps what precedes the first '@' (or the first space, whichever comes
    first) and replaces every character of ``punct`` in it by a comma.

    eg : 'amber.keenan@enron.com' gives 'amber,keenan'; split the result on ","
    to obtain ['amber', 'keenan'].
    '''
    local_part = mail_address.replace("@", " ").split(" ")[0]
    return "".join("," if char in punct else char for char in local_part)

In [58]:
parse_mails_address_to_names('amber.keenan@enron.com').split(",")

['amber', 'keenan']

### Extracting TF-IDF features for emails

In [59]:
# Replace missing values by empty strings, in place: both frames are modified
# for every later cell as well
training_info.fillna("", inplace = True)
test_info.fillna("", inplace = True)
# Row counts, used further down to split the stacked TF-IDF matrix
nb_training = training_info.shape[0]
nb_test = test_info.shape[0]

# stack the pre-processed bodies for the TF-IDF feature extractor:
# training mails first, test mails after
all_mails = pd.concat([training_info['pre_processed'], test_info['pre_processed']])

In [60]:
'''TF-IDF on emails
   We use TF-IDF extractor from sklearn
'''

from sklearn.feature_extraction.text import TfidfVectorizer

# Fitted on training and test bodies together, so both are expressed in the
# same vocabulary
vectorizer = TfidfVectorizer()
X_all_mails = vectorizer.fit_transform(all_mails)
# displayed as the cell output
X_all_mails.shape

(45975, 118013)

In [63]:
'''Creating CV set'''
# Fixed seed, so that the train / CV split is reproducible
np.random.seed(9001)
mask = np.random.permutation(nb_training)

# First 99% of the shuffled training rows are used as train set, the rest as CV set
n_train_rows = int(0.99*nb_training)
train_id, cv_id = mask[:n_train_rows], mask[n_train_rows:]

# TF-IDF rows: training mails come first in X_all_mails, test mails after them
X_mails_train = X_all_mails[train_id, :]
X_mails_cv = X_all_mails[cv_id, :]
X_mails_test = X_all_mails[nb_training:, :]

# Matching frames, re-indexed from 0 so that a frame index is also the row
# number in the corresponding TF-IDF matrix
df_train = training_info.iloc[train_id].reset_index()
df_cv = training_info.iloc[cv_id].reset_index()
df_test = test_info.reset_index()

In [64]:
from sklearn.metrics.pairwise import cosine_similarity

def cosine_sparse(X1, X2, idx1, idx2):
    '''Cosine similarity between row ``idx1`` of ``X1`` and row ``idx2`` of ``X2``.

    Both matrices hold TF-IDF representations of e-mails, one mail per row.
    Returns 0 as soon as one of the two rows contains only zeros; otherwise
    returns the single value computed by ``cosine_similarity``.
    '''
    row_a = X1[idx1]
    row_b = X2[idx2]
    if is_csr_matrix_only_zeroes(row_a) or is_csr_matrix_only_zeroes(row_b):
        return 0
    return cosine_similarity(row_a, row_b)[0][0]

### Computing likelihoods 

#### 1 _ Sender Likelihood P(S/R)

In [70]:
def total_incoming_mails(receiver_):
    """Sum the 'weight' of every edge of the global graph DG that points to ``receiver_``."""
    incoming_weights = (DG[sender_][receiver_]['weight']
                        for sender_ in DG.predecessors(receiver_))
    return sum(incoming_weights)

# dictionary of incoming mails per receiver: recipient -> total weight of the
# DG edges pointing to them, computed once here so that the likelihood
# functions below only do a lookup
dict_incoming_mails = {}
for recipient in all_recs:
    dict_incoming_mails[recipient] = total_incoming_mails(recipient)
    

def p_fred_S_sachant_R(DG, sender, receiver, k=None):
    """Sender likelihood P(S/R).

    Computed as the weight of the DG edge sender -> receiver divided by the
    total weight of the edges pointing to ``receiver`` (read from
    ``dict_incoming_mails``).

    DG       : graph indexable as DG[sender][receiver]['weight'].
    sender   : sender address; the edge sender -> receiver must exist in DG.
    receiver : receiver address; must be a key of ``dict_incoming_mails``.
    k        : unused, kept so that existing calls passing ``k`` stay valid.
               It used to default to a global ``k`` that is not defined in the
               visible notebook, which made this definition fail with a
               NameError on a fresh kernel.

    Raises KeyError when the edge or the receiver is unknown.
    """
    return DG[sender][receiver]['weight'] / dict_incoming_mails[receiver]
    

In [68]:
# test
# (`k` is not passed any more: no global `k` is defined in the visible notebook
# and the function does not use it)
sender = 'sylvia.hu@enron.com'
receiver = 'britt.davis@enron.com'
p_fred_S_sachant_R(DG=DG, sender=sender, receiver=receiver)

0.4117647058823529

#### 2 _ Recipient Likelihood P(R)

In [71]:
'''Recipient Likelihood'''

# number of emails received by Receiver / Total # of emails sent

def total_mail_sent(DG=None):
    """Total of the 'weight' attribute over all edges of a graph.

    DG : graph exposing ``edges(data='weight', default=1)``, i.e. yielding
         (source, target, weight) triples; an edge without 'weight' counts
         for 1. When omitted, the module-level ``DG`` is looked up at call
         time (it used to be frozen at definition time).

    Returns the sum as an int, 0 for a graph without edges.

    ``edges(...)`` is used instead of ``edges_iter(...)``, which newer
    networkx versions no longer provide, and the built-in ``int`` replaces
    the removed ``np.int`` alias.
    """
    graph = globals()["DG"] if DG is None else DG
    return sum(int(weight) for _, _, weight in graph.edges(data='weight', default=1))

# Computed once: denominator shared by every call to p_R
Total_emails_sent = total_mail_sent()

def p_R(DG, receiver):
    """Recipient likelihood P(R).

    Number of mails received by ``receiver`` (from ``dict_incoming_mails``)
    divided by ``Total_emails_sent``. ``DG`` is accepted but not used.
    """
    received = dict_incoming_mails[receiver]
    return received / Total_emails_sent


In [72]:
%%time
p_R(DG=DG, receiver= receiver)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 576 µs


6.2967627231646784e-05

#### 3 _ Email Likelihood P(E/R,S) - TFIDF

In [75]:
'''Email likelihood with TF_IDF'''

def p_e_sachant_r_s_tfidf(email_id, sender, receiver, cv_or_test= 'cv'):
    '''
    Email likelihood P(E/R,S), estimated from TF-IDF similarity.

    Mail ``email_id`` is compared with every mail stored in MG on the
    sender -> receiver edges, except the mails that belong to the CV split.

    Parameters:
        email_id   : id of the mail being scored; looked up in df_test when
                     cv_or_test == 'test', in df_cv otherwise.
        sender, receiver : endpoints of the MG edges to inspect.
        cv_or_test : 'test' to score a test mail, any other value for a CV mail.

    Returns a tuple (score, best_mid):
        - (1e-15, None) when no usable mail exists between the two users;
        - otherwise the maximum cosine similarity (replaced by 1e-16 when it
          is exactly 0) and the id, as stored in MG, of the mail reaching it.
    '''
    out = 1e-15
    best_cosine = None

    edge_data = MG.get_edge_data(sender, receiver)
    if not edge_data:
        return out, best_cosine

    # list all mails between sender and receiver (every attribute value of
    # every parallel edge), leaving out the ones held out for CV
    cv_mids = set(df_cv['mid'].tolist())
    mail_list = [mid
                 for attrs in edge_data.values()
                 for mid in attrs.values()
                 if int(mid) not in cv_mids]
    if not mail_list:
        return out, best_cosine

    # The row of the scored mail is the same for the whole loop: look it up once
    if cv_or_test == 'test':
        X_target = X_mails_test
        idx_target = df_test[df_test['mid'] == int(email_id)].index.values[0]
    else:
        X_target = X_mails_cv
        idx_target = df_cv[df_cv['mid'] == int(email_id)].index.values[0]

    mail_tf_idf_scores = []
    for mid in mail_list:
        idx_train = df_train[df_train['mid'] == int(mid)].index.values[0]
        mail_tf_idf_scores.append(cosine_sparse(X_mails_train, X_target, idx_train, idx_target))

    scores = np.array(mail_tf_idf_scores)
    out = np.max(scores)
    best_cosine = mail_list[np.argmax(scores)]
    if out == 0:
        # keeps the value strictly positive for the later logarithm / product
        out = 1e-16
    return out, best_cosine
    

In [76]:
%%time 
# p_e_sachant_r_s_tfidf(email_id, sender, receiver)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.68 µs


## Running Model

### Running the probabilistic model on CV Set 

In [30]:
# import time
# df_proba_cv =pd.DataFrame(columns=["mail_id","sender", "receiver","mid_best_cosine", "likelihood",'P(E/R,S)', 'P(S/E)', 'P(R)'])
# start_time = time.time()
# print("Starting")
# for index, row in df_cv.iterrows():
#     mail_id = df_cv.loc[index]['mid']
#     sender = training[training['mids'].str.contains(str(mail_id))]['sender'].values[0]
    
#     receiver_count = 0
#     for (receiver, _) in address_books[sender][:250]:
        

#         a2 = p_fred_S_sachant_R(DG, sender, receiver)
#         a3 = p_R(DG, receiver)
#         a1, mid_best_cosine = p_e_sachant_r_s_tfidf(mail_id,sender,receiver)
#         out = a1 * a2 * a3 
#         save_callback( df_proba_cv, mail_id, sender, receiver, mid_best_cosine, out, a1, a2, a3)
#     receiver_count += 1
#     print ("CV Mail %d over %d calculated in %d min" % (index, len(df_cv), (time.time() - start_time)/60 ))

# df_proba_cv.to_csv("df_proba_cv.csv", sep=',')

In [77]:
# df_proba_cv = pd.read_csv("df_proba_cv.csv", delimiter=',')

In [83]:
# df_proba_cv

In [84]:
top_receivers = 10
def f(weights = [1,1,1]):
    """Objective used to tune the likelihood weights: negative MAP@10 on CV mails.

    For each of the first 20 rows of df_cv, the candidate receivers stored in
    df_proba_cv are ranked by the weighted sum of the negative logs of
    'P(E/R,S)', 'P(S/E)' and 'P(R)' (smallest first); the first
    ``top_receivers`` are compared with the true recipients found in
    training_info.

    weights : three multipliers, one per probability column, in that order.
    Returns minus the mean average precision at 10, so that a minimiser
    maximises the MAP.

    Requires the module-level frame df_proba_cv; in this notebook the cells
    that build or load it are commented out.
    """
    weights_arr = np.asarray(weights)
    mapk_predicted = []
    mapk_true = []
    for index, row in df_cv[:20].iterrows():
        # NOTE(review): positional access - assumes the 5th column of df_cv
        # holds the mail id; confirm against the CSV layout
        mid_cv = row.iloc[4]

        selected_columns = df_proba_cv[df_proba_cv['mail_id'] == int(mid_cv)][['P(E/R,S)', 'P(S/E)', 'P(R)']]
        # Column-wise weighting by broadcasting. A row-wise apply() returning
        # arrays does not give back a DataFrame on every pandas version, which
        # breaks the column assignment below.
        loglikelihood = (-np.log(selected_columns)) * weights_arr

        loglikelihood['likelihood'] = loglikelihood.sum(axis = 1)
        loglikelihood = loglikelihood.sort_values(by='likelihood', ascending = True)
        receiver_list_predicted = df_proba_cv.loc[loglikelihood.index][:top_receivers]['receiver'].values

        receiver_list_true = training_info[training_info['mid'] ==mid_cv]['recipients'].values[0].split(" ")
        mapk_predicted.append(receiver_list_predicted)
        mapk_true.append(receiver_list_true)

    return -mapk(mapk_true, mapk_predicted, k=10)
            

In [86]:
# from scipy.optimize import minimize

# weights_ini  = [1, 1, 1]
# best_weights = minimize(f, weights_ini, method='Nelder-Mead')

In [162]:
print(- f(best_weights['x']))
print (best_weights['x'])

0.546458333333
[ 1.16822131  0.81410608  0.83203018]


In [160]:
- f([1, 1, 1])

0.50493055555555555

## On Test Set

In [80]:
import time

proba_columns = ["mail_id","sender", "receiver","mid_best_cosine", "likelihood",'P(E/R,S)', 'P(S/E)', 'P(R)']
# Rows are collected in a plain list and the frame is built once at the end;
# growing a DataFrame one row at a time copies it on every insertion.
proba_records = []
start_time = time.time()
print("Starting")
for index, row in test.iterrows():
    name_ids = row.tolist()
    sender = name_ids[0]
    # get IDs of the emails for which recipient prediction is needed
    mids_predict = [int(my_id) for my_id in name_ids[1].split(' ')]

    # candidates: the 250 most frequent recipients of this sender
    for (receiver, _) in address_books[sender][:250]:
        # these two terms do not depend on the mail, compute them once per receiver
        a2 = p_fred_S_sachant_R(DG, sender, receiver)
        a3 = p_R(DG, receiver)
        for mail_id in mids_predict:
            a1, mid_best_cosine = p_e_sachant_r_s_tfidf(mail_id,sender,receiver, 'test')
            out = a1 * a2 * a3
            proba_records.append([mail_id, sender, receiver, mid_best_cosine, out, a1, a2, a3])
    # `index` counts senders, so the total reported is the number of rows of `test`
    print ("Test Sender %d over %d calculated in %d min" % (index, len(test), (time.time() - start_time)/60 ))

df_proba_test = pd.DataFrame(proba_records, columns=proba_columns)
# Same labels as row-by-row saving with save_callback would give (newest row = 0),
# so the index column written to the CSV keeps its meaning
df_proba_test.index = range(len(df_proba_test) - 1, -1, -1)

df_proba_test.to_csv("df_proba_test.csv", sep=',')

In [21]:
df_proba_test = pd.read_csv('df_proba_test.csv')

In [22]:
# removing of addresses with '..' inside
# NOTE(review): 'P(S/E)' is compared with True and then set to False although
# it is filled with a probability elsewhere in this notebook - confirm which
# column was meant.
# NOTE(review): 'name_at_beginning' and 'name_in_body' are not created by any
# visible cell; they are presumably present in the CSV loaded above - verify.
ix_to_false = df_proba_test[ (df_proba_test['P(S/E)'] == True) & (df_proba_test['receiver'].str.contains(r"\.\."))].index
df_proba_test.loc[ix_to_false, 'P(S/E)'] = False

# Turn both boolean flags into 0 / 1. The result is assigned back instead of
# using inplace=True on a selected column, and 'name_at_beginning' now gets its
# True -> 1 mapping too (the last replace used to repeat 'name_in_body').
for flag_col in ('name_at_beginning', 'name_in_body'):
    df_proba_test[flag_col] = df_proba_test[flag_col].replace({False: 0, True: 1})

df_proba = df_proba_test

In [87]:
def multiplylikelihood(weights = [1,1,1]):
    '''Recompute df_proba['likelihood'] with tunable exponents.

    'P(E/R,S)', 'P(S/E)' and 'P(R)' are each raised to the matching entry of
    ``weights`` and multiplied together; 'name_at_beginning' and
    'name_in_body' are then added to that product. The global frame df_proba
    is modified in place and nothing is returned.
    '''
    proba_cols = ['P(E/R,S)', 'P(S/E)', 'P(R)']
    weighted_product = df_proba[proba_cols].pow(weights).prod(axis = 1)
    df_proba['likelihood'] = weighted_product

    name_bonus = df_proba['name_at_beginning'] + df_proba['name_in_body']
    df_proba['likelihood'] = df_proba['likelihood'] + name_bonus
    
    

In [88]:
'''Best weights found in CV'''
multiplylikelihood(weights = [ 1.16,  0.81, 0.83])

In [26]:
# df_proba[(df_proba['proba_id'] == 'p(R/S,E)') & (df_proba['sender'] =='ginger.dernehl@enron.com')].sort_values(by= 'likelihood' , ascending = False)
# Discard the candidate rows in which the receiver is the sender itself
df_proba = df_proba.drop(df_proba[(df_proba['receiver'] == df_proba['sender']) ].index)

In [29]:
submission = pd.read_csv(path_to_data + 'predictions_random.txt')

In [31]:
# For every mail of the submission template, keep the 10 candidates with the
# highest likelihood and write them as one space-separated string
for index, row in submission.iterrows():
    mail_id = row[0]
    ranked = df_proba[df_proba['mail_id'] == mail_id].sort_values(['likelihood'], ascending = False)
    best_receivers = ranked['receiver'].values[:10]
    submission.loc[index, 'recipients'] = " ".join(best_receivers)

submission.to_csv('submission_test.txt', index=False)

In [89]:

def multiplylikelihood(weights = [1,1,1,1,1]):
    '''Recompute df_proba['likelihood'] as a weighted product of five columns.

    'P(E/R,S)', 'P(S/E)', 'P(R)', 'name_at_beginning' and 'name_in_body' are
    each raised to the matching entry of ``weights`` and multiplied together.
    The global frame df_proba is modified in place; nothing is returned.

    This definition replaces the three-weight ``multiplylikelihood`` defined
    earlier in the notebook.
    '''
    feature_cols = ['P(E/R,S)', 'P(S/E)', 'P(R)','name_at_beginning', 'name_in_body']
    df_pow = df_proba[feature_cols].pow(weights)
    df_proba['likelihood'] = df_pow.prod(axis = 1)
    
# df_proba[(df_proba['proba_id'] == 'p(R/S,E)') & (df_proba['sender'] =='ginger.dernehl@enron.com')].sort_values(by= 'likelihood' , ascending = False)
# Discard the candidate rows in which the receiver is the sender itself
# (same statement as in an earlier cell of this notebook)
df_proba = df_proba.drop(df_proba[(df_proba['receiver'] == df_proba['sender']) ].index)