In [35]:
import random
import operator
import pandas as pd
import numpy as np
import time
from collections import Counter
import math

In [2]:
import nltk
import string
import re 
from nltk.corpus import stopwords
from nltk import pos_tag

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Load files

In [4]:
path_to_data = 'data/'

training = pd.read_csv(path_to_data + 'training_set.csv', sep=',', header=0)
training_info = pd.read_csv(path_to_data + 'training_info.csv', sep=',', header=0)
test = pd.read_csv(path_to_data + 'test_set.csv', sep=',', header=0)
test_info = pd.read_csv(path_to_data + 'test_info.csv', sep=',', header=0)

In [5]:
print "training      : columns", training.columns.values, "- length", len(training)
print "training_info : columns", training_info.columns.values, "- length", len(training_info)
print "test          : columns", test.columns.values, "- length", len(test)
print "test_info     : columns", test_info.columns.values, "- length", len(test_info)

training      : columns ['sender' 'mids'] - length 125
training_info : columns ['mid' 'date' 'body' 'recipients'] - length 43613
test          : columns ['sender' 'mids'] - length 125
test_info     : columns ['mid' 'date' 'body'] - length 2362


# Create some handy structures

In [243]:
# convert training set to dictionary
emails_ids_per_sender = {}
for index, series in training.iterrows():
    row = series.tolist()
    sender = row[0]
    ids = row[1:][0].split(' ')
    emails_ids_per_sender[sender] = ids

# save all unique sender names
all_senders = emails_ids_per_sender.keys()

In [245]:
# create address book with frequency information for each user
address_books = {}
i = 0
for sender, ids in emails_ids_per_sender.iteritems():
    recs_temp = []
    for my_id in ids:
        recipients = training_info[training_info['mid'] == int(my_id)]['recipients'].tolist()
        recipients = recipients[0].split(' ')
        # keep only legitimate email addresses
        recipients = [rec for rec in recipients if '@' in rec]
        recs_temp.append(recipients)
    # flatten
    recs_temp = [elt for sublist in recs_temp for elt in sublist]
    # compute recipient counts
    rec_occ = dict(Counter(recs_temp))
    # order by frequency
    sorted_rec_occ = sorted(rec_occ.items(), key=operator.itemgetter(1), reverse=True)
    # save
    address_books[sender] = sorted_rec_occ

    if i % 10 == 0:
        print i
    i += 1

0
10
20
30
40
50
60
70
80
90
100
110
120


In [246]:
# save all unique recipient names
all_recs = list(set([elt[0] for sublist in address_books.values() for elt in sublist]))
print "number of unique recipients :", len(all_recs)
print "number of unique senders :", len(all_senders)
print 'number of senders that are receivers :',len([elt for elt in all_senders if elt in all_recs])

number of unique recipients : 9779
number of unique senders : 125
number of senders that are receivers : 121


In [247]:
# save all unique user names
all_users = []
all_users.extend(all_senders)
all_users.extend(all_recs)
all_users = list(set(all_users))

print "number of unique usernames :", len(all_users)

number of unique usernames : 9783


In [248]:
# convert training set to dictionary
emails_ids_per_sender_test = {}
for index, series in test.iterrows():
    row = series.tolist()
    sender = row[0]
    ids = row[1:][0].split(' ')
    emails_ids_per_sender_test[sender] = ids

# save all unique sender names
all_senders_test = emails_ids_per_sender_test.keys()

In [249]:
set(all_senders)==set(all_senders_test)

True

# Analyse data

Number of mails sent

In [256]:
training['n_mails_sent'] = training['mids'].apply(lambda s : len(s.split(' ')))

In [257]:
training['n_mails_sent'].describe()

count     125.000000
mean      348.904000
std       515.821916
min        67.000000
25%       111.000000
50%       168.000000
75%       390.000000
max      4350.000000
Name: n_mails_sent, dtype: float64

Dates

In [13]:
min(test_info['date']),max(test_info['date'])

('2001-11-02 05:25:29', '2002-06-24 13:15:28')

In [14]:
min(training_info['date']),max(training_info['date'])

('0001-08-26 22:16:36', '2001-11-01 19:12:34')

Mails length

In [15]:
training_info['n_words']=training_info['body'].apply(lambda x : len(x.split(' ')))

In [16]:
training_info['n_words'].describe()

count    43613.000000
mean       204.850366
std        621.085870
min          1.000000
25%         24.000000
50%         70.000000
75%        184.000000
max      11062.000000
Name: n_words, dtype: float64

# Clean emails

This section aims at cleaning every email body :
- remove stopwords
- keep only nouns and adjectives
- stem words

In [19]:
def clean_text_simple(text, remove_stopwords=True, pos_filtering=True, stemming=True):
    
    punct = string.punctuation.replace('-', '')
    
    # replace tabs with white spaces
    text = text.replace('\t',' ')
    # convert to lower case
    text = text.lower()
    # remove punctuation (preserving intra-word dashes)
    text = ''.join(l for l in text if l not in punct)
    # strip extra white space
    text = re.sub(' +',' ',text)
    # strip leading and trailing white space
    text = text.strip()
    
    
    if(text.isspace() or len(text)==0):
        return []
    
    # tokenize (split based on whitespace)
    tokens = text.split(' ')
    
    if pos_filtering == True:
        # apply POS-tagging
        tagged_tokens = pos_tag(tokens)
        # retain only nouns and adjectives
        tokens_keep = []
        for i in range(len(tagged_tokens)):
            item = tagged_tokens[i]
            if (
            item[1] == 'NN' or #noun
            item[1] == 'NNS' or #noun plural
            item[1] == 'NNP' or #proper noun
            item[1] == 'NNPS' or #proper noun plural
            item[1] == 'JJ' or #adjective
            item[1] == 'JJS' or #adjective plural
            item[1] == 'JJR' #?
            ):
                tokens_keep.append(item[0])
        tokens = tokens_keep
    if remove_stopwords:
        stpwds = stopwords.words('english')
        # remove stopwords
        tokens = [token for token in tokens if token not in stpwds]
    if stemming:
        stemmer = nltk.stem.PorterStemmer()
        # apply Porter's stemmer
        tokens_stemmed = list()
        for token in tokens:
            if token=='oed' or token =='aed':
                continue
            tokens_stemmed.append(stemmer.stem(token))
        tokens = tokens_stemmed

    return(tokens)

In [None]:
training_info['body_cleaned']=0

In [None]:
#takes approximately 25min...
i = 0
t0=time.clock()
for doc in training_info['body']:
    training_info.loc[i,'body_cleaned'] = ' '.join(clean_text_simple(doc))
    if i%1000==0:
        print i, 'elapsed : ',time.clock()-t0
    i+=1

In [52]:
test_info['body_cleaned']=0

In [None]:
i = 0
t0=time.clock()
for doc in test_info['body']:
    test_info.loc[i,'body_cleaned'] = ' '.join(clean_text_simple(doc))
    if i%1000==0:
        print i, 'elapsed : ',time.clock()-t0
    i+=1

In [None]:
# Save to avoid preprocessing again
test_info[['mid','date','body_cleaned']].to_csv(path_to_data+'test_info_cleaned.csv')
training_info[['mid','date','body_cleaned','recipients']].to_csv(path_to_data+'training_info_cleaned.csv')

# Build TF-IDF features

Compute TF-IDF feature vectors from both train and test mails

In [6]:
test_info_cleaned = pd.read_csv(path_to_data+'test_info_cleaned.csv',index_col=0)
test_info_cleaned['body_cleaned'] = test_info_cleaned['body_cleaned'].replace(np.nan,'')

training_info_cleaned = pd.read_csv(path_to_data+'training_info_cleaned.csv',index_col=0)
training_info_cleaned['body_cleaned'] = training_info_cleaned['body_cleaned'].replace(np.nan,'')

In [92]:
# put all mails from train and test in one array
all_mails = pd.concat((test_info_cleaned['body_cleaned'],training_info_cleaned['body_cleaned'])).values

stpwds = stopwords.words('english')
tfidf_vectorizer = TfidfVectorizer(stop_words = stpwds)

doc_term_matrix = tfidf_vectorizer.fit_transform(all_mails)
doc_term_train = doc_term_matrix[range(len(training_info_cleaned)),:]
doc_term_test = doc_term_matrix[range(len(training_info_cleaned),len(training_info_cleaned)+len(test_info_cleaned)),:]

# sanity check
print doc_term_test.shape[0]==len(test_info_cleaned)
print doc_term_train.shape[0]==len(training_info_cleaned)

True
True


# Make prediction

For each `mail_pred` that has to be predicted :
   - Get sender and feature vector 
   - For each `mail_train` in training that was sent by sender :
       - Compute the similarity `sim` with `mail_pred`
       - For each recipient of `mail_train` :
           - add 1 to `score_freq`
           - add `sim` to `score_sim`

In [195]:
# will contain email ids, predictions for tf-idf prediction
predictions_per_sender = {}

# number of recipients to predict
k = 10

scores_freq = {}
scores_sim = {}
for index, row in test.iterrows():
    
    print index
    sender = row.tolist()[0]
    predictions = []
    
    # the possible recipients considered are the ones already in address book
    possible_recipients = [elt[0] for elt in address_books[sender]]
    
    # get IDs of the emails for which recipient prediction is needed
    ids_predict = row.tolist()[1].split(' ')
    ids_predict = [int(my_id) for my_id in ids_predict]
    
    for id_pred in ids_predict :
        
        # initialize the scores for all possible recipients to zero
        scores_freq[id_pred]={}
        scores_sim[id_pred]={}
        for rec in possible_recipients:
            scores_freq[id_pred][rec]=0
            scores_sim[id_pred][rec]=0
        
        #get the feature vector corresponding to the email to predict
        row_pred = test_info.loc[test_info['mid']==int(id_pred)].index.tolist()[0]
        vect_pred = doc_term_test[row_pred,:].todense().tolist()[0]
        
        #get the ids of mails in training sent by sender
        ids_train_sender = training.loc[training['sender']==sender,'mids'].split(' ')
        
        #for each one, compute similarity, and increase score of all recipients
        for id_train in ids_train_sender:
            
            #get the feature vector corresponding to train email
            row_train = training_info.loc[training_info['mid']==int(id_train)].index.tolist()[0]
            vect_train = doc_term_train[row_train,:].todense().tolist()[0]
            
            #compute similarity measure (vectors are already normalized)
            sim = np.dot(vect_pred,vect_train)
                
            #increase the scores of recipients of the email
            recipients = training_info[training_info['mid'] == int(id_train)]['recipients'].tolist()
            recipients = recipients[0].split(' ')
            recipients = [rec for rec in recipients if '@' in rec]
            
            for rec in recipients:
                scores_freq[id_pred][rec]=scores[id_pred][rec]+1
                scores_sim[id_pred][rec]=scores_sim[id_pred][rec]+sim

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124


In [259]:
training

Unnamed: 0,sender,mids,n_mails_sent
0,karen.buckley@enron.com,158713 158697 200301 158679 278595 298162 2002...,156
1,amr.ibrahim@enron.com,215241 3437 215640 3506 191790 3517 3520 3562 ...,87
2,andrea.ring@enron.com,270705 270706 270707 270708 270709 270710 2707...,124
3,sylvia.hu@enron.com,111444 111422 183084 111412 111347 110883 1105...,116
4,phillip.platter@enron.com,327074 327384 327385 264443 274124 274125 2741...,83
5,richard.shapiro@enron.com,119822 125344 120633 323342 323343 119762 1203...,519
6,megan.parker@enron.com,361536 361285 358251 358253 358254 358255 3582...,109
7,david.forster@enron.com,17976 17969 17967 17966 18142 17965 17955 1795...,345
8,mike.maggi@enron.com,330758 330578 287305 287306 287307 287460 2874...,84
9,justin.rostant@enron.com,396265 10683 252783 252782 10631 156898 252121...,106


In [208]:
id_to_pred = test_info['mid'].values

In [216]:
sum_freq = 0
sum_sim = 0
count = 0

for id_pred in id_to_pred:
    for rec, score_sum in scores_freq[id_pred].iteritems():
        sum_freq = sum_freq + scores_freq[id_pred][rec]
        sum_sim = sum_sim + scores_sim[id_pred][rec]
        count+=1

In [236]:
# Save the scores
import json
with open('Data/scores_freq.txt', 'w') as fp:
    json.dump(scores_freq, fp)
with open('Data/scores_sim.txt', 'w') as fp:
    json.dump(scores_sim, fp)

In [217]:
sum_freq

5318741.0

In [218]:
sum_sim

38793.541216463775

In [253]:
#Equalize the contributions of emails sent and text similarity to score
k=10
factor = sum_freq/sum_sim
predictions = {}
for id_pred in id_to_pred:
    for rec, score_sum in scores_freq[id_pred].iteritems():
        scores[id_pred][rec]=scores_freq[id_pred][rec]+factor*scores_sim[id_pred][rec]
        
    sorted_scores = sorted(scores[id_pred].items(), key=operator.itemgetter(1), reverse=True)
    predictions[id_pred]=[elt[0] for elt in sorted_scores[:k]]

In [254]:
path_to_results = 'results/'

with open(path_to_results + 'predictions_mail_similarity_tf_idf_2.txt', 'wb') as my_file:
    my_file.write('mid,recipients' + '\n')
    for id_pred, pred in predictions.iteritems():
            my_file.write(str(id_pred) + ',' + ' '.join(pred) + '\n')

# Baseline

In [250]:
# will contain email ids, predictions for random baseline, and predictions for frequency baseline
predictions_per_sender_baseline = {}

# number of recipients to predict
k = 10

for index, row in test.iterrows():
    name_ids = row.tolist()
    sender = name_ids[0]
    # get IDs of the emails for which recipient prediction is needed
    ids_predict = name_ids[1].split(' ')
    ids_predict = [int(my_id) for my_id in ids_predict]
    random_preds = []
    freq_preds = []
    # select k most frequent recipients for the user
    k_most = [elt[0] for elt in address_books[sender][:k]]
    for id_predict in ids_predict:
        # select k users at random
        random_preds.append(random.sample(all_users, k))
        # for the frequency baseline, the predictions are always the same
        freq_preds.append(k_most)
    predictions_per_sender_baseline[sender] = [ids_predict, random_preds, freq_preds]

In [None]:
# Write predictions in proper format for Kaggle
path_to_results = 'results/'

with open(path_to_results + 'predictions_random.txt', 'wb') as my_file:
    my_file.write('mid,recipients' + '\n')
    for sender, preds in predictions_per_sender_baseline.iteritems():
        ids = preds[0]
        random_preds = preds[1]
        for index, my_preds in enumerate(random_preds):
            my_file.write(str(ids[index]) + ',' + ' '.join(my_preds) + '\n')

with open(path_to_results + 'predictions_frequency.txt', 'wb') as my_file:
    my_file.write('mid,recipients' + '\n')
    for sender, preds in predictions_per_sender_baseline.iteritems():
        ids = preds[0]
        freq_preds = preds[2]
        for index, my_preds in enumerate(freq_preds):
            my_file.write(str(ids[index]) + ',' + ' '.join(my_preds) + '\n')