In [22]:
import random
import operator
import pandas as pd
from collections import Counter
# Directory holding the input CSV files. File names are appended to it by plain
# string concatenation, so the trailing slash must be kept.
path_to_data = 'data/'

### Load the data files

In [63]:
# Read the four CSV files (comma-separated, first line used as the header).
# The generator is consumed left to right, so the files are read in the order listed.
training, training_info, test, test_info = (
    pd.read_csv(path_to_data + csv_name, sep=',', header=0)
    for csv_name in ('training_set.csv', 'training_info.csv',
                     'test_set.csv', 'test_info.csv')
)

### Create some handy data structures

In [34]:
# Turn the training set into a dictionary: sender -> list of email ids.
# The first column of each row is the sender, the second a space-separated
# string of ids; the ids are kept as strings here.
emails_ids_per_sender = {
    fields[0]: fields[1].split(' ')
    for fields in (series.tolist() for row_label, series in training.iterrows())
}

In [35]:
# save all unique sender names
# Materialised as a real list: identical to .keys() on Python 2, and a stable
# snapshot (rather than a live dictionary view) on Python 3.
all_senders = list(emails_ids_per_sender)

In [57]:
# create address book with frequency information for each user
# address_books maps sender -> [(recipient, count), ...], most frequent first

# Index the recipient strings by message id once, instead of filtering the
# whole training_info frame again for every single message.
# setdefault keeps the first row seen for a given mid, which matches picking
# element [0] of the filtered frame.
recipients_by_mid = {}
for mid, recipient_field in zip(training_info['mid'].tolist(),
                                training_info['recipients'].tolist()):
    recipients_by_mid.setdefault(mid, recipient_field)

address_books = {}

for i, (sender, ids) in enumerate(emails_ids_per_sender.items()):
    # recipient -> number of emails from this sender addressed to them
    rec_occ = Counter()
    for my_id in ids:
        # a message id that is absent from training_info raises KeyError here
        recipients = recipients_by_mid[int(my_id)].split(' ')
        # keep only legitimate email addresses
        rec_occ.update(rec for rec in recipients if '@' in rec)

    # order by frequency (descending) and save
    address_books[sender] = rec_occ.most_common()

    # progress indicator: one line every 10 senders
    if i % 10 == 0:
        print(i)

0
10
20
30
40
50
60
70
80
90
100
110
120


In [60]:
# Collect every distinct recipient address found in any sender's address book.
# Each address book entry is indexed at position 0 to get the address.
all_recs = list({
    entry[0]
    for book in address_books.values()
    for entry in book
})

In [9]:
# Every distinct user name, whether seen as a sender or as a recipient.
# Senders are placed before recipients ahead of de-duplication.
all_users = list(set(list(all_senders) + list(all_recs)))

### Baseline

In [16]:
# number of recipients to predict for each email
k = 10
# sender -> [email ids, random-baseline predictions, frequency-baseline predictions]
predictions_per_sender = dict()

In [17]:
for row_index, test_row in test.iterrows():
    fields = test_row.tolist()
    sender = fields[0]
    # IDs of the emails for which recipient prediction is needed
    ids_predict = [int(raw_id) for raw_id in fields[1].split(' ')]
    # the first k entries of the sender's address book, addresses only
    k_most = [entry[0] for entry in address_books[sender][:k]]
    # random baseline: an independent draw of k users for every email
    random_preds = [random.sample(all_users, k) for email_id in ids_predict]
    # frequency baseline: the prediction is the same list for every email
    freq_preds = [k_most] * len(ids_predict)
    predictions_per_sender[sender] = [ids_predict, random_preds, freq_preds]

In [92]:
# Email ids stored for the first sender in the predictions dictionary.
# list(...) is needed because dictionary views cannot be indexed on Python 3;
# on Python 2 the result is unchanged.
list(predictions_per_sender.values())[0][0]

[298389,
 332383,
 298390,
 284071,
 366982,
 81773,
 81791,
 53502,
 284078,
 285309,
 284037,
 52060,
 199873,
 81820,
 53513,
 94338,
 390529,
 267637,
 162488,
 274873]

### Write predictions in the proper format for Kaggle

In [83]:
path_to_results = 'results/'


def _write_submission(file_name, preds_index):
    """Write one submission file made of a header plus one line per email.

    file_name   -- name of the file created inside `path_to_results`
    preds_index -- position of the prediction lists inside each value of
                   `predictions_per_sender` (1 = random, 2 = frequency)

    Each data line is `<mid>,<recipients separated by single spaces>`.
    """
    lines = ['mid,recipients']
    for preds in predictions_per_sender.values():
        ids = preds[0]
        for index, my_preds in enumerate(preds[preds_index]):
            lines.append(str(ids[index]) + ',' + ' '.join(my_preds))
    payload = '\n'.join(lines) + '\n'
    # The file is opened in binary mode, so the '\n' line endings are written
    # untranslated. Binary files need bytes: a Python 2 str already is bytes,
    # a Python 3 str (or a Python 2 unicode) has to be encoded first.
    if not isinstance(payload, bytes):
        payload = payload.encode('utf-8')
    with open(path_to_results + file_name, 'wb') as my_file:
        my_file.write(payload)


_write_submission('predictions_random.txt', 1)
_write_submission('predictions_frequency.txt', 2)

In [94]:
# Preview only the first rows instead of rendering the whole frame.
test_info.head(10)

Unnamed: 0,mid,date,body
0,1577,2001-11-19 06:59:51,Note: Stocks of heating oil are very high for...
1,1750,2002-03-05 08:46:57,"Kevin Hyatt and I are going for ""sghetti"" at S..."
2,1916,2002-02-13 14:17:39,This was forwarded to me and it is funny. - Wi...
3,2094,2002-01-22 11:33:56,I will be in to and happy to assist too. I ma...
4,2205,2002-01-11 07:12:19,Thanks. I needed a morning chuckle.
5,2297,2002-01-11 14:37:19,Note: Westpath Expansion plans filed at NEBTr...
6,5300,2001-11-26 14:13:01,Here are Peggy s slides. -----Original Message...
7,5333,2001-11-19 13:44:18,Here s the information. -----Original Message-...
8,6583,2002-01-18 05:00:48,I would like to know where and how this is goi...
9,7460,2001-11-12 16:43:31,"Richard: Per Elliot s e-mail below, do you hav..."
