# Implementation of the recipient-recommendation approach described in:
http://dl.acm.org/citation.cfm?doid=2600428.2609514

In [143]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from ast import literal_eval
%matplotlib inline

In [208]:
# Load the training e-mails. The 'recipient_id' column is stored as the text
# of a Python list (e.g. "[125, 130]"), so it is parsed back with literal_eval.
# (For quick experiments, load 'data/dummy2.csv' instead of 'data/train.csv'.)
df = pd.read_csv('data/train.csv')
df['recipient_id'] = df['recipient_id'].apply(literal_eval)

# Features are (sender_id, body); the target is the list of recipient ids.
X, y = df[['sender_id', 'body']].values, df['recipient_id'].values

# train_test_split shuffles the rows. Pinning random_state makes the 70/30
# split, and therefore every number computed below, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=0)

In [209]:
# Preview the first rows of the loaded data frame.
df.head()

Unnamed: 0,sender,sender_id,mid,date,body,recipient_id,recipients
0,christian.yoder@enron.com,104,60,2000-07-25 08:14:00,Legal has been assessing the risks of doing bl...,"[125, 126, 127, 117, 128, 129, 130, 131, 132]",robert.badeer@enron.com murray.o neil@enron.co...
1,heather.dunton@enron.com,113,66,2000-08-03 02:56:00,Attached is a spreadsheet to estimate export f...,"[50, 125, 126, 127, 133, 28, 117, 134, 135, 13...",kim.ward@enron.com robert.badeer@enron.com mur...
2,janel.guerrero@enron.com,49,74,2000-08-15 05:37:00,Kevin/Bob: Here is a quick rundown on the cons...,"[125, 146, 147]",robert.badeer@enron.com john.massey@enron.com ...
3,tim.belden@enron.com,117,80,2000-08-20 14:12:00,check this out and let everyone know what s up...,"[125, 130]",robert.badeer@enron.com jeff.richter@enron.com
4,christian.yoder@enron.com,104,83,2000-08-22 08:17:00,Further to your letter to us (addressed to Mr....,"[148, 149, 125, 150]",pgillman@schiffhardin.com kamarlantes@calpx.co...


Total number of addresses is 9874.

### First, let's compute the social graph information we need

In [212]:
n_train = len(X_train)
n_senders, n_people = 125, 9874
# For the dummy data set use instead: n_senders, n_people = 1, 4

# graph[s][r] counts the training e-mails that sender s addressed to recipient r.
graph = np.zeros((n_senders, n_people))
for features, recipients in zip(X_train, y_train):
    sender = features[0]
    # The targets must come from y_train (not y): train_test_split shuffles the
    # rows, so only y_train is aligned with X_train.
    for recipient in recipients:
        graph[sender, recipient] += 1

# Number of training e-mails received by each person, over all senders.
total_received = graph.sum(axis=0)
# People who never received anything get 1 instead of 0 so that the divisions
# by total_received are always defined.
total_received[total_received == 0] = 1

# Column-normalised graph: share of r's received e-mails that came from s.
graph_n = graph / total_received

In [213]:
# Display the sender x recipient count matrix built above.
graph

array([[ 1.,  1.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [214]:
# Number of zero cells in the graph, next to its total number of cells.
print(np.sum(graph == 0.), n_senders*n_people)

(1213924, 1234250)


The graph is really sparse (as one would expect), so switching to a sparse matrix representation would be a good idea if it were much larger. Here, however, operations on the dense array are already fast enough.

### Now let's build the bag-of-words features (tokenizing, counting, ...)

In [215]:
# Show the body (column 1) of one training e-mail as a sanity check.
print(X_train[3][1])

Bunda,I will call them later this morning when they open to give me a price.. then I will call Louis Shanks to see if they will match the price.  I will let you know as soon  as I find out somethingLoveME 


In [216]:
from sklearn.feature_extraction.text import CountVectorizer

In [218]:
# binary=True records only whether a word occurs in an e-mail, not how often;
# min_df=1 keeps every word that appears in at least one training e-mail.
vect = CountVectorizer(min_df=1, binary=True)
# Learn the vocabulary on the training bodies and build the document x word matrix.
X_body = vect.fit_transform(X_train[:,1])

In [219]:
from collections import defaultdict

# Size of the vocabulary learned by the vectorizer.
n_words = X_body.shape[1]

# Per-word counters, indexed by word id and filled in by the next cell:
#   fwrs[w][s][r] -> counter keyed by sender, then recipient
#   fwr[w][r]     -> counter keyed by recipient
# Missing keys read as 0.0.
fwrs = [defaultdict(lambda: defaultdict(float)) for _ in range(n_words)]
fwr = [defaultdict(float) for _ in range(n_words)]

# fw[w]: column sums of the binary document x word matrix, i.e. the number of
# training e-mails that contain word w.
fw = np.asarray(X_body.sum(axis=0)).ravel().astype(float)

In [220]:
import itertools

# Walk over every non-zero (e-mail, word) cell of the training matrix and count,
# for each word, how often it was sent to each recipient:
#   fwrs[word][sender][recipient] and fwr[word][recipient].
# Because the matrix is binary, each e-mail contributes at most once per word.
cx = X_body.tocoo()
# itertools.izip exists only on Python 2; on Python 3 the built-in zip is
# already lazy, so pick whichever is available.
lazy_zip = getattr(itertools, "izip", zip)
for i, j in lazy_zip(cx.row, cx.col):
    sender = X_train[i][0]
    word_by_recipient = fwr[j]
    for recipient_id in y_train[i]:
        fwrs[j][sender][recipient_id] += 1
        word_by_recipient[recipient_id] += 1

In [221]:
# Mixture weights of the smoothed word likelihood:
#   l -> sender+recipient specific estimate  fwrs[w][s][r] / graph[s][r]
#   g -> recipient specific estimate         fwr[w][r] / total_received[r]
#   b -> corpus-wide background estimate     fw[w] / n_train
l, g, b = .6, .2, .2

# terms[w][s][r] holds the log of that mixture for every (word, sender,
# recipient) triple seen in training; unseen triples read as 0.
terms = [defaultdict(lambda: defaultdict(lambda: 0)) for i in range(n_words)]
for w in range(n_words):
    # The background part depends on the word only, so compute it once per word.
    background = b*fw[w]/n_train
    # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
    for s, d in fwrs[w].items():
        for r, v in d.items():
            terms[w][s][r] = np.log(l*v/max(1., graph[s][r]) + g*fwr[w][r]/total_received[r] + background)

### Fit is done, let's predict on test

We predict on only a small part of the test set, because we do not yet know how to compute the probabilities fast enough for all of it.

In [246]:
# Number of test e-mails to score (a subset, to keep the run time manageable).
n_small = 1000
# Encode their bodies with the vocabulary learned on the training set.
X_test_body = vect.transform(X_test[:n_small,1])

In [247]:
# (number of scored test e-mails, vocabulary size)
X_test_body.shape

(1000, 153808)

In [248]:
# One row per scored test e-mail, 10 recommended recipient ids per row.
# Rows start zero-filled, so any slot that is never written reads as id 0.
y_pred = np.zeros((X_test_body.shape[0], 10), dtype=int)

In [249]:
# recipient_lists[s]: set of recipient ids that sender s wrote to at least once
# in the training data (the non-zero columns of row s of the graph).
# Sized by n_senders rather than a literal 125, and built with flatnonzero
# instead of a Python loop over every (sender, person) cell.
# .tolist() keeps the ids as plain Python ints.
recipient_lists = [set(np.flatnonzero(graph[i]).tolist()) for i in range(n_senders)]

In [250]:
# Estimate the work of the scoring loop: each non-zero (e-mail, word) cell of
# the test matrix is combined with every known recipient of the e-mail's sender.
cxt = X_test_body.tocoo()
# Only the row (e-mail) index is needed, so no izip over rows and columns.
cur = sum(len(recipient_lists[X_test[i][0]]) for i in cxt.row)
cur

27009280

In [251]:
from time import time

# Score every known recipient of the sender for each test e-mail:
#   probas[i][r] = log(graph[s][r]) + sum over the words w of e-mail i of the
#                  log word likelihood for (w, s, r).
# The likelihood comes from `terms` when the (w, s, r) triple was seen in
# training; otherwise it falls back to the recipient + background mixture.
start = time()
cxt = X_test_body.tocoo()
probas = [defaultdict(lambda: 0) for i in range(n_small)]
# itertools.izip exists only on Python 2; on Python 3 the built-in zip is lazy.
lazy_zip = getattr(itertools, "izip", zip)
for i, j in lazy_zip(cxt.row, cxt.col):
    s = X_test[i][0]
    scores = probas[i]
    # Parts that do not depend on the recipient are looked up once per cell.
    background = b*fw[j]/n_train
    word_by_recipient = fwr[j]
    # .get() avoids inserting zero entries into the fitted defaultdicts
    # (plain indexing would add one entry per (word, sender, recipient) probed).
    sender_terms = terms[j].get(s)
    for r in recipient_lists[s]:
        # Initialise with the prior on first sight of r. Testing membership,
        # rather than "score == 0", cannot re-trigger if a running sum hits 0.
        if r not in scores:
            scores[r] = np.log(graph[s][r])
        term = sender_terms.get(r, 0) if sender_terms is not None else 0
        if term == 0:
            # Triple unseen in training: use the smoothed fallback likelihood.
            term = np.log(g*word_by_recipient.get(r, 0.)/total_received[r] + background)
        scores[r] += term
# Elapsed wall-clock seconds for the scoring loop.
print(time() - start)

54.7495641708


In [252]:
# Spot-check: fitted terms for word id 2 and sender 0.
terms[2][0]

defaultdict(<function __main__.<lambda>>, {})

In [253]:
# Spot-check: sender id of test e-mail 40.
X_test[40][0]

85

In [254]:
# Spot-check: recipients that sender 38 wrote to in the training data.
recipient_lists[38]

{38}

In [255]:
import operator

In [256]:
# For every scored test e-mail keep the highest-scoring recipients, best first.
top_k = y_pred.shape[1]
for i in range(n_small):
    # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
    bests = sorted(probas[i].items(), key=operator.itemgetter(1), reverse=True)[:top_k]
    # NOTE(review): when fewer than top_k candidates exist, the remaining slots
    # keep whatever value y_pred already held for them.
    for j, (recipient, _score) in enumerate(bests):
        y_pred[i][j] = recipient

In [257]:
# Number of distinct recipients the sender of test e-mail 7 wrote to in training.
(graph[X_test[7][0]] > 0).sum()

370

In [258]:
# Baseline: for each sender, the 10 recipients with the largest counts in the
# graph, most frequent first (last 10 columns of the ascending argsort, reversed).
sender_favorites = np.argsort(graph, axis=1)[:, :-11:-1]

# Recommend each test sender's favourites, ignoring the e-mail body.
test_senders = X_test[:n_small, 0].astype(int)
y_pred2 = sender_favorites[test_senders]

In [259]:
def ap(recommanded, real):
    """Average precision of one ranked recommendation list.

    Parameters
    ----------
    recommanded : sequence
        Recommended ids, best first.
    real : sequence
        Ids that are actually relevant.

    Returns
    -------
    float
        Sum of precision@k over the ranks k that hold a relevant id, divided by
        min(len(recommanded), len(real)). Returns 0.0 when either input is empty.

    Notes
    -----
    A relevant id is credited only the first time it appears. Without this,
    a list that repeats the same id (for example a zero-padded prediction row)
    would collect several hits for one recipient and could score above 1.
    """
    n = len(recommanded)
    if n == 0 or len(real) == 0:
        # Nothing to rank or nothing to find: avoid dividing by zero.
        return 0.
    real_set = set(real)
    already_hit = set()
    cur = 0.
    ans = 0.
    for k, candidate in enumerate(recommanded, 1):
        if candidate in real_set and candidate not in already_hit:
            already_hit.add(candidate)
            cur += 1
            # cur/k is the precision over the first k recommendations.
            ans += cur/k
    return ans/min(n, len(real))

def MAP(recommanded, real):
    """Mean of ap() over all queries.

    recommanded[q] is the ranked recommendation list of query q and real[q]
    its relevant ids; the q-th entries of both sequences are paired up.
    """
    n_queries = len(recommanded)
    total = sum((ap(recommanded[q], real[q]) for q in range(n_queries)), 0.)
    return total/n_queries

# Compare the two recommenders on the scored part of the test set.
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the print statement is a syntax error on Python 3.
print("With only the social graph (baseline)")
print(MAP(y_pred2, y_test[:n_small]))
print("With the social graph + Naive Bayes on word occurances")
print(MAP(y_pred, y_test[:n_small]))

With only the social graph (baseline)
0.293037162698
With the social graph + Naive Bayes on word occurances
0.417350182036
