In [4]:
import numpy as np
import pandas as pd
from ast import literal_eval
from time import time
from collections import defaultdict
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
df = pd.read_csv('data/train.csv').sort_values('date')
df['recipient_id'] = df['recipient_id'].apply(literal_eval)

In [6]:
df.head()

Unnamed: 0,sender,sender_id,mid,date,body,recipient_id,recipients
5304,enron_update@concureworkplace.com,124,47361,0001-08-26 22:16:36,The following reports have been waiting for yo...,[377],kimberly.watson@enron.com
5305,enron_update@concureworkplace.com,124,47362,0001-08-27 22:21:02,The following reports have been waiting for yo...,[377],kimberly.watson@enron.com
5306,enron_update@concureworkplace.com,124,47363,0001-08-28 22:25:35,The following reports have been waiting for yo...,[377],kimberly.watson@enron.com
5285,enron_update@concureworkplace.com,124,45909,0001-09-13 22:24:08,Employee Name: Kimberly WatsonReport Name: E...,[377],kimberly.watson@enron.com
8934,enron_update@concureworkplace.com,124,82030,0001-09-17 09:24:00,The following expense report is ready for appr...,[121],barry.tycholiz@enron.com


### Here I'll try some ideas from there:
http://ieeexplore.ieee.org/document/6273570/

In [4]:
def make_dataset(df, split_ratio=.1):
    X, y = df[['sender_id', 'date']].values, df['recipient_id'].values
    
    X_full = np.array([np.datetime64(s[1]) for s in X]).astype('int64')
    
    X_lol, y_lol = defaultdict(lambda: []), defaultdict(lambda: [])
    recipient_indexes = []
    for i in range(len(X)):
        X_lol[X[i][0]].append(X_full[i])
        y_lol[X[i][0]].append(y[i])
        
    X_train, X_test, y_train, y_test = [], [], [], []
    
    for i in range(125):
        cur_X = np.array(X_lol[i])
        train_size = len(cur_X) - int(.1*len(cur_X))
        X_train.append(cur_X[:train_size])
        X_test.append(cur_X[train_size:])
        cur_y = np.array(y_lol[i])
        y_train.append(cur_y[:train_size])
        y_test.append(cur_y[train_size:])
    
    return X_train, X_test, y_train, y_test, recipient_indexes

In [5]:
from time import time
start = time()
X_train, X_test, y_train, y_test, recipient_indexes = make_dataset(df)
print time() - start

0.07044506073


In [6]:
def predict_sender(X_train, y_train, X_test, l=1.5):
    n_senders, n_people = 125, 9874
    y_pred = []
    for i in range(len(X_test)):
        graph = np.zeros(n_people)
        for j in range(len(X_train)):
            for r in y_train[j]:
                graph[r] += (X_test[i] - X_train[j])**(-l)
        y_pred.append(np.argsort(graph)[::-1][:10])
    return y_pred

def predict(X_train, y_train, X_test, l=1.5, verbose=False):
    y_pred = [[] for i in range(125)]
    for s in range(125):
        if verbose:
            print s
        y_pred[s] = predict_sender(X_train[s], y_train[s], X_test[s], l)
    return y_pred

In [7]:
def ap(recommanded, real):
    real_set = set(real)
    cur = 0.
    n = len(recommanded)
    ans = 0.
    for k in range(1, n+1):
        if recommanded[k-1] in real_set:
            cur += 1
            ans += cur/k
    return ans/min(n, len(real))

def MAP(recommanded, real):
    ans = 0.
    for i in range(len(recommanded)):
        ans += ap(recommanded[i], real[i])
    return ans/len(recommanded)

In [8]:
def score_sender(y_pred, y_test):
    score = 0.
    for j in range(len(y_pred)):
        score += ap(y_pred[j], y_test[j])
    return score

def score(y_pred, y_test):
    scores = np.empty(125, dtype=float)
    c = 0
    for s in range(125):
        c += len(X_test[s])
        for j in range(len(y_test[s])):
            scores[s] += ap(y_pred[s][j], y_test[s][j])
    return np.sum(scores)/c, scores/c

In [9]:
lambdas = np.empty(125)
for s in range(125):
    best_score = 0.
    best_l = 0.
    for l in np.linspace(0, 10, 21):
        score = score_sender(predict_sender(X_train[s], y_train[s], X_test[s], l), y_test[s])
        if(best_score < score):
            best_score = score
            best_l = l
            
    for l in np.linspace(0, 3, 21):
        score = score_sender(predict_sender(X_train[s], y_train[s], X_test[s], l), y_test[s])
        if(best_score < score):
            best_score = score
            best_l = l
    print("Best l for {}: {}".format(s, best_l))
    lambdas[s] = best_l



Best l for 0: 0.5
Best l for 1: 9.5
Best l for 2: 1.2
Best l for 3: 1.35
Best l for 4: 1.5
Best l for 5: 0.75
Best l for 6: 1.35
Best l for 7: 0.0
Best l for 8: 0.0
Best l for 9: 0.5
Best l for 10: 1.5
Best l for 11: 3.0
Best l for 12: 5.5
Best l for 13: 6.0
Best l for 14: 4.0
Best l for 15: 0.15
Best l for 16: 0.6
Best l for 17: 0.6
Best l for 18: 2.4
Best l for 19: 0.5
Best l for 20: 0.3
Best l for 21: 1.0
Best l for 22: 2.5
Best l for 23: 3.5
Best l for 24: 1.35
Best l for 25: 0.5
Best l for 26: 0.15
Best l for 27: 5.5
Best l for 28: 1.0
Best l for 29: 1.35
Best l for 30: 0.6
Best l for 31: 1.5
Best l for 32: 0.0
Best l for 33: 0.5
Best l for 34: 2.5
Best l for 35: 0.0
Best l for 36: 0.5
Best l for 37: 0.0
Best l for 38: 0.0
Best l for 39: 0.6
Best l for 40: 4.5
Best l for 41: 0.15
Best l for 42: 2.0
Best l for 43: 0.15
Best l for 44: 0.9
Best l for 45: 1.95
Best l for 46: 0.6
Best l for 47: 10.0
Best l for 48: 4.0
Best l for 49: 1.0
Best l for 50: 0.5
Best l for 51: 1.0
Best l for 

In [138]:
def predict2(X_train, y_train, X_test, l, verbose=False):
    y_pred = [[] for i in range(125)]
    for s in range(125):
        if verbose:
            print s
        y_pred[s] = predict_sender(X_train[s], y_train[s], X_test[s], l[s])
    return y_pred

In [139]:
y_pred = predict2(X_train, y_train, X_test, lambdas, verbose=False)



In [140]:
scor, scores = score(y_pred, y_test)
scor

TypeError: 'float' object is not callable

In [18]:
df_test = pd.read_csv('data/test.csv').sort_values('date')
X_test2 = df_test.values

In [19]:
X, y = df[['sender_id', 'date']].values, df['recipient_id'].values
    
X_full = np.array([np.datetime64(s[1]) for s in X]).astype('int64')
    
X_train, y_train = defaultdict(lambda: []), defaultdict(lambda: [])
recipient_indexes = []
for i in range(len(X)):
    X_train[X[i][0]].append(X_full[i])
    y_train[X[i][0]].append(y[i])

In [20]:
y_pred2 = np.zeros((len(X_test2), 10))
for i in range(len(X_test2)):
    s = X_test2[i][1]
    graph = np.zeros(9874)
    for j in range(len(X_train[s])):
        for r in y_train[s][j]:
            graph[r] += (np.datetime64(X_test2[i][3]).astype('int64') - X_train[s][j])**(-lambdas[s])
    y_pred2[i] = np.argsort(graph)[::-1][:10]

In [21]:
recipient_ids = {}
for l in df[['recipient_id', 'recipients']].values:
    a = l[1].split()
    for i in range(len(a)):
        recipient_ids[l[0][i]] = a[i]
        
with open('data/sub_memory_model_1.txt', 'w') as f:
    f.write('mid,recipients\n')
    mids = df_test['mid'].values
    for i in range(len(y_pred2)):
        f.write('{},'.format(mids[i]))
        for r in y_pred2[i]:
            f.write(recipient_ids[r] + ' ')
        f.write('\n')

In [22]:
len(X_test2)

2362

In [24]:
df.head()

Unnamed: 0,sender,sender_id,mid,date,body,recipient_id,recipients
5304,enron_update@concureworkplace.com,124,47361,0001-08-26 22:16:36,The following reports have been waiting for yo...,[377],kimberly.watson@enron.com
5305,enron_update@concureworkplace.com,124,47362,0001-08-27 22:21:02,The following reports have been waiting for yo...,[377],kimberly.watson@enron.com
5306,enron_update@concureworkplace.com,124,47363,0001-08-28 22:25:35,The following reports have been waiting for yo...,[377],kimberly.watson@enron.com
5285,enron_update@concureworkplace.com,124,45909,0001-09-13 22:24:08,Employee Name: Kimberly WatsonReport Name: E...,[377],kimberly.watson@enron.com
8934,enron_update@concureworkplace.com,124,82030,0001-09-17 09:24:00,The following expense report is ready for appr...,[121],barry.tycholiz@enron.com


In [29]:
df_test.values[df_test['sender'].values == 'enron_update@concureworkplace.com']

array([['enron_update@concureworkplace.com', 124, 79469,
        '2001-11-13 07:31:36',
        'The following expense report is ready for approval:Employee Name: Zachary D. CostelloStatus last changed by:  Automated AdministratorExpense Report Name: ZCReport Total: $65.40Amount Due Employee: $65.40To approve this expense report, click on the following link for Concur Expense.http://expensexms.enron.com'],
       ['enron_update@concureworkplace.com', 124, 79487,
        '2001-11-15 07:09:40',
        'The following expense report is ready for approval:Employee Name: Matthew  . CommonsStatus last changed by:  Automated AdministratorExpense Report Name: MC111501Report Total: $104.62Amount Due Employee: $104.62To approve this expense report, click on the following link for Concur Expense.http://expensexms.enron.com'],
       ['enron_update@concureworkplace.com', 124, 50379,
        '2001-11-16 16:44:15',
        'The following expense report is ready for approval:Employee Name: Susan J. M

In [32]:
len(df.values[df['sender'].values == 'enron_update@concureworkplace.com'])

334

In [17]:
df[['body', 'recipients']].values

array([[ 'The following reports have been waiting for your approval for more than 4 days.  Please review.Owner: Lorraine LindbergReport Name: May-Aug01Days In Mgr. Queue: 16',
        'kimberly.watson@enron.com'],
       [ 'The following reports have been waiting for your approval for more than 4 days.  Please review.Owner: Lorraine LindbergReport Name: May-Aug01Days In Mgr. Queue: 17',
        'kimberly.watson@enron.com'],
       [ 'The following reports have been waiting for your approval for more than 4 days.  Please review.Owner: Lorraine LindbergReport Name: May-Aug01Days In Mgr. Queue: 18',
        'kimberly.watson@enron.com'],
       ..., 
       [ 'Here are my thoughts.  I have to be honest, I m struggling with the Transco / RTO split.Jim',
        'janel.guerrero@enron.com l..nicolay@enron.com sarah.novosel@enron.com'],
       [ 'Luiz --I don t think that I ll send this given the current direction of the PUC to move forward with TX.  I do think that these points are important 

In [20]:
lambdas = []
with open('lol', 'r') as f:
    for l in f:
        lambdas.append(float(l.split()[-1]))

In [23]:
np.savetxt('lambdas.txt', lambdas)