In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from collections import defaultdict
from time import time
import scipy
%matplotlib inline

In [34]:
df = pd.read_csv('data/train.csv').sort_values('date')
df['recipient_id'] = df['recipient_id'].apply(literal_eval)
X, X_time, y = df[['sender_id', 'body', 'date']].values, df['date'].values, df['recipient_id'].values

df_test = pd.read_csv('data/test.csv').sort_values('date')
X2 = df_test[['sender_id', 'body', 'date', 'mid']].values

In [35]:
# Removing digits from the corpus
for i in range(len(X)):
    X[i][1] = ''.join([d for d in X[i][1] if not d.isdigit()])
    
for i in range(len(X2)):
    X2[i][1] = ''.join([d for d in X2[i][1] if not d.isdigit()])

In [36]:
n_neighbors = 70
X_senders = [[] for i in range(125)]
y_senders = [[] for i in range(125)]
X_senders2 = [[] for i in range(125)]
test_mids = [[] for i in range(125)]
X_time = [[] for i in range(125)]
X_time2 = [[] for i in range(125)]
X_train_text = [[] for i in range(125)]
X_test_text = [[] for i in range(125)]

vect = TfidfVectorizer(min_df=10)
X_body_train = vect.fit_transform(X[:, 1])
X_body_test = vect.transform(X2[:, 1])

norm1 = scipy.sparse.linalg.norm(X_body_train, axis=1)
for i in range(len(norm1)):
    if norm1[i] == 0.:
        norm1[i] = 1.
X_body_train = X_body_train.multiply(scipy.sparse.csr_matrix(1./norm1.reshape(-1, 1)))

norm2 = scipy.sparse.linalg.norm(X_body_test, axis=1)
for i in range(len(norm2)):
    if norm2[i] == 0.:
        norm2[i] = 1.
X_body_test = X_body_test.multiply(scipy.sparse.csr_matrix(1./norm2.reshape(-1, 1)))

for i in range(len(X)):
    X_senders[X[i][0]].append(i)
    X_time[X[i][0]].append(np.datetime64(X[i][2]))
    y_senders[X[i][0]].append(y[i])
    X_train_text[X[i][0]].append(X[i][1])
    
for i in range(len(X2)):
    X_senders2[X2[i][0]].append(i)
    X_time2[X2[i][0]].append(np.datetime64(X2[i][2]))
    test_mids[X2[i][0]].append(X2[i][3])
    X_test_text[X2[i][0]].append(X2[i][1])
    
y_train, X_train_tf, X_test_tf= [], [], []
X_train_time, X_test_time = [], []
for s in range(125):
    X_senders[s] = X_body_train[np.array(X_senders[s]), :]
    X_senders2[s] = X_body_test[np.array(X_senders2[s]), :]
    X_time[s] = np.array(X_time[s]).astype('int64')
    X_time2[s] = np.array(X_time2[s]).astype('int64')
    
    X_train_time.append(X_time[s])
    X_test_time.append(X_time2[s])
    X_train_tf.append(X_senders[s])
    X_test_tf.append(X_senders2[s])
    
    y_train.append(y_senders[s])

In [37]:
recipient_ids = {}
for l in df[['recipient_id', 'recipients']].values:
    a = l[1].split()
    for i in range(len(a)):
        recipient_ids[l[0][i]] = a[i]
        
n_people = max(recipient_ids.keys())+1
recipient_names = []
for i in range(n_people):
    if i not in recipient_ids:
        recipient_names.append([])
        continue
    address = recipient_ids[i]
    s = address.split('@')[0].split('.')
    if len(s) == 2:
        recipient_names.append(s)
    else:
        recipient_names.append([])

In [38]:
def name_in_header(name, mail):
    mail = mail[:30].lower()
    for s in name:
        if s.lower() in mail:
            return 1.
    return 0.

In [39]:
len(y_train)

125

In [40]:
sent_by_sender = np.zeros((125, n_people))
for s in range(125):
    for l in y_train[s]:
        for r in l:
            sent_by_sender[s][r] += 1
s = np.sum(sent_by_sender, axis=1)
for i in range(len(s)):
    if s[i] == 0.:
        s[i] = 1.
sent_by_sender /= np.sum(sent_by_sender, axis=1).reshape(-1, 1)

baseline = np.argsort(sent_by_sender)[:, ::-1][:, :10]

In [43]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from time import time

n_neigbhors = 70
l = 1.5
X_train_full, y_train_full = [], []

mail_times = [[[] for j in range(n_people)] for i in range(125)]
for s in range(125):
    for j in range(len(X_train_time[s])):
        for r in y_train[s][j]:
            mail_times[s][r].append(X_train_time[s][j])

start = time()

for s in range(125):
    cosine_similarities_matrix = X_train_tf[s].dot(X_train_tf[s].transpose())
    #print cosine_similarities_matrix.diagonal()
    
    X_train, y_train_true = np.empty((0, 7)), np.empty(0)
    
    for j in range(30, len(X_train_text[s])):
        cosine_similarities = np.array(cosine_similarities_matrix[j].todense())[0]
            
        # don't forget to not take the first one, which is the mail itself and will always have a similarity equal to 1.
        closests = np.argsort(cosine_similarities)[::-1][1:n_neighbors+1]
        candidates_local_keys = {}
        cur = 0
        for m in closests:
            for r in y_train[s][m]:
                if r not in candidates_local_keys:
                    candidates_local_keys[r] = cur
                    cur += 1
        n_candidates = len(candidates_local_keys)
        
        features = np.zeros((n_candidates, 7))
        for m in closests:
            for r in y_train[s][m]:
                if r in candidates_local_keys:
                    features[candidates_local_keys[r]][0] += 1.
                    features[candidates_local_keys[r]][1] += cosine_similarities[m]
                    features[candidates_local_keys[r]][3] += X_train_time[s][j] - X_train_time[s][m]
                    features[candidates_local_keys[r]][5] += cosine_similarities[m]*(X_train_time[s][j] - X_train_time[s][m])
        
        for r in candidates_local_keys:
                features[candidates_local_keys[r]][2] = name_in_header(recipient_names[r], X_train_text[s][j])
                features[candidates_local_keys[r]][4] = sent_by_sender[s][r]
                    
                a = np.array(mail_times[s][r])
                #if X_train_time[s][j] in a[a < X_train_time[s][j]]:
                #    print 'lol', s, j, r
                features[candidates_local_keys[r]][6] = ((X_train_time[s][j] - a[a < X_train_time[s][j]])**(-l)).sum()
                if len(mail_times[s][r]) != 0:
                    features[candidates_local_keys[r]][6] /= len(mail_times[s][r])
                
        for i in range(n_candidates):
            if features[i][0] == 0.:
                features[i][0] = 1.
        
        features[:, 1] /= features[:, 0]
        features[:, 3] /= features[:, 0]
        features[:, 0] /= len(closests)
        
        X_train = np.vstack((X_train, features))
        
        y_mail = np.zeros(n_candidates)
        for r in candidates_local_keys:
            y_mail[candidates_local_keys[r]] = 1. if r in y_train[s][j] else 0.
        
        y_train_true = np.hstack((y_train_true, y_mail))
    
    X_train_full.append(X_train)
    y_train_full.append(y_train_true)
    print s, len(X_train_text[s]), len(X_train)
print time() - start

0 156 15177
1 390 19386
2 87 8146
3 124 2750
4 116 9898
5 83 2635
6 519 71638
7 109 541
8 345 104158
9 84 503
10 140 3787
11 70 2234
12 97 11822
13 405 13648
14 300 60221
15 104 9477
16 283 12348
17 606 24151
18 350 29593
19 101 14327
20 169 44811
21 523 32646
22 2473 163416
23 1458 52935
24 522 41401
25 756 57809
26 167 6683
27 109 7046
28 444 21307
29 526 9996
30 484 31675
31 425 16564
32 331 952
33 88 6488
34 347 21513
35 273 26720
36 191 7357
37 398 22655
38 77 47
39 463 42342
40 79 982
41 88 3076
42 128 5501
43 192 8440
44 252 12607
45 407 16646
46 159 8495
47 148 5503
48 308 37778
49 159 29895
50 578 28647
51 104 9057
52 180 7150
53 67 2246
54 792 46070
55 690 32117
56 84 5060
57 201 171
58 164 9865
59 1099 66999
60 107 5017
61 106 8254
62 98 10407
63 153 12436
64 934 57817
65 118 1267
66 1677 163920
67 693 57107
68 258 13340
69 150 4621
70 138 6898
71 83 4617
72 106 4176
73 147 16018
74 92 62
75 122 4817
76 131 15255
77 211 7054
78 180 11386
79 150 5812
80 73 1711
81 168 5688
82

In [44]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from time import time

start = time()

n_neigbhors = 70
l = 1.5
X_test_full, starting_ids, keys_to_rs = [], [], []

for s in range(125):
    cosine_similarities_matrix = X_test_tf[s].dot(X_train_tf[s].transpose())
    y_pred_sender = np.empty((len(X_test_text[s]), 10))
    
    X_test_sender = np.empty((0, 7))
    sender_starting_ids = []
    sender_keys_to_r = []

    for j in range(len(X_test_text[s])):
        sender_starting_ids.append(len(X_test_sender))
        cosine_similarities = np.array(cosine_similarities_matrix[j].todense())[0]

        closests = np.argsort(cosine_similarities)[::-1][:n_neighbors]

        candidates_local_keys = {}
        cur = 0
        for m in closests:
            for r in y_train[s][m]:
                if r not in candidates_local_keys:
                    candidates_local_keys[r] = cur
                    cur += 1
        n_candidates = len(candidates_local_keys)

        features = np.zeros((n_candidates, 7))
        for m in closests:
            for r in y_train[s][m]:
                if r in candidates_local_keys:
                    features[candidates_local_keys[r]][0] += 1.
                    features[candidates_local_keys[r]][1] += cosine_similarities[m]
                    features[candidates_local_keys[r]][3] += X_test_time[s][j] - X_train_time[s][m]
                    features[candidates_local_keys[r]][5] += cosine_similarities[m]*(X_test_time[s][j] - X_train_time[s][m])

        for r in candidates_local_keys:
                features[candidates_local_keys[r]][2] = name_in_header(recipient_names[r], X_test_text[s][j])
                features[candidates_local_keys[r]][4] = sent_by_sender[s][r]
                
                a = np.array(mail_times[s][r])
                #if X_train_time[s][j] in a[a < X_train_time[s][j]]:
                #    print 'lol', s, j, r
                features[candidates_local_keys[r]][6] = ((X_test_time[s][j] - a[a < X_test_time[s][j]])**(-l)).sum()
                if len(mail_times[s][r]) != 0:
                    features[candidates_local_keys[r]][6] /= len(mail_times[s][r])

        for i in range(n_candidates):
            if features[i][0] == 0.:
                features[i][0] = 1.

        features[:, 1] /= features[:, 0]
        features[:, 3] /= features[:, 0]
        features[:, 0] /= len(closests)

        X_test_sender = np.vstack((X_test_sender, features))
        
        keys_to_r = {candidates_local_keys[r]:r for r in candidates_local_keys}
        sender_keys_to_r.append(keys_to_r)
    
    X_test_full.append(X_test_sender)
    sender_starting_ids.append(len(X_test_sender))
    starting_ids.append(sender_starting_ids)
    keys_to_rs.append(sender_keys_to_r)
    print s
print time() - start

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
8.21510601044


In [45]:
y_pred = []
max_leaf_nodes = 40
n_estimators = 500
for s in range(125):
    clf = RandomForestClassifier(n_estimators=n_estimators, max_leaf_nodes=max_leaf_nodes, n_jobs=-1).fit(X_train_full[s], y_train_full[s])
        
    y_pred_sender = np.empty((len(X_test_text[s]), 10))
        
    if len(X_test_text[s]) == 0:
            pass
    elif clf.n_classes_ == 1:
        y_pred_sender = np.array([baseline[s] for i in range(len(X_test_text[s]))])
    else:
        
        raw_pred = clf.predict_proba(X_test_full[s])[:, 1]
            
        for j in range(len(X_test_text[s])):
            pre = list(raw_pred[starting_ids[s][j]:starting_ids[s][j+1]].argsort()[::-1][:10])
            for i in range(len(pre)):
                pre[i] = keys_to_rs[s][j][pre[i]]

            # if we don't have enough candidates, fill with baseline
            cur = 0
            while len(pre) < 10:
                if baseline[s][cur] not in keys_to_rs[s][j].values():
                    pre.append(baseline[s][cur])
                cur += 1
            y_pred_sender[j] = np.array(pre)
    y_pred.append(y_pred_sender)

In [46]:
recipient_ids = {}
for l in df[['recipient_id', 'recipients']].values:
    a = l[1].split()
    for i in range(len(a)):
        recipient_ids[l[0][i]] = a[i]
        
with open('data/sub_tfidf_knn_dates_two_more_features_RF_1.txt', 'w') as f:
    f.write('mid,recipients\n')
    for s in range(125):
        for i in range(len(y_pred[s])):
            f.write('{},'.format(test_mids[s][i]))
            for r in y_pred[s][i]:
                f.write(recipient_ids[r] + ' ')
            f.write('\n')

When augmenting number of trees from 50 to 200 and reducing number of leaf nodes from 50 to 40 the score got higher. Which could we were overfiting or that we just needed more trees.

Using xgboost might be the way to pass the 0.4 bar !

### Let's try some validation

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from collections import defaultdict
from time import time
import scipy
%matplotlib inline

In [2]:
df = pd.read_csv('data/train.csv').sort_values('date')
df['recipient_id'] = df['recipient_id'].apply(literal_eval)

train_size = len(df) - int((.1*len(df)))

X, X_time, y = df[['sender_id', 'body', 'date']].values[:train_size], df['date'].values[:train_size], df['recipient_id'].values[:train_size]

X2 = df[['sender_id', 'body', 'date', 'mid']].values[train_size:]

y_final_test = [[] for i in range(125)]
A = df['recipient_id'].values[train_size:]
for i in range(len(A)):
    y_final_test[X2[i][0]].append(A[i])

In [29]:
# Removing digits from the corpus
for i in range(len(X)):
    X[i][1] = ''.join([d for d in X[i][1] if not d.isdigit()])
    
for i in range(len(X2)):
    X2[i][1] = ''.join([d for d in X2[i][1] if not d.isdigit()])
    
n_neighbors = 70
X_senders = [[] for i in range(125)]
y_senders = [[] for i in range(125)]
X_senders2 = [[] for i in range(125)]
test_mids = [[] for i in range(125)]
X_time = [[] for i in range(125)]
X_time2 = [[] for i in range(125)]
X_train_text = [[] for i in range(125)]
X_test_text = [[] for i in range(125)]

vect = TfidfVectorizer(min_df=10)
X_body_train = vect.fit_transform(X[:, 1])
X_body_test = vect.transform(X2[:, 1])

norm1 = scipy.sparse.linalg.norm(X_body_train, axis=1)
for i in range(len(norm1)):
    if norm1[i] == 0.:
        norm1[i] = 1.
X_body_train = X_body_train.multiply(scipy.sparse.csr_matrix(1./norm1.reshape(-1, 1)))

norm2 = scipy.sparse.linalg.norm(X_body_test, axis=1)
for i in range(len(norm2)):
    if norm2[i] == 0.:
        norm2[i] = 1.
X_body_test = X_body_test.multiply(scipy.sparse.csr_matrix(1./norm2.reshape(-1, 1)))

for i in range(len(X)):
    X_senders[X[i][0]].append(i)
    X_time[X[i][0]].append(np.datetime64(X[i][2]))
    y_senders[X[i][0]].append(y[i])
    X_train_text[X[i][0]].append(X[i][1])
    
for i in range(len(X2)):
    X_senders2[X2[i][0]].append(i)
    X_time2[X2[i][0]].append(np.datetime64(X2[i][2]))
    test_mids[X2[i][0]].append(X2[i][3])
    X_test_text[X2[i][0]].append(X2[i][1])
    
y_train, X_train_tf, X_test_tf= [], [], []
X_train_time, X_test_time = [], []
for s in range(125):
    X_senders[s] = X_body_train[np.array(X_senders[s]), :]
    X_senders2[s] = X_body_test[np.array(X_senders2[s]), :]
    X_time[s] = np.array(X_time[s]).astype('int64')
    X_time2[s] = np.array(X_time2[s]).astype('int64')
    
    X_train_time.append(X_time[s])
    X_test_time.append(X_time2[s])
    X_train_tf.append(X_senders[s])
    X_test_tf.append(X_senders2[s])
    
    y_train.append(y_senders[s])
    
    
    
recipient_ids = {}
for l in df[['recipient_id', 'recipients']].values:
    a = l[1].split()
    for i in range(len(a)):
        recipient_ids[l[0][i]] = a[i]
        
n_people = max(recipient_ids.keys())+1
recipient_names = []
for i in range(n_people):
    if i not in recipient_ids:
        recipient_names.append([])
        continue
    address = recipient_ids[i]
    s = address.split('@')[0].split('.')
    if len(s) == 2:
        recipient_names.append(s)
    else:
        recipient_names.append([])
        
def name_in_header(name, mail):
    mail = mail[:30].lower()
    for s in name:
        if s.lower() in mail:
            return 1.
    return 0.


sent_by_sender = np.zeros((125, n_people))
for s in range(125):
    for l in y_train[s]:
        for r in l:
            sent_by_sender[s][r] += 1
s = np.sum(sent_by_sender, axis=1)
for i in range(len(s)):
    if s[i] == 0.:
        s[i] = 1.
sent_by_sender /= np.sum(sent_by_sender, axis=1).reshape(-1, 1)

baseline = np.argsort(sent_by_sender)[:, ::-1][:, :10]

from time import time

n_neigbhors = 70
l = 1.5
X_train_full, y_train_full = [], []

start = time()

mail_times = [[[] for j in range(n_people)] for i in range(125)]
for s in range(125):
    for j in range(len(X_train_time[s])):
        for r in y_train[s][j]:
            mail_times[s][r].append(X_train_time[s][j])

for s in range(125):
    cosine_similarities_matrix = X_train_tf[s].dot(X_train_tf[s].transpose())
    #print cosine_similarities_matrix.diagonal()
    
    X_train, y_train_true = np.empty((0, 7)), np.empty(0)
    
    for j in range(30, len(X_train_text[s])):
        cosine_similarities = np.array(cosine_similarities_matrix[j].todense())[0]
            
        # don't forget to not take the first one, which is the mail itself and will always have a similarity equal to 1.
        closests = np.argsort(cosine_similarities)[::-1][1:n_neighbors+1]
        candidates_local_keys = {}
        cur = 0
        for m in closests:
            for r in y_train[s][m]:
                if r not in candidates_local_keys:
                    candidates_local_keys[r] = cur
                    cur += 1
        n_candidates = len(candidates_local_keys)
        
        features = np.zeros((n_candidates, 7))
        for m in closests:
            for r in y_train[s][m]:
                if r in candidates_local_keys:
                    features[candidates_local_keys[r]][0] += 1.
                    features[candidates_local_keys[r]][1] += cosine_similarities[m]
                    features[candidates_local_keys[r]][3] += X_train_time[s][j] - X_train_time[s][m]
                    features[candidates_local_keys[r]][5] += cosine_similarities[m]*(X_train_time[s][j] - X_train_time[s][m])
        
        for r in candidates_local_keys:
                features[candidates_local_keys[r]][2] = name_in_header(recipient_names[r], X_train_text[s][j])
                features[candidates_local_keys[r]][4] = sent_by_sender[s][r]
                    
                a = np.array(mail_times[s][r])
                #if X_train_time[s][j] in a[a < X_train_time[s][j]]:
                #    print 'lol', s, j, r
                features[candidates_local_keys[r]][6] = ((X_train_time[s][j] - a[a < X_train_time[s][j]])**(-l)).sum()
                if len(mail_times[s][r]) != 0:
                    features[candidates_local_keys[r]][6] /= len(mail_times[s][r])
                
        for i in range(n_candidates):
            if features[i][0] == 0.:
                features[i][0] = 1.
        
        features[:, 1] /= features[:, 0]
        features[:, 3] /= features[:, 0]
        features[:, 0] /= len(closests)
        
        X_train = np.vstack((X_train, features))
        
        y_mail = np.zeros(n_candidates)
        for r in candidates_local_keys:
            y_mail[candidates_local_keys[r]] = 1. if r in y_train[s][j] else 0.
        
        y_train_true = np.hstack((y_train_true, y_mail))
    
    X_train_full.append(X_train)
    y_train_full.append(y_train_true)
    print s, len(X_train_text[s]), len(X_train)
print time() - start

0 97 11313
1 343 16634
2 75 6534
3 122 2711
4 103 10195
5 80 2540
6 382 50753
7 102 432
8 338 101477
9 58 222
10 122 2437
11 65 1255
12 83 9613
13 377 12408
14 284 57382
15 40 1139
16 156 6475
17 522 21480
18 327 26650
19 89 11945
20 162 42759
21 513 32081
22 2408 158503
23 1441 51977
24 508 40243
25 683 53136
26 147 6067
27 106 6913
28 400 19435
29 432 3635
30 450 29796
31 399 15585
32 156 133
33 71 2732
34 339 20935
35 260 24628
36 172 6662
37 336 18691
38 77 47
39 368 32333
40 45 254
41 78 2442
42 103 5233
43 165 6874
44 223 11442
45 379 15612
46 103 4431
47 134 4806
48 271 38965
49 145 26633
50 499 24537
51 94 7859
52 161 6413
53 57 1555
54 754 44453
55 548 24648
56 72 3857
57 191 161
58 135 7769
59 1010 60677
60 92 4371
61 105 8026
62 98 10412
63 150 12426
64 842 51205
65 114 1222
66 1390 138814
67 563 44164
68 234 12263
69 115 3165
70 131 6134
71 57 2202
72 102 3964
73 142 15443
74 68 38
75 119 4525
76 79 8232
77 184 6016
78 166 10523
79 138 5109
80 56 959
81 155 5270
82 120 2090

In [31]:
X_test_full, starting_ids, keys_to_rs = [], [], []

for s in range(125):
    cosine_similarities_matrix = X_test_tf[s].dot(X_train_tf[s].transpose())
    y_pred_sender = np.empty((len(X_test_text[s]), 10))
    
    X_test_sender = np.empty((0, 7))
    sender_starting_ids = []
    sender_keys_to_r = []

    for j in range(len(X_test_text[s])):
        sender_starting_ids.append(len(X_test_sender))
        cosine_similarities = np.array(cosine_similarities_matrix[j].todense())[0]

        closests = np.argsort(cosine_similarities)[::-1][:n_neighbors]

        candidates_local_keys = {}
        cur = 0
        for m in closests:
            for r in y_train[s][m]:
                if r not in candidates_local_keys:
                    candidates_local_keys[r] = cur
                    cur += 1
        n_candidates = len(candidates_local_keys)

        features = np.zeros((n_candidates, 7))
        for m in closests:
            for r in y_train[s][m]:
                if r in candidates_local_keys:
                    features[candidates_local_keys[r]][0] += 1.
                    features[candidates_local_keys[r]][1] += cosine_similarities[m]
                    features[candidates_local_keys[r]][3] += X_test_time[s][j] - X_train_time[s][m]
                    features[candidates_local_keys[r]][5] += cosine_similarities[m]*(X_test_time[s][j] - X_train_time[s][m])

        for r in candidates_local_keys:
                features[candidates_local_keys[r]][2] = name_in_header(recipient_names[r], X_test_text[s][j])
                features[candidates_local_keys[r]][4] = sent_by_sender[s][r]
                
                a = np.array(mail_times[s][r])
                #if X_train_time[s][j] in a[a < X_train_time[s][j]]:
                #    print 'lol', s, j, r
                features[candidates_local_keys[r]][6] = ((X_test_time[s][j] - a[a < X_test_time[s][j]])**(-l)).sum()
                if len(mail_times[s][r]) != 0:
                    features[candidates_local_keys[r]][6] /= len(mail_times[s][r])

        for i in range(n_candidates):
            if features[i][0] == 0.:
                features[i][0] = 1.

        features[:, 1] /= features[:, 0]
        features[:, 3] /= features[:, 0]
        features[:, 0] /= len(closests)

        X_test_sender = np.vstack((X_test_sender, features))
        
        keys_to_r = {candidates_local_keys[r]:r for r in candidates_local_keys}
        sender_keys_to_r.append(keys_to_r)
    
    X_test_full.append(X_test_sender)
    sender_starting_ids.append(len(X_test_sender))
    starting_ids.append(sender_starting_ids)
    keys_to_rs.append(sender_keys_to_r)
    print s

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124


In [32]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from time import time

def ap(recommanded, real):
    real_set = set(real)
    cur = 0.
    n = len(recommanded)
    ans = 0.
    for k in range(1, n+1):
        if recommanded[k-1] in real_set:
            cur += 1
            ans += cur/k
    return ans/min(n, len(real))

def MAP(recommanded, real):
    ans = 0.
    for i in range(len(recommanded)):
        ans += ap(recommanded[i], real[i])
    return ans/len(recommanded)

for max_leaf_nodes in [10, 20, 30, 40, 50, 60, 70]:
    start = time()
    y_pred = []
    for s in range(125):
        clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=max_leaf_nodes, n_jobs=-1).fit(X_train_full[s], y_train_full[s])
        
        y_pred_sender = np.empty((len(X_test_text[s]), 10))
        
        if len(X_test_text[s]) == 0:
               pass
        elif clf.n_classes_ == 1:
            y_pred_sender = np.array([baseline[s] for i in range(len(X_test_text[s]))])
        else:
            
            raw_pred = clf.predict_proba(X_test_full[s])[:, 1]
            
            for j in range(len(X_test_text[s])):
                pre = list(raw_pred[starting_ids[s][j]:starting_ids[s][j+1]].argsort()[::-1][:10])
                for i in range(len(pre)):
                    pre[i] = keys_to_rs[s][j][pre[i]]

                # if we don't have enough candidates, fill with baseline
                cur = 0
                while len(pre) < 10:
                    if baseline[s][cur] not in keys_to_rs[s][j].values():
                        pre.append(baseline[s][cur])
                    cur += 1
                y_pred_sender[j] = np.array(pre)
        y_pred.append(y_pred_sender)
        
        
    score = 0.
    c = 0.
    for s in range(125):
        c += len(y_pred[s])
        for j in range(len(y_pred[s])):
            score += ap(y_pred[s][j], y_final_test[s][j])

    print "Time elapsed = {}".format(time() - start)
    print "max_leaf_nodes = {}, score = {}".format(max_leaf_nodes, score/c)

Time elapsed = 378.560666084
max_leaf_nodes = 10, score = 0.37931250043
Time elapsed = 392.004215956
max_leaf_nodes = 20, score = 0.38477198702
Time elapsed = 397.803869963
max_leaf_nodes = 30, score = 0.38585974604
Time elapsed = 412.018404007
max_leaf_nodes = 40, score = 0.387989604491
Time elapsed = 409.016401052
max_leaf_nodes = 50, score = 0.386617896883


KeyboardInterrupt: 

With RF:

n_estimators = 500

max_leaf_nodes = 10, score = 0.376055204752

max_leaf_nodes = 20, score = 0.380046711475

max_leaf_nodes = 30, score = 0.381856818218

max_leaf_nodes = 40, score = 0.381058513544

max_leaf_nodes = 50, score = 0.384284125066

max_leaf_nodes = 60, score = 0.381552354931

max_leaf_nodes = 70, score = 0.382649338012

n_estimators = 200

max_leaf_nodes = 20, score = 0.379784765737

max_leaf_nodes = 30, score = 0.379737073331

max_leaf_nodes = 40, score = 0.37990337857

max_leaf_nodes = 50, score = 0.38024079325

max_leaf_nodes = 60, score = 0.378710003567

In [9]:
def ap(recommanded, real):
    real_set = set(real)
    cur = 0.
    n = len(recommanded)
    ans = 0.
    for k in range(1, n+1):
        if recommanded[k-1] in real_set:
            cur += 1
            ans += cur/k
    return ans/min(n, len(real))

def MAP(recommanded, real):
    ans = 0.
    for i in range(len(recommanded)):
        ans += ap(recommanded[i], real[i])
    return ans/len(recommanded)

In [28]:
score = 0.
c = 0.
for s in range(s):
    c += len(y_pred[s])
    for j in range(len(y_pred[s])):
        score += ap(y_pred[s][j], y_final_test[s][j])

score/c

0.3639194329296734