In [1]:
import numpy as np
import pandas as pd
from ast import literal_eval
from time import time
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the training mails and order them chronologically by the 'date' column.
df = pd.read_csv('data/train.csv')
df = df.sort_values('date')
# 'recipient_id' is read as text; literal_eval turns each cell back into a
# Python literal (a list of recipient ids, per the preview below).
df['recipient_id'] = [literal_eval(raw) for raw in df['recipient_id']]

In [3]:
# Preview the first rows after the date sort; note that the earliest entries
# carry timestamps with year 0001.
df.head()

Unnamed: 0,sender,sender_id,mid,date,body,recipient_id,recipients
5304,enron_update@concureworkplace.com,124,47361,0001-08-26 22:16:36,The following reports have been waiting for yo...,[377],kimberly.watson@enron.com
5305,enron_update@concureworkplace.com,124,47362,0001-08-27 22:21:02,The following reports have been waiting for yo...,[377],kimberly.watson@enron.com
5306,enron_update@concureworkplace.com,124,47363,0001-08-28 22:25:35,The following reports have been waiting for yo...,[377],kimberly.watson@enron.com
5285,enron_update@concureworkplace.com,124,45909,0001-09-13 22:24:08,Employee Name: Kimberly WatsonReport Name: E...,[377],kimberly.watson@enron.com
8934,enron_update@concureworkplace.com,124,82030,0001-09-17 09:24:00,The following expense report is ready for appr...,[121],barry.tycholiz@enron.com


In [4]:
from collections import defaultdict

# Average number of distinct recipients per sender.
# Rows of df.values are read positionally: column 1 is used as the sender key
# and column 5 as the iterable of that mail's recipients.
senders_recipients = defaultdict(set)
for row in df.values:
    sender, recipients = row[1], row[5]
    for recipient in recipients:
        senders_recipients[sender].add(recipient)

# Sum the sizes of the per-sender recipient sets and divide by the 125 senders.
c = sum(len(recipients) for recipients in senders_recipients.values())
print(float(c)/125)

186.328


In [5]:
from collections import defaultdict

# Number of mails per sender (column 1 of df.values is used as the index into
# the 125-slot counter), then the sum of the squared counts.
senders_counts = np.zeros(125)
for sender_key in df.values[:, 1]:
    senders_counts[sender_key] += 1

print(np.square(senders_counts).sum())

48209709.0


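It looks feasible (though somewhat slow) to train one SVM per sender: the sum of squared per-sender mail counts computed above (about 4.8e7) is a rough proxy for the total training cost, assuming training time grows roughly quadratically with the number of samples.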

### Here I implement this idea for each sender, and add the time of the mail as 4 additional features (year, month, day, hour).
http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/

We aim to build a ranking generator for each sender. For this we use pre-trained word vectors (word2vec-style; GloVe vectors in the code below) to get reasonable features, and we treat the task as a regression problem, using a one-hot encoding of the recipients as the target. Next we might try training an embedding of the recipients, which seems to be the best solution for our needs.

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import csr_matrix

def ohe(y):
    """Multi-hot encode a sequence of recipient lists.

    Parameters
    ----------
    y : sequence of iterables
        One iterable of (hashable) recipient ids per sample.

    Returns
    -------
    y_ohe : numpy.ndarray of float, shape (len(y), n_distinct_recipients)
        ``y_ohe[i, j]`` is 1. when sample ``i`` contains recipient
        ``indexes[j]`` and 0. otherwise.
    indexes : list
        Distinct recipient ids in order of first appearance; position in this
        list is the column of that recipient in ``y_ohe``.
    """
    # Give every distinct recipient a column, numbered by first appearance.
    column_of = {}
    indexes = []
    for recipients in y:
        for r in recipients:
            if r not in column_of:
                column_of[r] = len(indexes)
                indexes.append(r)

    # Fill in the indicator matrix row by row.
    y_ohe = np.zeros((len(y), len(indexes)), dtype=float)
    for row, recipients in enumerate(y):
        for r in recipients:
            y_ohe[row, column_of[r]] = 1.
    return y_ohe, indexes

def make_dataset(df, w2v, d, split_ratio=.1, min_df=10, n_senders=125):
    """Build per-sender train/test feature matrices and multi-hot targets.

    Pipeline:

    1. Every mail body is tokenized and Porter-stemmed.
    2. The stemmed bodies are turned into bag-of-words counts
       (``CountVectorizer`` with ``min_df`` and English stop words).
    3. Each mail is projected onto the word vectors: the count-weighted sum of
       the vectors of its vocabulary words (words absent from ``w2v``
       contribute a zero vector) divided by the mail's total vocabulary-word
       count (or by 1 when that count is 0).
    4. Four numeric date features are appended, parsed from characters
       ``[0:4]``, ``[5:7]``, ``[8:10]`` and ``[11:13]`` of the date string
       (year, month, day, hour for ``YYYY-MM-DD HH:...`` strings).
    5. Rows are grouped by ``sender_id``, standardized per sender, and the
       last ``split_ratio`` fraction of each sender's rows (in the row order
       of ``df``) is held out as the test set.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``sender_id``, ``body``, ``date`` and
        ``recipient_id`` (an iterable of recipient ids per row).
    w2v : mapping
        Maps a word to its vector of length ``d``.
    d : int
        Dimension of the word vectors.
    split_ratio : float, default .1
        Fraction of each sender's rows held out for testing.
    min_df : int, default 10
        Passed to ``CountVectorizer``.
    n_senders : int, default 125
        Sender ids ``0 .. n_senders-1`` are processed.

    Returns
    -------
    X_train, X_test, y_train, y_test : lists of numpy.ndarray
        One entry per sender id.  Targets are multi-hot matrices as produced
        by ``ohe``; when a sender has exactly one distinct recipient the
        target arrays are flattened to 1-D.
    recipient_indexes : list of list
        For each sender, the recipient id behind each target column.
    """
    X = df[['sender_id', 'body']].values
    X_dates = df['date'].values
    y = df['recipient_id'].values

    print("stemming...")
    stemmer = PorterStemmer()
    for i in range(len(X)):
        X[i][1] = ' '.join(map(lambda x: str(stemmer.stem(x)), word_tokenize(X[i][1])))

    print("vectorizing...")
    vect = CountVectorizer(min_df=min_df, stop_words='english')
    #vect = TfidfVectorizer(min_df=min_df, stop_words='english')
    X_body = vect.fit_transform(X[:,1])

    print("w2v...")
    voc = vect.vocabulary_
    # Per-mail total word count, with 0 replaced by 1 to avoid dividing by zero.
    coef = np.array([s if s != 0. else 1. for s in X_body.sum(axis=1)])
    # Row i holds the vector of vocabulary word i (zeros if unknown to w2v).
    translate_matrix = np.zeros((len(voc), d), dtype=float)
    for w, i in voc.items():
        if w in w2v:
            translate_matrix[i] = w2v[w]
    vectorized = X_body.dot(translate_matrix)/coef.reshape(-1, 1)

    # not sure if better to divide by coef here
    # vectorized = X_body.dot(translate_matrix)

    date_features = np.array([np.array([s[:4], s[5:7], s[8:10], s[11:13]]) for s in X_dates]).astype(float)

    X_full = np.hstack((vectorized, date_features))

    # Group feature rows and raw targets by sender id, preserving row order.
    X_lol, y_lol = defaultdict(list), defaultdict(list)
    for i in range(len(X)):
        X_lol[X[i][0]].append(X_full[i])
        y_lol[X[i][0]].append(y[i])

    print("one hot encoding...")
    recipient_indexes = []
    for i in range(n_senders):
        y_lol[i], indexes = ohe(y_lol[i])
        recipient_indexes.append(indexes)

    X_train, X_test, y_train, y_test = [], [], [], []

    for i in range(n_senders):
        # NOTE(review): the scaler is fit on all of the sender's rows, i.e.
        # including the held-out ones, so test statistics leak into training.
        cur_X = StandardScaler().fit_transform(np.array(X_lol[i]))
        # Honour split_ratio (previously a hard-coded .1 ignored the argument).
        train_size = len(cur_X) - int(split_ratio*len(cur_X))
        X_train.append(cur_X[:train_size])
        X_test.append(cur_X[train_size:])
        cur_y = np.array(y_lol[i])
        y_train.append(cur_y[:train_size])
        y_test.append(cur_y[train_size:])
        # Single-recipient senders get 1-D targets.
        if y_train[i].shape[1] == 1:
            y_train[i] = y_train[i].ravel()
            y_test[i] = y_test[i].ravel()

    return X_train, X_test, y_train, y_test, recipient_indexes

In [7]:
# Load the GloVe text file into a {token: vector} dict: on each line the first
# whitespace-separated field is the token, the remaining fields its coordinates.
w2v = {}
with open("glove.6B.50d.txt", "rb") as lines:
    for line in lines:
        fields = line.split()
        w2v[fields[0]] = np.array([float(value) for value in fields[1:]])

In [8]:
from time import time
# Build the per-sender datasets from the 50-dimensional word vectors and
# report the wall-clock time taken, in seconds.
start = time()
dataset = make_dataset(df, w2v, 50)
X_train, X_test, y_train, y_test, recipient_indexes = dataset
print(time() - start)

stemming...
vectorizing...
w2v...
one hot encoding...




141.879356861


In [9]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
def ap(recommanded, real):
    """Average precision of a ranked recommendation list.

    Parameters
    ----------
    recommanded : sequence
        Recommended items, best candidate first.
    real : iterable with a length
        The items that are actually relevant.

    Returns
    -------
    float
        Sum over every rank ``k`` (1-based) holding a relevant item of
        precision@k, divided by ``min(len(recommanded), len(real))``.
        Returns 0. when either input is empty (the original raised
        ``ZeroDivisionError`` in that case).
    """
    n = len(recommanded)
    normalizer = min(n, len(real))
    if normalizer == 0:
        # Nothing recommended or nothing relevant: no precision to average.
        return 0.

    real_set = set(real)
    hits = 0.
    ans = 0.
    for rank, item in enumerate(recommanded, 1):
        if item in real_set:
            hits += 1
            ans += hits/rank  # precision at this rank
    return ans/normalizer

In [32]:
from time import time

def mean_ap(y_true, y_hat, top_k=10):
    """Mean average precision over every mail of every sender.

    y_true / y_hat are lists with one array per sender: the multi-hot targets
    and the regression outputs.  For each mail the true recipients are the
    non-zero target columns, and the recommendation is the ``top_k`` columns
    with the highest predicted value, best first.  Single-recipient senders
    have 1-D arrays (one scalar per mail); those scalars are promoted to
    length-1 vectors.  Mails without any true recipient are skipped.
    """
    score, c = 0., 0
    for truth_rows, pred_rows in zip(y_true, y_hat):
        for truth, pred in zip(truth_rows, pred_rows):
            real = np.flatnonzero(np.atleast_1d(truth))
            if len(real) == 0:
                continue
            # argsort is ascending: reverse so the best candidate comes first.
            ranked = np.argsort(np.atleast_1d(pred))[::-1]
            recommanded = list(ranked[:top_k])
            score += ap(recommanded, real)
            c += 1
    return score/c if c else 0.


def predict_all(models, X):
    """Predict per sender, returning an empty array for senders with no rows."""
    return [model.predict(X_s) if len(X_s) else np.empty(0)
            for model, X_s in zip(models, X)]


# Fit one multi-output random forest per sender.
start = time()
models = []
for i in range(len(X_train)):
    print("{} done".format(i))
    print("Next shape is {}".format(y_train[i].shape))
    # random_state fixed so that re-running the cell reproduces the scores.
    forest = RandomForestRegressor(n_estimators=50, max_leaf_nodes=300, random_state=0)
    models.append(forest.fit(X_train[i], y_train[i]))
print(time() - start)

y_pred = predict_all(models, X_test)
print("Score on test: {}".format(mean_ap(y_test, y_pred)))

y_pred_train = predict_all(models, X_train)
print("Score on train: {}".format(mean_ap(y_train, y_pred_train)))

0 done
Next shape is (141, 544)
1 done
Next shape is (351, 123)
2 done
Next shape is (79, 152)
3 done
Next shape is (112, 57)
4 done
Next shape is (105, 351)
5 done
Next shape is (75, 63)
6 done
Next shape is (468, 394)
7 done
Next shape is (99, 10)
8 done
Next shape is (311, 841)
9 done
Next shape is (76, 11)
10 done
Next shape is (126, 67)
11 done
Next shape is (63, 56)
12 done
Next shape is (88, 183)
13 done
Next shape is (365, 143)
14 done
Next shape is (270, 399)
15 done
Next shape is (94, 134)
16 done
Next shape is (255, 111)
17 done
Next shape is (546, 200)
18 done
Next shape is (315, 219)
19 done
Next shape is (91, 211)
20 done
Next shape is (153, 354)
21 done
Next shape is (471, 266)
22 done
Next shape is (2226, 350)
23 done
Next shape is (1313, 199)
24 done
Next shape is (470, 406)
25 done
Next shape is (681, 273)
26 done
Next shape is (151, 83)
27 done
Next shape is (99, 147)
28 done
Next shape is (400, 151)
29 done
Next shape is (474, 112)
30 done
Next shape is (436, 332)
3

In [28]:
# Feature indices of the first model sorted by decreasing importance, paired
# with the importances themselves in the same order.
np.argsort(models[0].feature_importances_)[::-1], np.sort(models[0].feature_importances_)[::-1]

(array([38, 20, 47, 40, 35, 29, 18, 11, 17, 16, 24, 48, 26, 13, 12,  3, 49,
        14,  7, 46, 25, 39, 19, 27, 44,  2, 45,  8,  5,  1, 36, 34, 37, 41,
        21, 52, 23, 22, 53, 32, 43,  0, 31,  4, 15, 28,  6, 30, 10, 42,  9,
        51, 33, 50]),
 array([ 0.15898292,  0.14493459,  0.09903551,  0.04498742,  0.0439194 ,
         0.04068918,  0.03101346,  0.03048066,  0.02914683,  0.02686461,
         0.02177885,  0.02126114,  0.02085528,  0.0183163 ,  0.01774724,
         0.01701305,  0.01600744,  0.01420805,  0.01322368,  0.01249123,
         0.01205261,  0.01169876,  0.01156549,  0.01110409,  0.01085911,
         0.0100729 ,  0.00996809,  0.00964845,  0.00769202,  0.00722216,
         0.0071816 ,  0.00686154,  0.00655002,  0.00611224,  0.00500391,
         0.00453447,  0.00363094,  0.00358627,  0.00314438,  0.00299074,
         0.00298612,  0.00277851,  0.00259674,  0.00228275,  0.00226095,
         0.00222349,  0.00209709,  0.00196365,  0.0018023 ,  0.0014566 ,
         0.00143284,

In [30]:
# Standardized value of feature 51 for the first training row of sender 0.
# With 50 embedding columns followed by the 4 date columns, index 51 is the
# second date feature (characters 5-7 of the date string, i.e. the month).
X_train[0][0][51]

-1.9938843787299956

In [31]:
# Same ranking for the second model; in the output below the top three
# indices (51, 52, 53) are date columns, not embedding dimensions.
np.argsort(models[1].feature_importances_)[::-1], np.sort(models[1].feature_importances_)[::-1]

(array([51, 52, 53, 33,  9, 37, 47, 24, 26, 43, 35, 11, 30, 25, 13, 12, 44,
        23, 42,  6, 31,  5, 45, 49, 50, 46, 27, 20,  0,  8, 17, 15, 16, 34,
        41,  3, 29, 21,  2, 10, 38, 14, 28, 32,  7,  4, 36, 18, 19, 48,  1,
        39, 22, 40]),
 array([ 0.0599165 ,  0.0473159 ,  0.04363581,  0.0412366 ,  0.03679398,
         0.02962343,  0.02944725,  0.02757881,  0.0264243 ,  0.02560782,
         0.0255568 ,  0.02331099,  0.02326877,  0.02313611,  0.02225735,
         0.01979412,  0.0195791 ,  0.01943066,  0.01796719,  0.01756959,
         0.01726809,  0.01669323,  0.01625402,  0.01608755,  0.01584779,
         0.01580132,  0.01563665,  0.01558137,  0.01513603,  0.01361722,
         0.01360294,  0.01353925,  0.01345529,  0.01312137,  0.01277925,
         0.01241071,  0.01225903,  0.01194206,  0.01186344,  0.01169529,
         0.01162795,  0.01162045,  0.01155281,  0.01104398,  0.01099812,
         0.01097765,  0.01025029,  0.01004152,  0.00981778,  0.00940808,
         0.00907113,