# Proper implementation of first model to be used with cross validation

http://dl.acm.org/citation.cfm?doid=2600428.2609514

In [1]:
from sklearn.base import BaseEstimator, ClassifierMixin
from collections import defaultdict
import itertools
import operator
import numpy as np
from sklearn.preprocessing import OneHotEncoder

def ap(recommanded, real):
    real_set = set(real)
    cur = 0.
    n = len(recommanded)
    ans = 0.
    for k in range(1, n+1):
        if recommanded[k-1] in real_set:
            cur += 1
            ans += cur/k
    return ans/min(n, len(real))

def MAP(recommanded, real):
    ans = 0.
    for i in range(len(recommanded)):
        ans += ap(recommanded[i], real[i])
    return ans/len(recommanded)

class first_model(BaseEstimator, ClassifierMixin):
    def __init__(self, l=.6, g=.2, n_senders=125, n_people=9874, min_df=10):
        self.l = l
        self.g = g
        self.b = 1-l-g
        self.n_senders = n_senders
        self.n_people = n_people
        self.min_df = min_df
        self.graph = np.zeros((n_senders, n_people))
        self.vect = CountVectorizer(min_df=min_df, binary=True, stop_words='english')
        
    def fit(self, X, y):
        # Encoding the text messages (might be modified)
        X_body = self.vect.fit_transform(X[:,1])
        
        # Computing social graph
        n = len(X)
        self.n_train = n
        for i in range(n):
            sender = X[i][0]
            for recipient in y[i]:
                self.graph[sender][recipient] += 1

        self.total_received = np.sum(self.graph, axis=0)
        # if someone never received any message, we put 1 instead of 0 to avoid arthmetical errors...
        for i in range(len(self.total_received)):
            if self.total_received[i] == 0:
                self.total_received[i] = 1
        
        
        # X_body is a document-occurance bag-of-word representation of the text made by the user.
        # Computing the probabilities for the text-based part of the model
        n_words = X_body.shape[1]
        self.fwrs = [defaultdict(lambda: defaultdict(lambda: 0.)) for i in range(n_words)]
        self.fwr = [defaultdict(lambda: 0.) for i in range(n_words)]
        self.fw = np.array(np.sum(X_body, axis=0))[0].astype(float)
        
        cx = X_body.tocoo()
        for i,j in itertools.izip(cx.row, cx.col):
            for recipient_id in y[i]:
                self.fwrs[j][X[i][0]][recipient_id] += 1
                self.fwr[j][recipient_id] += 1
                
        # That's the long part
        self.terms = [defaultdict(lambda: defaultdict(lambda: 0)) for i in range(n_words)]
        for w in range(n_words):
            for s,d in self.fwrs[w].iteritems():
                for r,v in d.iteritems():
                    self.terms[w][s][r] = np.log(self.l*v/max(1., self.graph[s][r])
                                                 + self.g*self.fwr[w][r]/self.total_received[r]
                                                 + self.b*self.fw[w]/n)
                    
        self.recipient_lists = [set() for i in range(125)]
        for i in range(self.n_senders):
            for j in range(self.n_people):
                if self.graph[i][j] != 0:
                    self.recipient_lists[i].add(j)
                    
        self.best_recipients = np.argsort(self.total_received)[-10:]
        return self
        
    def predict(self, X):
        # Encoding the text messages (might be modified)
        X_body = self.vect.transform(X[:,1])
        
        cxt = X_body.tocoo()
        probas = [defaultdict(lambda: 0) for i in range(len(X))]
        # This is the longuest part O(W*n_people) = "O(a_lot*10.000)"
        for i,j in itertools.izip(cxt.row, cxt.col):
            s = X[i][0]
            for r in self.recipient_lists[s]:
                if(probas[i][r] == 0):
                    probas[i][r] = np.log(self.graph[s][r])
                if self.terms[j][s][r] == 0:
                    self.terms[j][s][r] = np.log(self.g*self.fwr[j][r]/self.total_received[r] + self.b*self.fw[j]/self.n_train)
                probas[i][r] += self.terms[j][s][r]
                
        y = np.zeros((X_body.shape[0], 10), dtype=int)
        for i in range(len(X)):
            bests = sorted(probas[i].iteritems(), key=operator.itemgetter(1), reverse=True)[:10]
            for j in range(min(len(bests), 10)):
                y[i][j] = bests[j][0]
            # If this sender sent mails to less than 10 people, we fill with the most popular recipients.
            for j in range(min(len(bests), 10), 10):
                y[i][j] = self.best_recipients[j-min(len(bests), 10)]
        return y
    
    def score(self, X, y):
        return MAP(self.predict(X), y)
    
    def get_params(self, deep):
        return {'l':self.l, 'g':self.g, 'n_senders':self.n_senders, 'n_people':self.n_people, 'min_df':self.min_df}
    
    def set_params(self, l=.6, g=.2, n_senders=125, n_people=9874, min_df=10):
        self.l = l
        self.g = g
        self.b = 1-l-g
        self.n_senders = n_senders
        self.n_people = n_people
        self.min_df = min_df

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from time import time
from sklearn.model_selection import GridSearchCV
%matplotlib inline

In [3]:
df = pd.read_csv('data/train.csv')
df['recipient_id'] = df['recipient_id'].apply(literal_eval)
X, y = df[['sender_id', 'body']].values, df['recipient_id'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1)

In [4]:
model = first_model()
start = time()
model.fit(X_train, y_train)
print time() - start

44.6635320187


In [5]:
start = time()
n = 10
y_pred = model.predict(X_test[:n])
print time() - start

0.592405080795


In [23]:
start = time()
print model.score(X_test, y_test)
print time() - start

0.455980249819
182.50704217


In [None]:
parameters = {'l':[.5, .6, .7], 'g':[.2]}
clf = GridSearchCV(first_model(), parameters, n_jobs=1, cv=10)
start = time()
clf.fit(X, y)
print time() - start()