# Flow: recommendation of new content to users

#### We develop a model to give new recommendations. The algorithm needs to present twenty new previously unseen items for each user. The baseline model given to beat throws a MAP (Mean Average Precision) of 0,014. 

We load the dataset. A data cleaning step is made in preparation for the later training step. 

In [1]:
import urllib.request
import zipfile
import pandas as pd
from datetime import datetime
import numpy as np
import tensorflow as tf

In [2]:
url = 'https://firebasestorage.googleapis.com/v0/b/z2tma61d2a74hya815w9x621uszb3a.appspot.com/o/RecoSys_dataset.zip?alt=media&token=46d5c550-0e95-44d7-83c5-02ffe948be75'
local_zip = 'RecoSys_dataset.zip'
urllib.request.urlretrieve(url, local_zip)
zip_ref = zipfile.ZipFile(local_zip, 'r')
zip_ref.extractall()
zip_ref.close()

In [3]:
train = pd.read_csv('train.csv', sep=',')
train.head()

Unnamed: 0,customer_id,account_id,device_type,asset_id,tunein,tuneout,resume
0,0,90627,STATIONARY,18332.0,2021-02-18 22:52:00.0,2021-02-18 23:35:00.0,0
1,0,90627,STATIONARY,24727.0,2021-03-24 23:17:00.0,2021-03-25 00:01:00.0,0
2,1,3387,STB,895.0,2021-03-15 10:05:00.0,2021-03-15 10:23:00.0,0
3,1,3387,STB,895.0,2021-03-15 10:23:00.0,2021-03-15 11:18:00.0,1
4,1,3387,STB,26062.0,2021-03-16 09:24:00.0,2021-03-16 09:44:00.0,0


In [4]:
train.isna().sum()

customer_id     0
account_id      0
device_type    29
asset_id       22
tunein          0
tuneout         0
resume          0
dtype: int64

In [5]:
train = train.drop('customer_id', axis=1, inplace=False)
train = train.drop('device_type', axis=1, inplace=False)
train = train.drop('resume', axis=1, inplace=False)
train = train.dropna(subset=['asset_id'], inplace=False)
train = train.reset_index(drop=True, inplace=False)
train.head()

Unnamed: 0,account_id,asset_id,tunein,tuneout
0,90627,18332.0,2021-02-18 22:52:00.0,2021-02-18 23:35:00.0
1,90627,24727.0,2021-03-24 23:17:00.0,2021-03-25 00:01:00.0
2,3387,895.0,2021-03-15 10:05:00.0,2021-03-15 10:23:00.0
3,3387,895.0,2021-03-15 10:23:00.0,2021-03-15 11:18:00.0
4,3387,26062.0,2021-03-16 09:24:00.0,2021-03-16 09:44:00.0


In [6]:
tunein = train.loc[:, ['tunein']].applymap(lambda date_time:date_time[:-2]).values.flatten()
tuneout = train.loc[:, ['tuneout']].applymap(lambda date_time:date_time[:-2]).values.flatten()
screen_time = []
for i in train.index:
    diff = datetime.strptime(tuneout[i], '%Y-%m-%d %H:%M:%S') - datetime.strptime(tunein[i], '%Y-%m-%d %H:%M:%S')
    screen_time.append(diff.total_seconds()/60)
train.loc[:,'screen_time'] = screen_time
train.head()

Unnamed: 0,account_id,asset_id,tunein,tuneout,screen_time
0,90627,18332.0,2021-02-18 22:52:00.0,2021-02-18 23:35:00.0,43.0
1,90627,24727.0,2021-03-24 23:17:00.0,2021-03-25 00:01:00.0,44.0
2,3387,895.0,2021-03-15 10:05:00.0,2021-03-15 10:23:00.0,18.0
3,3387,895.0,2021-03-15 10:23:00.0,2021-03-15 11:18:00.0,55.0
4,3387,26062.0,2021-03-16 09:24:00.0,2021-03-16 09:44:00.0,20.0


In [7]:
train = train.drop(['tunein', 'tuneout'], axis=1, inplace=False)
train.head()

Unnamed: 0,account_id,asset_id,screen_time
0,90627,18332.0,43.0
1,90627,24727.0,44.0
2,3387,895.0,18.0
3,3387,895.0,55.0
4,3387,26062.0,20.0


In [8]:
train.loc[:, ['account_id']] = train.loc[:, ['account_id']].applymap(lambda account_id:int(account_id)).values.flatten()
train.loc[:, ['asset_id']] = train.loc[:, ['asset_id']].applymap(lambda asset_id:int(asset_id)).values.flatten()
train.head()

Unnamed: 0,account_id,asset_id,screen_time
0,90627,18332,43.0
1,90627,24727,44.0
2,3387,895,18.0
3,3387,895,55.0
4,3387,26062,20.0


In [9]:
train = train.groupby(by=['account_id', 'asset_id'])['screen_time'].sum().reset_index()
train.head()

Unnamed: 0,account_id,asset_id,screen_time
0,0,6397,48.0
1,0,13056,65.0
2,0,15900,128.0
3,0,29811,79.0
4,0,29897,16.0


In [10]:
np.max(train.loc[:,'account_id'].unique())

113880

-----

Here we load the metadata provided. A data cleaning step is also made. 

In [11]:
metadata = pd.read_csv('metadata.csv', sep=';')
metadata.head()

Unnamed: 0,asset_id,content_id,title,reduced_title,episode_title,show_type,released_year,country_of_origin,category,keywords,...,language_rating,dialog_rating,fv_rating,pay_per_view,pack_premium_1,pack_premium_2,create_date,modify_date,start_vod_date,end_vod_date
0,15188,0.0,Ep:17 Tiempos Compulsivos,Tiempos_Compul_E17,Episodio 17,Serie,2012.0,AR,Drama,"Trastornos,Médicos,Tragicómica,Telenovela,Enfe...",...,N,N,N,N,N,N,2017-12-01T10:18:15.0Z,2019-01-26T06:37:18.0Z,2017-12-01T00:00:00.0Z,2020-12-01T23:59:59.0Z
1,24940,1.0,7 Cajas,7_Cajas,,Película,2012.0,PY,Suspenso/Acción,"Latinoamérica,Pobreza,Crimen,Pandillas",...,N,N,N,Y,N,N,2017-12-19T20:58:15.0Z,2019-09-17T19:02:03.0Z,2017-12-15T00:00:00.0Z,2022-12-14T23:59:59.0Z
2,21939,2.0,La Maldición de las Hormigas Gigantes,La_Maldicion_de_las,,Película,2016.0,FI,Terror/Comedia,"Criaturas,Plagas,Adolescentes,Fantasía,Video J...",...,N,N,N,N,N,N,2018-02-16T13:51:07.0Z,2020-04-28T14:16:38.0Z,2018-01-25T00:00:00.0Z,2020-12-01T23:59:59.0Z
3,9005,3.0,Una Mujer Fantástica,Una_Mujer_Fantastic,,Película,2017.0,CL,Drama,"LGBT,Mujeres,Latinoamérica",...,N,N,N,N,Y,N,2018-05-26T11:58:44.0Z,2019-11-15T03:00:23.0Z,2018-05-27T00:00:00.0Z,2021-04-30T23:59:59.0Z
4,7391,4.0,Star Trek,Star_Trek,,Película,2009.0,US,Ciencia Ficción/Aventura,"Fantasía,Galaxia,Futurismo,Aliens,Criaturas",...,N,N,N,Y,N,N,2019-05-03T20:07:24.0Z,2020-04-09T04:37:29.0Z,2019-05-02T00:00:00.0Z,2020-12-31T23:59:59.0Z


In [12]:
metadata.isna().sum()

asset_id                  0
content_id               21
title                     0
reduced_title             0
episode_title          4147
show_type                 4
released_year             0
country_of_origin         4
category                  0
keywords                  2
description               2
reduced_desc              0
cast_first_name        8732
credits_first_name    12554
run_time_min              0
audience                  1
made_for_tv               0
close_caption             0
sex_rating                0
violence_rating           0
language_rating           0
dialog_rating             0
fv_rating                 0
pay_per_view              0
pack_premium_1            0
pack_premium_2            0
create_date               0
modify_date               0
start_vod_date            0
end_vod_date              0
dtype: int64

In [13]:
metadata = metadata.loc[:,['asset_id', 'content_id', 'run_time_min', 'title']]
metadata = metadata.dropna(subset=['content_id'], inplace=False)
metadata = metadata.reset_index(drop=True, inplace=False)
metadata.head()

Unnamed: 0,asset_id,content_id,run_time_min,title
0,15188,0.0,48.0,Ep:17 Tiempos Compulsivos
1,24940,1.0,105.0,7 Cajas
2,21939,2.0,82.0,La Maldición de las Hormigas Gigantes
3,9005,3.0,99.0,Una Mujer Fantástica
4,7391,4.0,126.0,Star Trek


In [14]:
metadata.loc[:,['run_time_min']] = metadata.loc[:,['run_time_min']].applymap(lambda run_time: np.nan if run_time==0. else run_time).values.flatten()
metadata = metadata.dropna(subset=['run_time_min'], inplace=False)
metadata = metadata.reset_index(drop=True, inplace=False)
metadata.head()

Unnamed: 0,asset_id,content_id,run_time_min,title
0,15188,0.0,48.0,Ep:17 Tiempos Compulsivos
1,24940,1.0,105.0,7 Cajas
2,21939,2.0,82.0,La Maldición de las Hormigas Gigantes
3,9005,3.0,99.0,Una Mujer Fantástica
4,7391,4.0,126.0,Star Trek


In [15]:
metadata.loc[:, ['asset_id']] = metadata.loc[:, ['asset_id']].applymap(lambda account_id:int(account_id)).values.flatten()
metadata.loc[:, ['content_id']] = metadata.loc[:, ['content_id']].applymap(lambda asset_id:int(asset_id)).values.flatten()
metadata.head()

Unnamed: 0,asset_id,content_id,run_time_min,title
0,15188,0,48.0,Ep:17 Tiempos Compulsivos
1,24940,1,105.0,7 Cajas
2,21939,2,82.0,La Maldición de las Hormigas Gigantes
3,9005,3,99.0,Una Mujer Fantástica
4,7391,4,126.0,Star Trek


In [16]:
np.max(metadata.loc[:,'content_id'].unique())

4371

-----

After the cleaning process we've just made, we perform a data preprocessing step, merging all the information we have in order to get a training dataset. We use the time spent on each content per user relative to the duration of that content as the rating.

In [17]:
metadata_train = metadata.merge(right=train, how='inner', on='asset_id')
metadata_train.head()

Unnamed: 0,asset_id,content_id,run_time_min,title,account_id,screen_time
0,18689,749,57.0,T:1 Ep:03 The White Princess,97018,24.0
1,25352,118,68.0,T:1 Ep:07 Presunto Culpable,59418,15.0
2,29669,118,66.0,T:1 Ep:06 Presunto Culpable,58187,106.0
3,1639,774,30.0,T:4 Ep:05 Sex and the City,24537,4.0
4,16292,771,59.0,Ep:273 Huérfanas,9268,34.0


In [18]:
metadata_train = metadata_train.drop('asset_id', axis=1, inplace=False)
metadata_train.head()

Unnamed: 0,content_id,run_time_min,title,account_id,screen_time
0,749,57.0,T:1 Ep:03 The White Princess,97018,24.0
1,118,68.0,T:1 Ep:07 Presunto Culpable,59418,15.0
2,118,66.0,T:1 Ep:06 Presunto Culpable,58187,106.0
3,774,30.0,T:4 Ep:05 Sex and the City,24537,4.0
4,771,59.0,Ep:273 Huérfanas,9268,34.0


In [19]:
metadata_train = metadata_train.groupby(by=['content_id','account_id'])['run_time_min','screen_time'].sum().reset_index()
metadata_train.head()

Unnamed: 0,content_id,account_id,run_time_min,screen_time
0,0,1422,46.0,56.0
1,0,1431,47.0,4.0
2,0,5959,47.0,21.0
3,0,8006,543.0,134.0
4,0,9403,93.0,69.0


In [20]:
ratings_train = metadata_train.loc[:, ['screen_time']].values.flatten()/metadata_train.loc[:, ['run_time_min']].values.flatten()
metadata_train.loc[:,'ratings'] = np.minimum(ratings_train,1.)
metadata_train.head()

Unnamed: 0,content_id,account_id,run_time_min,screen_time,ratings
0,0,1422,46.0,56.0,1.0
1,0,1431,47.0,4.0,0.085106
2,0,5959,47.0,21.0,0.446809
3,0,8006,543.0,134.0,0.246777
4,0,9403,93.0,69.0,0.741935


In [21]:
metadata_train = metadata_train.loc[:,['account_id', 'content_id', 'ratings']]
metadata_train.head()

Unnamed: 0,account_id,content_id,ratings
0,1422,0,1.0
1,1431,0,0.085106
2,5959,0,0.446809
3,8006,0,0.246777
4,9403,0,0.741935


-----

Here we obtain the most popular content according to the ratings just calculated and display their titles.

In [22]:
top = list(metadata_train.groupby(by='content_id').count().sort_values(by=['account_id'], ascending=False).index)
top[:20]

[2040,
 3806,
 3900,
 4133,
 1983,
 729,
 2942,
 3210,
 3381,
 2160,
 3598,
 3384,
 1020,
 1316,
 1462,
 1877,
 4362,
 3690,
 1971,
 116]

In [23]:
for i in top[:20]:
    print(metadata.query('content_id == {}'.format(i)).title)
    print()

13944    T:1 Ep:01 This is Us
13945    T:1 Ep:02 This is Us
13946    T:1 Ep:03 This is Us
13947    T:1 Ep:04 This is Us
13948    T:1 Ep:05 This is Us
                 ...         
29778    T:5 Ep:07 This is Us
30001    T:5 Ep:08 This is Us
30213    T:5 Ep:09 This is Us
31832    T:5 Ep:10 This is Us
32532    T:5 Ep:11 This is Us
Name: title, Length: 83, dtype: object

28965    Cosa de minas
Name: title, dtype: object

29589    T:1 Ep:01 The Collapse
29590    T:1 Ep:02 The Collapse
29591    T:1 Ep:03 The Collapse
29592    T:1 Ep:04 The Collapse
29593    T:1 Ep:05 The Collapse
29594    T:1 Ep:06 The Collapse
29595    T:1 Ep:07 The Collapse
29596    T:1 Ep:08 The Collapse
Name: title, dtype: object

31098    T:1 Ep:01 El nudo
31099    T:1 Ep:02 El nudo
31168    T:1 Ep:04 El nudo
31169    T:1 Ep:07 El nudo
31170    T:1 Ep:08 El nudo
31171    T:1 Ep:09 El nudo
31172    T:1 Ep:03 El nudo
31173    T:1 Ep:05 El nudo
31174    T:1 Ep:06 El nudo
31175    T:1 Ep:10 El nudo
31176    T:1 Ep:11 El nud

-----

Here we define the dataset for training.

In [24]:
X_train = metadata_train.loc[:,['account_id', 'content_id']].values
Y_train = metadata_train.loc[:,['ratings']].values

-----

Here we define the model to train. We will use collaborative filtering.

In [25]:
class Embedding():
    
    def __init__(self, input_dim, rank):
        self.input_dim = input_dim
        self.rank = rank
        self.build()
        
    def add_weight(self, shape):
        w_init = tf.random.normal(shape=shape, mean=0.0, stddev=0.05, dtype="float32")
        return tf.Variable(initial_value=w_init, trainable=True)
        
    def build(self):
        self.w = self.add_weight(shape=(self.input_dim,self.rank))
        self.weights = [self.w]

    def __call__(self, inputs):
        idx = tf.reshape(inputs,[-1])
        return tf.gather(self.w, indices=idx, axis=0)
    
class Dot():
    
    def __init__(self):
        self.build()
        
    def build(self):
        self.weights = []
        
    def __call__(self, inputs):
        self.a1 = inputs[0]
        self.a2 = inputs[1]
        return tf.reduce_sum(self.a1*self.a2, axis=1, keepdims=True)
        
class LossFunction():
        
    def __init__(self, model, alpha):
        self.model = model
        self.alpha = alpha
            
    def __call__(self, y_true, y_pred):
        loss = tf.reduce_mean(tf.square(y_true-y_pred))
        regularization1 = tf.reduce_sum(tf.square(self.model.weights[0]))
        regularization2 = tf.reduce_sum(tf.square(self.model.weights[1]))
        return loss + self.alpha*regularization1 + self.alpha*regularization2 
    
class Optimizer():

    def __init__(self, model, learning_rate, beta_1, beta_2, epsilon):
        self.model = model
        self.learning_rate = learning_rate
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon= epsilon
        self.stop_training = False
        self.build()
        
    def add_weight(self, shape):
        w_init = tf.zeros(shape=shape, dtype="float32")
        return  tf.Variable(initial_value=w_init, trainable=False)
    
    def build(self):
        self.weights = []
        for weight in self.model.weights:
            m = self.add_weight(shape=weight.shape)
            v = self.add_weight(shape=weight.shape)
            self.weights.append([m,v])
            
    def apply(self, grads, weights):
        for i in range(len(weights)):
            w = weights[i]
            grad_w = grads[i]
            m = self.weights[i][0]
            v = self.weights[i][1]
            self.weights[i][0].assign(self.beta_1*m + (1-self.beta_1)*grad_w)  
            self.weights[i][1].assign(self.beta_2*v + (1-self.beta_2)*grad_w*grad_w)
            m_ = (1/(1-self.beta_1))*self.weights[i][0]
            v_ = (1/(1-self.beta_2))*self.weights[i][1]
            weights[i].assign(w - self.learning_rate*m_/(tf.math.sqrt(v_)+self.epsilon))
            
    def train_step(self, X, Y):
        with tf.GradientTape() as tape:
            H = self.model(X)
            loss = self.model.loss(Y, H)
        grads = tape.gradient(loss, self.model.weights)
        self.apply(grads, self.model.weights)
        H = self.model(X)
        loss = self.model.loss(Y, H)
        logs = {'loss': loss}
        return logs  
    
class Callback1():

    def __init__(self, model, verbose):
        self.model = model
        self.verbose = verbose
    
    def on_epoch_begin(self, epoch, logs=None):
        self.start_time = tf.timestamp()
        
    def on_epoch_end(self, epoch, logs=None):
        if self.verbose:
            now = tf.timestamp()
            time = now - self.start_time
            tf.print('Epochs {}/{} - Loss: {}'.format(epoch+1, self.model.epochs, logs['loss']))
            tf.print('--- {}s ---'.format(tf.round(1000*time)/1000))

class Callback2():
        
    def __init__(self, model, patience, error, reduce_factor, min_learning_rate):
        self.model = model
        self.patience = patience
        self.error = error
        self.reduce_factor = reduce_factor
        self.min_learning_rate = min_learning_rate
                        
    def on_epoch_end(self, epoch, logs):
        if epoch==0:
            self.loss = logs['loss']
            self.non_decreasing_epochs = 0
        else:
            if ((self.loss-logs['loss'])>self.error):
                self.loss = logs['loss']
                self.non_decreasing_epochs = 0
            else:
                self.non_decreasing_epochs = self.non_decreasing_epochs+1
        if (self.non_decreasing_epochs == self.patience):
            if (self.model.optimizer.learning_rate>self.min_learning_rate):
                self.model.optimizer.learning_rate = self.reduce_factor*self.model.optimizer.learning_rate
                self.non_decreasing_epochs = 0
        
class Callback3():
        
    def __init__(self, model, patience, error):
        self.model = model
        self.patience = patience
        self.error = error
        
    def on_epoch_end(self, epoch, logs):
        if epoch==0:
            self.loss = logs['loss']
            self.non_decreasing_epochs = 0
        else:
            if ((self.loss-logs['loss'])>self.error):
                self.loss = logs['loss']
                self.non_decreasing_epochs = 0
            else:
                self.non_decreasing_epochs = self.non_decreasing_epochs+1
        if (self.non_decreasing_epochs == self.patience):
            self.model.optimizer.stop_training = True
            
            
class RecommenderSystem():
    
    def __init__(self, users_dim, content_dim, rank):
        self.users_dim = users_dim
        self.content_dim = content_dim
        self.rank = rank
        self.build()
     
    def build(self):
        self.h1 = Embedding(input_dim=self.users_dim, rank=self.rank)
        self.h2 = Embedding(input_dim=self.content_dim, rank=self.rank)
        self.h3 = Dot()
        self.layers = [self.h1, self.h2, self.h3]
        self.weights = []
        for layer in self.layers:
            for weight in layer.weights:
                self.weights.append(weight)
        
    def __call__(self, inputs):
        x0 = inputs[0]
        x1 = self.h1(x0)
        z0 = inputs[1]
        z1 = self.h2(z0)
        y = self.h3([x1,z1]) 
        return y 
        
    def train_setup(self, epochs, learning_rate, alpha, beta_1, beta_2, epsilon, verbose):
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.alpha = alpha
        self.beta_1 = beta_1 
        self.beta_2 = beta_2 
        self.epsilon = epsilon
        self.verbose = verbose
        self.loss = LossFunction(model=self, alpha=self.alpha)
        self.optimizer = Optimizer(model=self, learning_rate=self.learning_rate, beta_1=self.beta_1, beta_2=self.beta_2, epsilon=self.epsilon) 
        self.callbacks = [Callback1(model=self, verbose=self.verbose),
                          Callback2(model=self, patience=100, error=0.0001, reduce_factor=0.1, min_learning_rate=0.001),
                          Callback3(model=self, patience=200, error=0.0001)]
        
    def fit(self, X, Y, epochs=1000, learning_rate=0.01, alpha=1.0, beta_1=0.9, beta_2=0.999, epsilon=1e-07, verbose=True):
        self.train_setup(epochs, learning_rate, alpha, beta_1, beta_2, epsilon,verbose)
        if verbose:
            tf.print('Train on {} samples'.format(X.shape[0]))
        for epoch in range(epochs):
            self.callbacks[0].on_epoch_begin(epoch)
            logs = self.optimizer.train_step([tf.constant(X[:,[0]], dtype="int32"), tf.constant(X[:,[1]], dtype="int32")], tf.constant(Y, dtype="float32"))
            for callback in self.callbacks:
                callback.on_epoch_end(epoch, logs)
            if self.optimizer.stop_training:
                break
        self.ratings = tf.matmul(self.weights[0],tf.transpose(self.weights[1]))
            
    def predict(self, inputs):
        return self(inputs).numpy()
        
    def evaluate(self, X, Y):
        loss = self.loss(tf.constant(Y, dtype="float32"), self(X))
        return [loss.numpy()]

In [26]:
recommender_system = RecommenderSystem(users_dim=113880+1, content_dim=4371+1, rank=32)
recommender_system.fit(X_train, Y_train)

Train on 971368 samples
Epochs 1/1000 - Loss: 0.4429590106010437
--- 1.022s ---
Epochs 2/1000 - Loss: 0.436658650636673
--- 0.52s ---
Epochs 3/1000 - Loss: 0.42687663435935974
--- 0.499s ---
Epochs 4/1000 - Loss: 0.4123416841030121
--- 0.463s ---
Epochs 5/1000 - Loss: 0.3919801712036133
--- 0.464s ---
Epochs 6/1000 - Loss: 0.365105539560318
--- 0.462s ---
Epochs 7/1000 - Loss: 0.331586092710495
--- 0.495s ---
Epochs 8/1000 - Loss: 0.29203569889068604
--- 0.506s ---
Epochs 9/1000 - Loss: 0.24803781509399414
--- 0.455s ---
Epochs 10/1000 - Loss: 0.2023719698190689
--- 0.487s ---
Epochs 11/1000 - Loss: 0.15911704301834106
--- 0.485s ---
Epochs 12/1000 - Loss: 0.123328797519207
--- 0.515s ---
Epochs 13/1000 - Loss: 0.09971373528242111
--- 0.46s ---
Epochs 14/1000 - Loss: 0.08997158706188202
--- 0.456s ---
Epochs 15/1000 - Loss: 0.09047398716211319
--- 0.453s ---
Epochs 16/1000 - Loss: 0.09371320903301239
--- 0.457s ---
Epochs 17/1000 - Loss: 0.0931812971830368
--- 0.478s ---
Epochs 18/1000

--- 0.467s ---
Epochs 141/1000 - Loss: 0.006148600019514561
--- 0.469s ---
Epochs 142/1000 - Loss: 0.006116870325058699
--- 0.494s ---
Epochs 143/1000 - Loss: 0.006085558794438839
--- 0.473s ---
Epochs 144/1000 - Loss: 0.0060546561144292355
--- 0.472s ---
Epochs 145/1000 - Loss: 0.0060241553001105785
--- 0.468s ---
Epochs 146/1000 - Loss: 0.005994046106934547
--- 0.468s ---
Epochs 147/1000 - Loss: 0.005964321084320545
--- 0.467s ---
Epochs 148/1000 - Loss: 0.005934972316026688
--- 0.47s ---
Epochs 149/1000 - Loss: 0.005905990954488516
--- 0.469s ---
Epochs 150/1000 - Loss: 0.005877370946109295
--- 0.471s ---
Epochs 151/1000 - Loss: 0.005849103908985853
--- 0.466s ---
Epochs 152/1000 - Loss: 0.005821183789521456
--- 0.472s ---
Epochs 153/1000 - Loss: 0.005793603602796793
--- 0.484s ---
Epochs 154/1000 - Loss: 0.00576635729521513
--- 0.473s ---
Epochs 155/1000 - Loss: 0.005739438347518444
--- 0.469s ---
Epochs 156/1000 - Loss: 0.005712840240448713
--- 0.472s ---
Epochs 157/1000 - Loss: 0

--- 0.475s ---
Epochs 278/1000 - Loss: 0.003816758282482624
--- 0.474s ---
Epochs 279/1000 - Loss: 0.0038074927870184183
--- 0.475s ---
Epochs 280/1000 - Loss: 0.0037982857320457697
--- 0.464s ---
Epochs 281/1000 - Loss: 0.003789137117564678
--- 0.469s ---
Epochs 282/1000 - Loss: 0.0037800457794219255
--- 0.474s ---
Epochs 283/1000 - Loss: 0.003771011484786868
--- 0.47s ---
Epochs 284/1000 - Loss: 0.003762033535167575
--- 0.482s ---
Epochs 285/1000 - Loss: 0.0037531114649027586
--- 0.478s ---
Epochs 286/1000 - Loss: 0.0037442443426698446
--- 0.472s ---
Epochs 287/1000 - Loss: 0.0037354324012994766
--- 0.47s ---
Epochs 288/1000 - Loss: 0.0037266737781465054
--- 0.478s ---
Epochs 289/1000 - Loss: 0.0037179698701947927
--- 0.481s ---
Epochs 290/1000 - Loss: 0.003709318581968546
--- 0.475s ---
Epochs 291/1000 - Loss: 0.0037007194478064775
--- 0.473s ---
Epochs 292/1000 - Loss: 0.0036921727005392313
--- 0.467s ---
Epochs 293/1000 - Loss: 0.003683677641674876
--- 0.475s ---
Epochs 294/1000 -

--- 0.471s ---
Epochs 414/1000 - Loss: 0.002928501693531871
--- 0.487s ---
Epochs 415/1000 - Loss: 0.002923879073932767
--- 0.478s ---
Epochs 416/1000 - Loss: 0.0029192755464464426
--- 0.472s ---
Epochs 417/1000 - Loss: 0.0029146906454116106
--- 0.476s ---
Epochs 418/1000 - Loss: 0.0029101248364895582
--- 0.47s ---
Epochs 419/1000 - Loss: 0.0029055781196802855
--- 0.474s ---
Epochs 420/1000 - Loss: 0.002901049330830574
--- 0.472s ---
Epochs 421/1000 - Loss: 0.0028965394012629986
--- 0.472s ---
Epochs 422/1000 - Loss: 0.0028920480981469154
--- 0.47s ---
Epochs 423/1000 - Loss: 0.0028875742573291063
--- 0.469s ---
Epochs 424/1000 - Loss: 0.002883118810132146
--- 0.473s ---
Epochs 425/1000 - Loss: 0.0028786815237253904
--- 0.489s ---
Epochs 426/1000 - Loss: 0.0028742621652781963
--- 0.489s ---
Epochs 427/1000 - Loss: 0.0028698602691292763
--- 0.475s ---
Epochs 428/1000 - Loss: 0.0028654758352786303
--- 0.478s ---
Epochs 429/1000 - Loss: 0.0028611095622181892
--- 0.476s ---
Epochs 430/1000

--- 0.476s ---
Epochs 550/1000 - Loss: 0.0024345568381249905
--- 0.483s ---
Epochs 551/1000 - Loss: 0.0024316960480064154
--- 0.473s ---
Epochs 552/1000 - Loss: 0.002428844105452299
--- 0.472s ---
Epochs 553/1000 - Loss: 0.0024260003119707108
--- 0.475s ---
Epochs 554/1000 - Loss: 0.0024231658317148685
--- 0.471s ---
Epochs 555/1000 - Loss: 0.0024203392677009106
--- 0.47s ---
Epochs 556/1000 - Loss: 0.0024175215512514114
--- 0.472s ---
Epochs 557/1000 - Loss: 0.00241471198387444
--- 0.472s ---
Epochs 558/1000 - Loss: 0.002411910565569997
--- 0.476s ---
Epochs 559/1000 - Loss: 0.0024091179948300123
--- 0.476s ---
Epochs 560/1000 - Loss: 0.002406333340331912
--- 0.475s ---
Epochs 561/1000 - Loss: 0.0024035570677369833
--- 0.489s ---
Epochs 562/1000 - Loss: 0.002400789177045226
--- 0.47s ---
Epochs 563/1000 - Loss: 0.0023980294354259968
--- 0.474s ---
Epochs 564/1000 - Loss: 0.0023952778428792953
--- 0.495s ---
Epochs 565/1000 - Loss: 0.0023925339337438345
--- 0.478s ---
Epochs 566/1000 -

--- 0.478s ---
Epochs 686/1000 - Loss: 0.0021105839405208826
--- 0.478s ---
Epochs 687/1000 - Loss: 0.0021086016204208136
--- 0.474s ---
Epochs 688/1000 - Loss: 0.0021066232584416866
--- 0.47s ---
Epochs 689/1000 - Loss: 0.0021046497859060764
--- 0.475s ---
Epochs 690/1000 - Loss: 0.0021026814356446266
--- 0.484s ---
Epochs 691/1000 - Loss: 0.0021007175091654062
--- 0.483s ---
Epochs 692/1000 - Loss: 0.0020987584721297026
--- 0.482s ---
Epochs 693/1000 - Loss: 0.0020968043245375156
--- 0.491s ---
Epochs 694/1000 - Loss: 0.0020948543678969145
--- 0.468s ---
Epochs 695/1000 - Loss: 0.0020929095335304737
--- 0.471s ---
Epochs 696/1000 - Loss: 0.002090969355776906
--- 0.477s ---
Epochs 697/1000 - Loss: 0.002089033368974924
--- 0.478s ---
Epochs 698/1000 - Loss: 0.0020871025044471025
--- 0.477s ---
Epochs 699/1000 - Loss: 0.0020851758308708668
--- 0.473s ---
Epochs 700/1000 - Loss: 0.002083253813907504
--- 0.478s ---
Epochs 701/1000 - Loss: 0.0020813364535570145
--- 0.475s ---
Epochs 702/10

--- 0.478s ---
Epochs 822/1000 - Loss: 0.0018785475986078382
--- 0.478s ---
Epochs 823/1000 - Loss: 0.0018770805327221751
--- 0.471s ---
Epochs 824/1000 - Loss: 0.0018756167264655232
--- 0.475s ---
Epochs 825/1000 - Loss: 0.0018741555977612734
--- 0.477s ---
Epochs 826/1000 - Loss: 0.001872697495855391
--- 0.483s ---
Epochs 827/1000 - Loss: 0.0018712421879172325
--- 0.474s ---
Epochs 828/1000 - Loss: 0.0018697900231927633
--- 0.489s ---
Epochs 829/1000 - Loss: 0.0018683405360206962
--- 0.474s ---
Epochs 830/1000 - Loss: 0.0018668940756469965
--- 0.471s ---
Epochs 831/1000 - Loss: 0.0018654506420716643
--- 0.478s ---
Epochs 832/1000 - Loss: 0.001864010002464056
--- 0.471s ---
Epochs 833/1000 - Loss: 0.0018625721568241715
--- 0.473s ---
Epochs 834/1000 - Loss: 0.0018611372215673327
--- 0.471s ---
Epochs 835/1000 - Loss: 0.0018597050802782178
--- 0.472s ---
Epochs 836/1000 - Loss: 0.0018582759657874703
--- 0.476s ---
Epochs 837/1000 - Loss: 0.001856849412433803
--- 0.473s ---
Epochs 838/1

Epochs 957/1000 - Loss: 0.0017038038931787014
--- 0.497s ---
Epochs 958/1000 - Loss: 0.00170266127679497
--- 0.496s ---
Epochs 959/1000 - Loss: 0.001701520523056388
--- 0.469s ---
Epochs 960/1000 - Loss: 0.0017003817483782768
--- 0.466s ---
Epochs 961/1000 - Loss: 0.001699244836345315
--- 0.471s ---
Epochs 962/1000 - Loss: 0.001698110019788146
--- 0.48s ---
Epochs 963/1000 - Loss: 0.0016969769494608045
--- 0.48s ---
Epochs 964/1000 - Loss: 0.001695845858193934
--- 0.475s ---
Epochs 965/1000 - Loss: 0.0016947166295722127
--- 0.471s ---
Epochs 966/1000 - Loss: 0.0016935892635956407
--- 0.485s ---
Epochs 967/1000 - Loss: 0.0016924637602642179
--- 0.479s ---
Epochs 968/1000 - Loss: 0.001691340235993266
--- 0.478s ---
Epochs 969/1000 - Loss: 0.0016902184579521418
--- 0.483s ---
Epochs 970/1000 - Loss: 0.0016890988918021321
--- 0.495s ---
Epochs 971/1000 - Loss: 0.0016879806062206626
--- 0.483s ---
Epochs 972/1000 - Loss: 0.0016868647653609514
--- 0.553s ---
Epochs 973/1000 - Loss: 0.0016857

The recommmendations we are going to give will consist of twenty items. We select ten of the most popular. The other ten will be the most popular items per user, provided by the model just trained.

In [27]:
recommender_system_ratings = recommender_system.ratings.numpy()

with open('output.csv','w') as f:
    for i in range(113880+1):
        my_predictions = recommender_system_ratings[i,:]
        ix = list(np.argsort(my_predictions)[::-1])
        r = list(metadata_train.query('account_id == {}'.format(i)).sort_values(by=['ratings'], ascending=False).loc[:,'content_id'].values)
        r_ = [t for t in ix if t not in r][:10]
        r__ = [t for t in top if t not in r+r_][:10]
        f.write(str(i))
        f.write(':')
        f.write(str(r__+r_))
        f.write('\n')

-----

The last step is to calculate  the MAP (Mean Average Precision). We load the test information provided and perform the calculation.

In [28]:
with open('test.csv','r') as f:
    test_csv = f.readlines()
labels = []
for line in test_csv:
    label_line = []
    for n in line.split(':')[1][1:-2].split(','):
        if n!='':
            label_line.append(int(n))
    labels.append(label_line)

In [29]:
with open('output.csv','r') as f:
    base_model_csv = f.readlines()
preds = []
for line in base_model_csv:
    pred_line = []
    for n in line.split(':')[1][1:-2].split(','):
        pred_line.append(int(n))
    preds.append(pred_line)

In [30]:
aps = [] 
for pred, label in zip(preds, labels):
    n = len(pred) 
    arange = np.arange(n, dtype=np.int32) + 1. 
    rel_k = np.in1d(pred[:n], label) 
    tp = np.ones(rel_k.sum(), dtype=np.int32).cumsum() 
    denom = arange[rel_k] 
    if len(label)!=0:
        ap = (tp / denom).sum() / len(label) 
    else:
        ap = 0.
    aps.append(ap)
np.mean(aps)

0.021896437213015325


#### As we can see, our model beats the provided baseline model MAP, so it could be considered a candidate solution for the original problem presented. Additional exploration of hyperparameters would be necessary to make a final conclusion.