# Cargar MovieLens

In [None]:
!wget https://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip ml-100k.zip

In [None]:
import pandas as pd
import numpy as np 
from tqdm.notebook import tqdm
from sklearn.metrics import ndcg_score

# Load the u1 train/test split: tab-separated files without a header row.
train = pd.read_csv('ml-100k/u1.base', sep='\t', header=None,
                    names=['user', 'movie', 'rating', 'extra'])
test = pd.read_csv('ml-100k/u1.test', sep='\t', header=None,
                   names=['user', 'movie', 'rating', 'extra'])

# Keep only the train rows whose user and movie both occur in the test split,
# then only the test rows whose user and movie survive in the filtered train split.
train = train[train['user'].isin(set(test['user'])) & train['movie'].isin(set(test['movie']))]
test = test[test['user'].isin(set(train['user'])) & test['movie'].isin(set(train['movie']))]

In [None]:
# Number of sample predictions printed when a model's output is previewed.
max_show = 10
# Summary statistics of the filtered training split.
train.describe()

# Procesar

En esta sección se preprocesan los datos para asignarles un índice en base 0 tanto a los usuarios como a las películas.

In [None]:
# Distinct users, then distinct movies: train count first, test count second.
for column in ('user', 'movie'):
    for split in (train, test):
        print(len(set(split[column])))

# Movies, then users, that appear in test but not in train.
for column in ('movie', 'user'):
    print(len(set(test[column]).difference(train[column])))

In [None]:
# Sorted unique raw ids, each mapped to its position in the sorted list
# (a dense zero-based index).
users = sorted(set(train['user']))
u_id = dict(zip(users, range(len(users))))

movies = sorted(set(train['movie']))
m_id = dict(zip(movies, range(len(movies))))

In [None]:
# Translate raw ids into zero-based indices; an id that is missing from the
# lookup tables raises KeyError.
u_train = list(map(u_id.__getitem__, train['user']))
u_test = list(map(u_id.__getitem__, test['user']))

m_train = list(map(m_id.__getitem__, train['movie']))
m_test = list(map(m_id.__getitem__, test['movie']))

# Ratings are taken straight from the frames as pandas Series.
r_train, r_test = train['rating'], test['rating']

In [None]:
def test_matrix(users, movies, u_test, m_test, r_test):
    mat = np.zeros((len(users), len(movies)))
    for u, m, r in zip(u_test, m_test, r_test):
        mat[u, m] = r
    return mat 

In [None]:
def predict(model, users, movies, u_train, m_train):
    """Score every (user, movie) pair with ``model`` and mask the training pairs.

    Args:
        model: Object whose ``predict([user_col, movie_col], batch_size=...,
            verbose=...)`` returns one row per input pair with the score in
            column 0.
        users: Sized iterable of zero-based user indices; each value is used
            directly as a row index of the result.
        movies: Sized collection; only its length is used, the columns are
            ``0 .. len(movies) - 1``.
        u_train: User index of each training interaction.
        m_train: Movie index of each training interaction, parallel to
            ``u_train``.

    Returns:
        ``np.ndarray`` of shape ``(len(users), len(movies))`` with the predicted
        scores. Training pairs are overwritten with a value strictly below every
        prediction, so they always rank last.
    """
    n_users, n_movies = len(users), len(movies)
    mat = np.zeros((n_users, n_movies))
    if mat.size == 0:
        # Nothing to score; also avoids taking the minimum of an empty array.
        return mat

    # The movie column is the same for every user, so build it once.
    m_p = np.arange(n_movies)[:, np.newaxis]
    for u in tqdm(users):
        u_p = np.full((n_movies, 1), u)
        # verbose=0: tqdm already reports progress, no need for one extra
        # progress bar per user.
        pred = model.predict([u_p, m_p], batch_size=n_movies, verbose=0)
        mat[u, :] = np.asarray(pred)[:, 0]

    # Mask already-seen pairs with a score below every prediction. A literal 0
    # would outrank any negative prediction, which models with an unbounded
    # output can produce.
    seen_users = np.asarray(u_train, dtype=np.intp)
    seen_movies = np.asarray(m_train, dtype=np.intp)
    mat[seen_users, seen_movies] = mat.min() - 1.0
    return mat

# Modelo Basado en factorización de matrices

En estos modelos se asume que existen características latentes que relacionan a los usuarios con los ítems.

||Película 1|Película 2|Película 3|Película 4|Película 5|
|-|-|-|-|-|-| 
|Usuario 1|?|4|?|3|?|
|Usuario 2|1|2|?|?|?|
|Usuario 3|3|?|4|?|3|
|Usuario 4|5|4|?|4|?|
|Usuario 5|3|?|2|?|5|


$Ratings=U \cdot M^T$

Donde $U$ es una matriz de $Usuarios \times Factores$ y $M$ es una matriz de $Películas \times Factores$.



In [None]:
from tensorflow.keras.layers import Dense, Concatenate, Flatten, Dot, Input, Embedding
from tensorflow.keras.models import Model

# User branch: one integer index looked up in a 50-dimensional embedding table.
iu = Input(shape=(1,), name='user_i')
ue = Embedding(input_dim=len(set(users)), output_dim=50, name='emb_user')(iu)
ue = Flatten()(ue)

# Movie branch, built the same way.
im = Input(shape=(1,), name='movie_i')
me = Embedding(input_dim=len(set(movies)), output_dim=50, name='emb_movie')(im)
me = Flatten()(me)

# The prediction is the dot product of the two embedding vectors.
d = Dot(axes=-1)([ue, me])

model = Model(inputs=[iu, im], outputs=d)
model.compile(optimizer='nadam', loss='mse', metrics=['mae'])
model.summary()

In [None]:
from IPython.display import Image
from tensorflow.keras.utils import plot_model

# Write the architecture diagram to model.png, then display that file inline.
plot_model(model, to_file='model.png', show_shapes=True, show_layer_names=True)
Image(filename='model.png', retina=True)

In [None]:
# Inputs are column vectors of user / movie indices and the targets are the
# raw ratings; the test split is passed as validation data.
model.fit(
    x=[np.asarray(u_train)[:, np.newaxis], np.asarray(m_train)[:, np.newaxis]],
    y=train['rating'].to_numpy()[:, np.newaxis],
    batch_size=128,
    epochs=10,
    validation_data=(
        [np.asarray(u_test)[:, np.newaxis], np.asarray(m_test)[:, np.newaxis]],
        test['rating'].to_numpy()[:, np.newaxis],
    ),
)

In [None]:
# Score the test (user, movie) pairs and print the first max_show predictions
# next to the true ratings.
r_pred = model.predict([np.asarray(u_test)[:, np.newaxis], np.asarray(m_test)[:, np.newaxis]])

for t, p in zip(test['rating'].iloc[:max_show], r_pred[:max_show, 0]):
    print(f"Real: {t} Predicho: {p}")

In [None]:
# real: dense user x movie matrix holding the test ratings.
# pred: model scores for every (user, movie) pair, with the training pairs
# masked out by predict().
real = test_matrix(u_id.values(), m_id.values(), u_test, m_test, test['rating'])
pred = predict(model, u_id.values(), m_id.values(), u_train, m_train)

In [None]:
# NDCG of the predicted scores against the true test ratings at several cut-offs k.
for k in (10, 20, 30, 100):
    print(ndcg_score(real, pred, k=k))

# Modelo Factorización + Bias

Similar a lo presentado anteriormente, pero asume que, además de las matrices de factores, existen valores de bias del usuario y de las películas. La predicción está dada por:

$Rating_{u, i}=U_u \cdot M_i + Bias_u + Bias_i$

In [None]:
from tensorflow.keras.layers import Input, Embedding, Dot, Add, Flatten, Activation
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2

emb_size = 50

# User branch: a latent vector and a scalar bias, both looked up by user index.
iu = Input(shape=(1,), name='user_i')
ue = Embedding(input_dim=len(set(users)), output_dim=emb_size, name='emb_user')(iu)
ub = Embedding(input_dim=len(set(users)), output_dim=1, name='bias_user')(iu)

# Movie branch, built the same way.
im = Input(shape=(1,), name='movie_i')
me = Embedding(input_dim=len(set(movies)), output_dim=emb_size, name='emb_movie')(im)
mb = Embedding(input_dim=len(set(movies)), output_dim=1, name='bias_movie')(im)

# Prediction = dot product of the latent vectors + user bias + movie bias.
dot = Dot(axes=-1)([ue, me])
biases = Add()([dot, ub, mb])
out = Flatten()(biases)

model = Model(inputs=[iu, im], outputs=out)
model.compile(optimizer='nadam', loss='mse', metrics=['mae'])
model.summary()

In [None]:
from IPython.display import Image
from tensorflow.keras.utils import plot_model

# Write the architecture diagram to model.png, then display that file inline.
plot_model(model, to_file='model.png', show_shapes=True, show_layer_names=True)
Image(filename='model.png', retina=True)

In [None]:
# Inputs are column vectors of user / movie indices and the targets are the
# raw ratings; the test split is passed as validation data.
model.fit(
    x=[np.asarray(u_train)[:, np.newaxis], np.asarray(m_train)[:, np.newaxis]],
    y=train['rating'].to_numpy()[:, np.newaxis],
    batch_size=128,
    epochs=10,
    validation_data=(
        [np.asarray(u_test)[:, np.newaxis], np.asarray(m_test)[:, np.newaxis]],
        test['rating'].to_numpy()[:, np.newaxis],
    ),
)

In [None]:
# Score the test (user, movie) pairs and print the first max_show predictions
# next to the true ratings.
r_pred = model.predict([np.asarray(u_test)[:, np.newaxis], np.asarray(m_test)[:, np.newaxis]])

for t, p in zip(test['rating'].iloc[:max_show], r_pred[:max_show, 0]):
    print(f"Real: {t} Predicho: {p}")

In [None]:
# real: dense user x movie matrix holding the test ratings.
# pred: model scores for every (user, movie) pair, with the training pairs
# masked out by predict().
real = test_matrix(u_id.values(), m_id.values(), u_test, m_test, test['rating'])
pred = predict(model, u_id.values(), m_id.values(), u_train, m_train)

In [None]:
# NDCG of the predicted scores against the true test ratings at several cut-offs k.
for k in (10, 20, 30, 100):
    print(ndcg_score(real, pred, k=k))

# Modelo DL

También se puede definir algún modelo arbitrario.

In [None]:
from tensorflow.keras.layers import Dense, Concatenate, Flatten, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2

# User and movie indices are each looked up in a 50-dimensional embedding table.
iu = Input((1,), name='user_i')
ue = Embedding(len(set(users)), 50, name='emb_user')(iu)

im = Input((1,), name='movie_i')
me = Embedding(len(set(movies)),50, name='emb_movie')(im)

# Both embeddings are flattened and concatenated into one feature vector.
f = Concatenate(axis=-1)([Flatten()(ue), Flatten()(me)])

# Hidden layers need a non-linear activation: Dense is linear by default, and
# a stack of linear layers is equivalent to a single linear layer.
d = Dense(50, activation='relu')(f)
d = Dense(50, activation='relu')(d)
d = Dense(50, activation='relu')(d)
# Linear output unit: the model is trained with MSE against the rating.
d = Dense(1)(d)

model = Model([iu, im], d)

model.compile(loss='mse', optimizer='nadam', metrics=['mae'])
model.summary()

In [None]:
from IPython.display import Image
from tensorflow.keras.utils import plot_model

# Write the architecture diagram to model.png, then display that file inline.
plot_model(model, to_file='model.png', show_shapes=True, show_layer_names=True)
Image(filename='model.png', retina=True)

In [None]:
# Inputs are column vectors of user / movie indices and the targets are the
# raw ratings; the test split is passed as validation data.
model.fit(
    x=[np.asarray(u_train)[:, np.newaxis], np.asarray(m_train)[:, np.newaxis]],
    y=train['rating'].to_numpy()[:, np.newaxis],
    batch_size=128,
    epochs=10,
    validation_data=(
        [np.asarray(u_test)[:, np.newaxis], np.asarray(m_test)[:, np.newaxis]],
        test['rating'].to_numpy()[:, np.newaxis],
    ),
)

In [None]:
# Score the test (user, movie) pairs and print the first max_show predictions
# next to the true ratings.
r_pred = model.predict([np.asarray(u_test)[:, np.newaxis], np.asarray(m_test)[:, np.newaxis]])

for t, p in zip(test['rating'].iloc[:max_show], r_pred[:max_show, 0]):
    print(f"Real: {t} Predicho: {p}")

In [None]:
# real: dense user x movie matrix holding the test ratings.
# pred: model scores for every (user, movie) pair, with the training pairs
# masked out by predict().
real = test_matrix(u_id.values(), m_id.values(), u_test, m_test, test['rating'])
pred = predict(model, u_id.values(), m_id.values(), u_train, m_train)

In [None]:
# NDCG of the predicted scores against the true test ratings at several cut-offs k.
for k in (10, 20, 30, 100):
    print(ndcg_score(real, pred, k=k))

# Modelo Binario con Bias 

En este caso es similar, solo que se decide si una película le va a interesar al usuario o no, usando como umbral las 3 estrellas: las valoraciones mayores a 3 se consideran de interés.

In [None]:
# Always start from the raw ratings in the data frames rather than from
# r_train / r_test: this cell overwrites those two names with binary labels,
# so reading them back would make a second run compute min_r / max_r from the
# 0/1 labels instead of from the ratings.
min_r = train['rating'].min()
max_r = train['rating'].max()

# Min-max scaled test ratings (training range), kept as graded relevance for
# the ranking evaluation.
r_test_full = (test['rating'] - min_r) / (max_r - min_r)

# Binary labels: 1 when the scaled rating is above 0.5, otherwise 0.
r_train = np.where((train['rating'] - min_r) / (max_r - min_r) > 0.5, 1, 0)
r_test = np.where(r_test_full > 0.5, 1, 0)

In [None]:
from tensorflow.keras.layers import Input, Embedding, Dot, Add, Flatten, Activation
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2

emb_size = 50

# User branch: a latent vector and a scalar bias, both looked up by user index.
iu = Input(shape=(1,), name='user_i')
ue = Embedding(input_dim=len(set(users)), output_dim=emb_size, name='emb_user')(iu)
ub = Embedding(input_dim=len(set(users)), output_dim=1, name='bias_user')(iu)

# Movie branch, built the same way.
im = Input(shape=(1,), name='movie_i')
me = Embedding(input_dim=len(set(movies)), output_dim=emb_size, name='emb_movie')(im)
mb = Embedding(input_dim=len(set(movies)), output_dim=1, name='bias_movie')(im)

# Dot product of the latent vectors + user bias + movie bias, squashed by a
# sigmoid so the output lies between 0 and 1.
dot = Dot(axes=-1)([ue, me])
biases = Add()([dot, ub, mb])
out = Flatten()(biases)
out = Activation('sigmoid')(out)

model = Model(inputs=[iu, im], outputs=out)
model.compile(optimizer='nadam', loss='binary_crossentropy', metrics=['mae'])
model.summary()

In [None]:
from IPython.display import Image
from tensorflow.keras.utils import plot_model

# Write the architecture diagram to model.png, then display that file inline.
plot_model(model, to_file='model.png', show_shapes=True, show_layer_names=True)
Image(filename='model.png', retina=True)

In [None]:
# First ten user indices, movie indices and labels of the training data.
for column in (u_train, m_train, r_train):
    print(column[:10])

In [None]:
# Inputs are column vectors of user / movie indices and the targets are the
# r_train labels; the test split is passed as validation data.
model.fit(
    x=[np.asarray(u_train)[:, np.newaxis], np.asarray(m_train)[:, np.newaxis]],
    y=np.asarray(r_train)[:, np.newaxis],
    batch_size=512,
    epochs=10,
    validation_data=(
        [np.asarray(u_test)[:, np.newaxis], np.asarray(m_test)[:, np.newaxis]],
        np.asarray(r_test)[:, np.newaxis],
    ),
)

In [None]:
# Score the test (user, movie) pairs; each printed prediction is the model
# output mapped linearly from [0, 1] onto the [min_r, max_r] range.
r_pred = model.predict([np.asarray(u_test)[:, np.newaxis], np.asarray(m_test)[:, np.newaxis]])

for t, p in zip(test['rating'].iloc[:max_show], r_pred[:max_show, 0]):
    print(f"Real: {t} Predicho: {p * (max_r - min_r) + min_r}")

In [None]:
# real: dense user x movie matrix holding the scaled (not binarized) test
# ratings from r_test_full.
# pred: model scores for every (user, movie) pair, with the training pairs
# masked out by predict().
real = test_matrix(u_id.values(), m_id.values(), u_test, m_test, r_test_full)
pred = predict(model, u_id.values(), m_id.values(), u_train, m_train)

In [None]:
from sklearn.metrics import ndcg_score

# NDCG of the predicted scores against the scaled test ratings at several cut-offs k.
for k in (10, 20, 30, 100):
    print(ndcg_score(real, pred, k=k))

# Modelo DL Binario

In [None]:
from tensorflow.keras.layers import Dense, Concatenate, Flatten, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2

# User and movie indices are each looked up in a 50-dimensional embedding table.
iu = Input((1,), name='user_i')
ue = Embedding(len(set(users)), 50, name='emb_user')(iu)

im = Input((1,), name='movie_i')
me = Embedding(len(set(movies)),50, name='emb_movie')(im)

# Both embeddings are flattened and concatenated into one feature vector.
f = Concatenate(axis=-1)([Flatten()(ue), Flatten()(me)])

# Hidden layers need a non-linear activation: Dense is linear by default, and
# a stack of linear layers is equivalent to a single linear layer.
d = Dense(50, activation='relu')(f)
d = Dense(50, activation='relu')(d)
d = Dense(50, activation='relu')(d)
# Sigmoid output, trained with binary cross-entropy against the 0/1 labels.
d = Dense(1, activation='sigmoid')(d)

model = Model([iu, im], d)

model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['mae'])
model.summary()

In [None]:
from IPython.display import Image
from tensorflow.keras.utils import plot_model

# Write the architecture diagram to model.png, then display that file inline.
plot_model(model, to_file='model.png', show_shapes=True, show_layer_names=True)
Image(filename='model.png', retina=True)

In [None]:
# Inputs are column vectors of user / movie indices and the targets are the
# r_train labels; the test split is passed as validation data.
model.fit(
    x=[np.asarray(u_train)[:, np.newaxis], np.asarray(m_train)[:, np.newaxis]],
    y=np.asarray(r_train)[:, np.newaxis],
    batch_size=128,
    epochs=10,
    validation_data=(
        [np.asarray(u_test)[:, np.newaxis], np.asarray(m_test)[:, np.newaxis]],
        np.asarray(r_test)[:, np.newaxis],
    ),
)

In [None]:
# Score the test (user, movie) pairs; each printed prediction is the model
# output mapped linearly from [0, 1] onto the [min_r, max_r] range.
r_pred = model.predict([np.asarray(u_test)[:, np.newaxis], np.asarray(m_test)[:, np.newaxis]])

for t, p in zip(test['rating'].iloc[:max_show], r_pred[:max_show, 0]):
    print(f"Real: {t} Predicho: {p * (max_r - min_r) + min_r}")

In [None]:
# real: dense user x movie matrix holding the scaled (not binarized) test
# ratings from r_test_full.
# pred: model scores for every (user, movie) pair, with the training pairs
# masked out by predict().
real = test_matrix(u_id.values(), m_id.values(), u_test, m_test, r_test_full)
pred = predict(model, u_id.values(), m_id.values(), u_train, m_train)

In [None]:
# NDCG of the predicted scores against the scaled test ratings at several cut-offs k.
for k in (10, 20, 30, 100):
    print(ndcg_score(real, pred, k=k))

# Bibliografía extra:

* [Factorización de matrices](https://developers.google.com/machine-learning/recommendation/collaborative/matrix)
* [Wide & Deep Learning](https://ai.googleblog.com/2016/06/wide-deep-learning-better-together-with.html)
* [Wide & Deep Learning for Recommender Systems](https://arxiv.org/abs/1606.07792)
* [Deep Matrix Factorization](https://arxiv.org/abs/2010.00380)
* [xDeepFM: Combining Explicit and Implicit Feature Interactions for Recommender Systems](https://arxiv.org/abs/1803.05170)
* [Nvidia: How to Build a Winning Recommendation System Part 1](https://developer.nvidia.com/blog/how-to-build-a-winning-recommendation-system-part-1/)
* [Nvidia: How to Build a Winning Recommendation System Part 2](https://developer.nvidia.com/blog/how-to-build-a-winning-recommendation-system-part-2-deep-learning-for-recommender-systems/)
* [Nvidia: How to Build a Winning Recommendation System Part 3](https://developer.nvidia.com/blog/how-to-build-a-winning-deep-learning-powered-recommender-system-part-3/)