In [4]:
import pandas as pd
import numpy as np

# Row counts for the positional train/test split of the sampled ratings.
N_TRAIN_ROWS = 10000
N_TEST_ROWS = 1000

# Read the ratings file, discard rows with missing values, and keep only the
# first N_TRAIN_ROWS + N_TEST_ROWS of what remains.
results = (
    pd.read_csv("data/ml-25m/ratings.csv")
    .dropna()
    .head(N_TRAIN_ROWS + N_TEST_ROWS)
)

# Rescale ratings by the largest rating in the sample, so that value becomes 1.0.
rating_scale = results['rating'].max()
results['rating'] = results['rating'].div(rating_scale)

# Positional split: the leading rows are used for training, the trailing rows
# are held out for evaluation.
train_results = results.head(N_TRAIN_ROWS)
test_results = results.tail(N_TEST_ROWS)

results

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,1.0,1147880044
1,1,306,0.7,1147868817
2,1,307,1.0,1147868828
3,1,665,1.0,1147878820
4,1,899,0.7,1147868510
...,...,...,...,...
10995,80,2265,0.2,993231363
10996,80,2268,0.6,993230164
10997,80,2269,0.4,993327172
10998,80,2273,0.8,993231707


In [104]:
from tensorflow.keras.layers import StringLookup, Embedding, Flatten, ReLU, Dot
from tensorflow.keras import Input, Model
from tensorflow.keras.utils import set_random_seed
from tensorflow.keras.activations import sigmoid
from tensorflow.keras import regularizers
from tensorflow.keras import initializers


# Fix Keras' global random seed so repeated runs start from the same state.
set_random_seed(42)

K = 2  # width of each user / movie embedding vector
user_regularization_lambda = 0.0001
movie_regularization_lambda = 0.0001


def _build_tower(prefix, raw_ids, l2_lambda):
    """Build one id -> embedding branch of the model.

    Args:
        prefix: Base name; used for the input (``prefix``) and for the
            ``{prefix}_id_to_int``, ``{prefix}_encoder`` and
            ``{prefix}_vector`` layers.
        raw_ids: pandas Series of ids; its unique string forms become the
            lookup vocabulary.
        l2_lambda: L2 penalty applied to the embedding weights.

    Returns:
        Tuple ``(id_input, lookup_layer, int_ids, embedded, flat)`` holding
        the string input, the StringLookup layer, and the tensors produced
        after lookup, embedding and flattening.
    """
    id_input = Input(shape=(1,), dtype='string', name=prefix)
    lookup_layer = StringLookup(
        vocabulary=raw_ids.astype(str).unique(),
        name=f'{prefix}_id_to_int',
    )
    int_ids = lookup_layer(id_input)
    encoder = Embedding(
        input_dim=lookup_layer.vocabulary_size(),
        output_dim=K,
        name=f'{prefix}_encoder',
        embeddings_regularizer=regularizers.L2(l2_lambda),
        # NOTE(review): both towers pass seed=42 to the same RandomNormal
        # distribution - confirm that sharing a seed between them is intended.
        embeddings_initializer=initializers.RandomNormal(mean=0, stddev=3, seed=42),
    )
    embedded = encoder(int_ids)
    flat = Flatten(name=f'{prefix}_vector')(embedded)
    return id_input, lookup_layer, int_ids, embedded, flat


# The vocabularies come from the full `results` frame, not only from the
# training rows.
user, user_id_to_int, user_ints, user_vector, user_vector_flat = _build_tower(
    'user', results.userId, user_regularization_lambda
)
movies, movie_id_to_int, movie_ints, movie_vector, movie_vector_flat = _build_tower(
    'movie', results.movieId, movie_regularization_lambda
)

# Predicted rating = sigmoid(<user vector, movie vector>).
dot_layer = Dot(axes=(1, 1), name='dot_product')
dot_product = dot_layer([user_vector_flat, movie_vector_flat])
outputs = sigmoid(dot_product)

model = Model(inputs=[user, movies], outputs=outputs)
model.summary()

In [105]:
import numpy as np
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Both arguments are converted to float ndarrays before subtracting, so
    values are compared by position. This avoids pandas aligning two Series
    by index label (which yields NaNs when the labels differ) and lets plain
    Python sequences be passed in.

    Args:
        y_true: Array-like of target values.
        y_pred: Array-like of predicted values.

    Returns:
        The RMSE as a NumPy float.
    """
    true_arr = np.asarray(y_true, dtype=float)
    pred_arr = np.asarray(y_pred, dtype=float)
    if true_arr.size == pred_arr.size:
        # Flatten so a (n,) target against a (n, 1) prediction is compared
        # element-wise instead of broadcasting into an (n, n) matrix.
        true_arr = true_arr.ravel()
        pred_arr = pred_arr.ravel()
    return np.sqrt(np.mean((true_arr - pred_arr) ** 2))

# Score the model on both splits; ids are cast to str because the model's
# inputs are declared with dtype 'string'.
preds_train = model.predict([train_results.userId.astype(str), train_results.movieId.astype(str)])
preds_test = model.predict([test_results.userId.astype(str), test_results.movieId.astype(str)])

# squeeze() drops singleton dimensions from the predictions before they are
# compared with the rating column. The train score must use `preds_train`
# computed above, not a leftover `preds` variable from earlier kernel state.
rmse(train_results.rating, preds_train.squeeze()), rmse(test_results.rating, preds_test.squeeze())

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 


(0.5177224565695472, 0.5251706922406896)

In [106]:
# Minimise mean squared error between predicted and scaled ratings with Adam.
model.compile(loss="mse", optimizer="adam")

# Ids are passed as strings, in the same [user, movie] order as the model inputs.
train_inputs = [
    train_results.userId.astype(str),
    train_results.movieId.astype(str),
]
history = model.fit(
    train_inputs,
    train_results.rating,
    epochs=100,
    verbose=1,
)

Epoch 1/100
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 6.2069
Epoch 2/100
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 5.3285
Epoch 3/100
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 4.5837
Epoch 4/100
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 3.9428
Epoch 5/100
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 3.3890
Epoch 6/100
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 2.9090
Epoch 7/100
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 2.4926
Epoch 8/100
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 2.1318
Epoch 9/100
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.8200
Epoch 10/100
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms

In [108]:
# Predict on both splits; ids are cast to str to match the model's string inputs.
train_inputs = [train_results.userId.astype(str), train_results.movieId.astype(str)]
preds_train = model.predict(train_inputs)

test_inputs = [test_results.userId.astype(str), test_results.movieId.astype(str)]
preds_test = model.predict(test_inputs)

# (train RMSE, test RMSE); predictions are squeezed before being compared
# with the rating column.
train_rmse = rmse(train_results.rating, preds_train.squeeze())
test_rmse = rmse(test_results.rating, preds_test.squeeze())
train_rmse, test_rmse

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 


(0.20942606055993598, 0.25630314592457315)

In [103]:
# Count how many held-out rows belong to a user that also appears in the
# training rows (True) versus a user seen only in the held-out rows (False).
train_users = set(train_results.userId)
user_seen_in_train = test_results.userId.isin(train_users)
user_seen_in_train.value_counts()

userId
False    596
True     404
Name: count, dtype: int64