In [1]:
# Intall TensorFlow
# !pip install -q tensorflow-gpu==2.2.0-beta1

try:
  %tensorflow_version 2.x # Colab only
except Exception:
  pass

import tensorflow as tf
print(tf.__version__)

`%tensorflow_version` only switches the major version: 1.x or 2.x.
You set: `2.x # Colab only`. This will be interpreted as: `2.x`.


TensorFlow 2.x selected.
2.2.0


In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Flatten, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import  SGD, Adam


In [3]:
# Download the MovieLens 20M archive; -nc (no-clobber) skips the download when ml-20m.zip is already present
!wget -nc http://files.grouplens.org/datasets/movielens/ml-20m.zip

--2020-06-07 16:43:33--  http://files.grouplens.org/datasets/movielens/ml-20m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 198702078 (189M) [application/zip]
Saving to: ‘ml-20m.zip’


2020-06-07 16:43:46 (15.0 MB/s) - ‘ml-20m.zip’ saved [198702078/198702078]



In [4]:
# Extract the archive; -n never overwrites files that were already extracted
!unzip -n ml-20m.zip

Archive:  ml-20m.zip
   creating: ml-20m/
  inflating: ml-20m/genome-scores.csv  
  inflating: ml-20m/genome-tags.csv  
  inflating: ml-20m/links.csv        
  inflating: ml-20m/movies.csv       
  inflating: ml-20m/ratings.csv      
  inflating: ml-20m/README.txt       
  inflating: ml-20m/tags.csv         


In [5]:
# List the working directory to confirm the ml-20m folder was created
!ls

ml-20m	ml-20m.zip  sample_data


In [32]:
# Read the ratings table from the extracted archive and preview the first rows
ratings_csv = 'ml-20m/ratings.csv'
df = pd.read_csv(ratings_csv)
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [33]:
# We can't trust the raw userId values to be numbered 0...N-1, so derive our own ids.
# pd.Categorical assigns every distinct userId an integer code (categories are
# the sorted unique values), which yields contiguous ids starting at 0.
# The codes are written to a new column; the original userId column is left
# unchanged so this cell can be re-run without altering its own input.
df["new_user_id"] = pd.Categorical(df["userId"]).codes
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,new_user_id
0,1,2,3.5,1112486027,0
1,1,29,3.5,1112484676,0
2,1,32,3.5,1112484819,0
3,1,47,3.5,1112484727,0
4,1,50,3.5,1112484580,0


In [34]:
# Now do the same thing for movie ids: map every distinct movieId to a
# contiguous integer code starting at 0 (categories are the sorted unique values).
# The codes are written to a new column; the original movieId column is left
# unchanged so this cell can be re-run without altering its own input.
df["new_movie_id"] = pd.Categorical(df["movieId"]).codes
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,new_user_id,new_movie_id
0,1,2,3.5,1112486027,0,1
1,1,29,3.5,1112484676,0,28
2,1,32,3.5,1112484819,0,31
3,1,47,3.5,1112484727,0,46
4,1,50,3.5,1112484580,0,49


In [0]:
# Pull the remapped user ids, remapped movie ids and ratings out of the frame
# as three separate arrays
user_ids, movie_ids, ratings = (
    df[column].values
    for column in ('new_user_id', 'new_movie_id', 'rating')
)

In [0]:
# Get number of users and number of movies (these size the embedding tables).
# np.unique counts distinct values directly on the arrays; building a Python
# set would first box every element into a Python object.
N = len(np.unique(user_ids))
M = len(np.unique(movie_ids))

# Set embedding dimension
K = 10

In [0]:
# Make a neural network that predicts a rating from a (user id, movie id) pair.
# Each id is looked up in its own embedding table, the two embedding vectors are
# concatenated, and a small dense network maps that vector to a single number.

# User input: one integer id per sample
u = Input(shape=(1,))

# Movie input: one integer id per sample
m = Input(shape=(1,))

# User Embedding: table with N rows (one per user) of K values each
u_emb = Embedding(N, K)(u) # output is (num_samples, 1, K)

# Movie Embedding: table with M rows (one per movie) of K values each
m_emb = Embedding(M, K)(m) # output is (num_samples, 1, K)

# Flatten both embeddings to drop the length-1 axis
u_emb = Flatten()(u_emb) # now it's (num_samples, K)
m_emb = Flatten()(m_emb) # now it's (num_samples, K)

# Concatenate user-movie embeddings into a feature vector
x = Concatenate()([u_emb, m_emb]) # now it's (num_samples, 2K)

# Now that we have a feature vector, it's just a regular ANN:
# one hidden ReLU layer followed by a single linear output unit (regression)
x = Dense(1024, activation='relu')(x)
# Alternative hidden-layer configuration, currently disabled:
# x = Dense(400, activation='relu')(x)
# x = Dense(400, activation='relu')(x)
x = Dense(1)(x)

In [0]:
# Build the model from the two id inputs to the rating output and compile it
# for regression (mean squared error) with SGD + momentum.
model = Model(inputs=[u, m], outputs=x)
model.compile(
    loss='mse',
    # `learning_rate` is the supported argument name; `lr` is only a
    # deprecated alias for it.
    optimizer=SGD(learning_rate=0.08, momentum=0.9),
)

In [0]:
# split the data
# Shuffle the three arrays together so rows stay aligned. The fixed seed makes
# the train/test partition the same on every run, so losses are comparable
# across re-runs of the notebook.
user_ids, movie_ids, ratings = shuffle(user_ids, movie_ids, ratings, random_state=42)
Ntrain = int(0.8 * len(ratings))  # first 80% of the shuffled rows are for training

# train
train_user = user_ids[:Ntrain]
train_movie = movie_ids[:Ntrain]
train_ratings = ratings[:Ntrain]

# test
test_user = user_ids[Ntrain:]
test_movie = movie_ids[Ntrain:]
test_ratings = ratings[Ntrain:]

# center the ratings
# The mean is computed on the training split only and then applied to both
# splits, so no information from the test ratings leaks into training.
avg_rating = train_ratings.mean()
train_ratings = train_ratings - avg_rating
test_ratings = test_ratings - avg_rating

In [40]:
# Train on the training split and score the held-out split after every epoch.
# The returned History object is kept in `r`.
fit_inputs = [train_user, train_movie]
holdout = ([test_user, test_movie], test_ratings)
r = model.fit(
    x=fit_inputs,
    y=train_ratings,
    batch_size=1024,
    epochs=25,
    validation_data=holdout,
    verbose=2,  # one summary line per epoch instead of a progress bar
)

Epoch 1/25
15626/15626 - 147s - loss: 0.7788 - val_loss: 0.7242
Epoch 2/25
15626/15626 - 147s - loss: 0.7022 - val_loss: 0.6980
Epoch 3/25
15626/15626 - 145s - loss: 0.6797 - val_loss: 0.6825
Epoch 4/25
15626/15626 - 149s - loss: 0.6639 - val_loss: 0.6775
Epoch 5/25
15626/15626 - 145s - loss: 0.6533 - val_loss: 0.6693
Epoch 6/25
15626/15626 - 146s - loss: 0.6416 - val_loss: 0.6614
Epoch 7/25
15626/15626 - 147s - loss: 0.6269 - val_loss: 0.6515
Epoch 8/25
15626/15626 - 148s - loss: 0.6119 - val_loss: 0.6430
Epoch 9/25
15626/15626 - 146s - loss: 0.6012 - val_loss: 0.6407
Epoch 10/25
15626/15626 - 145s - loss: 0.5923 - val_loss: 0.6410
Epoch 11/25
15626/15626 - 145s - loss: 0.5840 - val_loss: 0.6337
Epoch 12/25
15626/15626 - 139s - loss: 0.5763 - val_loss: 0.6314
Epoch 13/25
15626/15626 - 143s - loss: 0.5690 - val_loss: 0.6321
Epoch 14/25
15626/15626 - 143s - loss: 0.5628 - val_loss: 0.6324
Epoch 15/25
15626/15626 - 147s - loss: 0.5575 - val_loss: 0.6324
Epoch 16/25
15626/15626 - 144s - l

In [0]:
# plot losses
# Per-epoch training and validation loss on one labelled axes, so the figure
# can be read on its own.
fig, ax = plt.subplots()
ax.plot(r.history['loss'], label='train loss')
ax.plot(r.history['val_loss'], label='val loss')
ax.set_xlabel('epoch')
ax.set_ylabel('MSE loss')
ax.set_title('Training vs. validation loss')
ax.legend()
plt.show()

In [0]:
# is this on par with other approaches
# RMSE on the held-out ratings: square root of the final epoch's validation MSE.
# Read from the training history rather than typing in a number copied from the
# log, so the value always matches the run that actually produced `r`.
np.sqrt(r.history['val_loss'][-1])