Recommender system project using the MovieLens dataset (20 million ratings)

In [1]:
# Print the installed TensorFlow version so the environment the notebook ran
# in is recorded alongside its outputs.
import tensorflow as tf
print(tf.__version__)

2.11.0


In [2]:
from tensorflow.keras.layers import Dense, Input, Embedding, Flatten, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import SGD, Adam

from sklearn.utils import shuffle

import numpy as np
import matplotlib.pylab
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
# Fetch the MovieLens 20M archive over HTTPS rather than plain HTTP.
# -nc (no-clobber) skips the download when ml-20m.zip is already present.
!wget -nc https://files.grouplens.org/datasets/movielens/ml-20m.zip


--2023-03-21 13:46:44--  http://files.grouplens.org/datasets/movielens/ml-20m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 198702078 (189M) [application/zip]
Saving to: ‘ml-20m.zip’


2023-03-21 13:47:00 (12.7 MB/s) - ‘ml-20m.zip’ saved [198702078/198702078]



In [5]:
# Extract the archive into ./ml-20m/; -n never overwrites files that already
# exist, so re-running this cell is harmless.
!unzip -n ml-20m.zip

Archive:  ml-20m.zip
   creating: ml-20m/
  inflating: ml-20m/genome-scores.csv  
  inflating: ml-20m/genome-tags.csv  
  inflating: ml-20m/links.csv        
  inflating: ml-20m/movies.csv       
  inflating: ml-20m/ratings.csv      
  inflating: ml-20m/README.txt       
  inflating: ml-20m/tags.csv         


In [6]:
# List the working directory to confirm the ml-20m/ folder was extracted.
!ls

ml-20m	ml-20m.zip  sample_data


In [7]:
# Load the raw ratings table from the extracted archive and preview the first
# few rows.
RATINGS_CSV = 'ml-20m/ratings.csv'
df = pd.read_csv(RATINGS_CSV)
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [8]:
# We can't trust movieId to be numbered 0 ... M-1, so assign our own ids.
# Converting the column to a pandas Categorical and taking its integer codes
# gives every distinct movieId a contiguous index starting at 0 in a single
# vectorized step; this replaces the slow row-by-row dict-mapping sketch that
# previously sat here as a dead (and broken) triple-quoted string.
df['movieId'] = pd.Categorical(df['movieId'])
df['new_movie_id'] = df['movieId'].cat.codes

In [9]:
# We can't trust userId to be numbered 0 ... N-1, so assign our own ids.
# Converting the column to a pandas Categorical and taking its integer codes
# gives every distinct userId a contiguous index starting at 0 in a single
# vectorized step; this replaces the slow row-by-row dict-mapping sketch that
# previously sat here as a dead (and broken) triple-quoted string.
df['userId'] = pd.Categorical(df['userId'])
df['new_user_id'] = df['userId'].cat.codes

In [10]:
# Pull the re-indexed user ids, re-indexed movie ids, and ratings out of the
# DataFrame as three parallel NumPy arrays (one entry per rating row).
user_ids, movie_ids, ratings = (
    df[column].values
    for column in ('new_user_id', 'new_movie_id', 'rating')
)

In [11]:
# Number of users (N) and number of movies (M).
# The ids are contiguous category codes starting at 0, so "largest id + 1" is
# the number of distinct ids, and it is exactly the input_dim an Embedding
# layer requires. This is a single vectorized pass, instead of pushing every
# rating row through a Python set as len(set(...)) did.
N = int(user_ids.max()) + 1
M = int(movie_ids.max()) + 1

# Embedding dimension (length of each user / movie vector)
K = 10

Build the neural network

In [12]:
# One scalar id per sample for each of the two inputs.
u = Input(shape=(1,))  # user id
m = Input(shape=(1,))  # movie id

# Look up a K-dimensional vector for each id. The Embedding output has shape
# (num_samples, 1, K); Flatten drops the singleton axis -> (num_samples, K).
u_emb = Flatten()(Embedding(N, K)(u))
m_emb = Flatten()(Embedding(M, K)(m))

# Join the user and movie vectors into one feature vector: (num_samples, 2K).
x = Concatenate()([u_emb, m_emb])

# From here on it is a regular feed-forward network: one hidden ReLU layer
# followed by a single linear unit that outputs the predicted rating.
x = Dense(1024, activation='relu')(x)
x = Dense(1)(x)

Build Model and compile

In [13]:
# Wire the two id inputs to the single rating output and compile the model
# for regression (mean squared error) with SGD + momentum.
model = Model(inputs=[u, m], outputs=x)
model.compile(
    loss='mse',
    # `lr` is a deprecated alias; `learning_rate` is the supported keyword.
    optimizer=SGD(learning_rate=0.08, momentum=0.9),
)



In [14]:
# Split the data: shuffle the three parallel arrays together (keeping rows
# aligned), then use the first 80% for training and the rest for testing.
# A fixed random_state makes the split reproducible on Restart & Run All.
SPLIT_SEED = 42
user_ids, movie_ids, ratings = shuffle(
    user_ids, movie_ids, ratings, random_state=SPLIT_SEED
)
Ntrain = int(0.8 * len(ratings))
train_user = user_ids[:Ntrain]
train_movie = movie_ids[:Ntrain]
train_ratings = ratings[:Ntrain]

test_user = user_ids[Ntrain:]
test_movie = movie_ids[Ntrain:]
test_ratings = ratings[Ntrain:]

# Center the ratings. The mean is computed on the training portion only and
# the same offset is applied to the test portion, so no test information is
# used when fitting.
avg_rating = train_ratings.mean()
train_ratings = train_ratings - avg_rating
test_ratings = test_ratings - avg_rating


In [15]:
# Train on the training split and evaluate on the held-out split after every
# epoch; the returned History object is kept in `r` for the plots below.
train_inputs = [train_user, train_movie]
validation_set = ([test_user, test_movie], test_ratings)

r = model.fit(
    train_inputs,
    train_ratings,
    validation_data=validation_set,
    batch_size=1024,
    epochs=25,
    verbose=2,  # one summary line per epoch instead of a progress bar
)

Epoch 1/25


KeyboardInterrupt: ignored

In [None]:
# Plot training vs. validation loss per epoch. The loss is MSE on the
# mean-centered ratings. Title and axis labels are set so the figure can be
# read on its own; the trailing ';' suppresses the legend repr.
fig, ax = plt.subplots()
ax.plot(r.history['loss'], label="loss")
ax.plot(r.history['val_loss'], label="val_loss")
ax.set(title="Loss per epoch", xlabel="epoch", ylabel="MSE")
ax.legend();

In [None]:
# The model is compiled with loss='mse' and no metrics, so the history holds
# no 'accuracy' / 'val_accuracy' entries (looking them up raises KeyError),
# and classification accuracy is not meaningful for a rating regression.
# Plot RMSE per epoch instead, derived from the recorded MSE loss; it is in
# the same units as the ratings.
fig, ax = plt.subplots()
ax.plot(np.sqrt(r.history['loss']), label="rmse")
ax.plot(np.sqrt(r.history['val_loss']), label="val_rmse")
ax.set(title="RMSE per epoch", xlabel="epoch", ylabel="RMSE (rating units)")
ax.legend();