## Let's get the imports

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from IPython import display


# some additional functions
def mask(df, key, function):
  """Filter ``df`` using an indexer computed from one of its columns.

  Args:
    df: DataFrame to filter.
    key: Label of the column that is handed to ``function``.
    function: Callable receiving ``df[key]`` and returning an indexer
      (typically a boolean Series) that ``df[...]`` accepts.

  Returns:
    The result of indexing ``df`` with ``function(df[key])``.
  """
  selector = function(df[key])
  return df[selector]

def flatten_cols(df):
  """Collapse MultiIndex column labels into single space-joined strings.

  For example the label ``('rating', 'count')`` becomes ``'rating count'`` and
  ``('user_id', '')`` becomes ``'user_id'``. The frame is modified in place
  and also returned so the call can be used inside a method chain.

  Frames whose columns are not a MultiIndex are returned untouched; joining
  a plain string label would otherwise split it into single characters.

  Args:
    df: DataFrame whose columns should be flattened.

  Returns:
    The same ``df`` object.
  """
  if isinstance(df.columns, pd.MultiIndex):
    # str() lets non-string level values (e.g. ints) take part in the join.
    df.columns = [' '.join(str(level) for level in col).strip()
                  for col in df.columns.values]
  return df

# Attach both helpers to DataFrame so they can be called as methods in a chain.
# NOTE: pandas already ships a DataFrame.mask method; this assignment replaces
# it with the helper above for the rest of the session.
setattr(pd.DataFrame, 'mask', mask)
setattr(pd.DataFrame, 'flatten_cols', flatten_cols)

## Get the Data
We will be using the same MovieLens dataset that we used in the previous part of the series. You can get the data from [here](http://files.grouplens.org/datasets/movielens/ml-100k.zip). Feel free to browse MovieLens and pick another subset of the data; the steps remain the same.

Extract the data and let's have a look at what it contains.

In [2]:
# -nc: do not download again when ml-100k.zip is already present.
# -o : overwrite previously extracted files without prompting, so re-running
#      this cell never waits for keyboard input.
!wget -nc http://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip -o ml-100k.zip

--2021-01-07 15:57:01--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’


2021-01-07 15:57:01 (16.2 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]

Archive:  ml-100k.zip
   creating: ml-100k/
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base    

### Read the files as CSV

In [3]:
# Show the raw lines of u.info as a list.
with open("/content/ml-100k/u.info","r") as f:
  info_lines = f.readlines()
print(info_lines)

['943 users\n', '1682 items\n', '100000 ratings\n']


In [4]:
###################################################################
#                                                                 #
# Peek at the first raw lines of every file to see which columns  #
# it holds, so we can pick the ones we need when loading it.      #
#                                                                 #
###################################################################

# (heading, path, size hint). The number given to readlines() is a size
# hint, not a line count: reading stops once the lines read so far
# exceed that total size.
previews = [
    ("User data...", "/content/ml-100k/u.user", 50),
    ("Ratings Data...", "/content/ml-100k/u.data", 50),
    ("Movies Data...", "/content/ml-100k/u.item", 1000),
]

for position, (heading, path, size_hint) in enumerate(previews):
  if position > 0:
    print()  # blank line between sections
  print(heading)
  with open(path,"r") as f:
    print(f.readlines(size_hint))

print()
print("Genre Data...")
with open("/content/ml-100k/u.genre","r") as f:
  genre_lines = f.readlines(5)
for line in genre_lines:
  print(line)

User data...
['1|24|M|technician|85711\n', '2|53|F|other|94043\n', '3|23|M|writer|32067\n']

Ratings Data...
['196\t242\t3\t881250949\n', '186\t302\t3\t891717742\n', '22\t377\t1\t878887116\n']

Movies Data...
['1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0\n', '2|GoldenEye (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?GoldenEye%20(1995)|0|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0\n', '3|Four Rooms (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995)|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0\n', '4|Get Shorty (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995)|0|1|0|0|0|1|0|0|1|0|0|0|0|0|0|0|0|0|0\n', '5|Copycat (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Copycat%20(1995)|0|0|0|0|0|0|1|0|1|0|0|0|0|0|0|0|1|0|0\n', '6|Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)|01-Jan-1995||http://us.imdb.com/Title?Yao+a+yao+yao+dao+waipo+qiao+(1995)|0|0|0|0|0|0|0|0|1|0

In [5]:
###################################################################
#
# Load every file into a DataFrame, using the separators and the
# column layout seen in the raw preview above
#
###################################################################

home_dir = '/content/ml-100k/'

user_columns = ['user_id','age','sex','occupation','zip_code']
rating_columns = ['user_id','movie_id','rating','timestamp']

# None of the files is read with a header row, so column names are
# supplied explicitly through `names`.
users = pd.read_csv(f'{home_dir}u.user', sep='|', names=user_columns, encoding='latin-1')

ratings = pd.read_csv(f'{home_dir}u.data', sep='\t', names=rating_columns, encoding='latin-1')

genres = pd.read_csv(f'{home_dir}u.genre', sep='|', names=['genre','id'], encoding='latin-1')

# Every record of the movies file carries one binary indicator per genre.
# The genre names come from the `genres` frame loaded just above and are
# appended after the five descriptive movie columns.
genre_columns = genres['genre'].tolist()
movie_columns = [
    'movie_id',
    'title',
    'release_date',
    'video_release_date',
    'imdb_url',
    *genre_columns,
]

movies = pd.read_csv(f'{home_dir}u.item', sep='|', names=movie_columns, encoding='latin-1')

In [6]:
###################################################################
#
# The movies DataFrame holds the information about each movie
# followed by the genre columns. A genre column is either 0 or 1,
# depending on whether or not the movie belongs to that genre.
#
###################################################################
movies.head(2)

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [7]:
###################################################################
#
# The users DataFrame contains an id unique to every user together
# with their age, sex, occupation and zip code. We do not use these
# attributes in this notebook, but they can be useful factors in
# explaining a user's choices.
#
###################################################################
users.head(2)

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043


In [8]:
###################################################################
#
# The ratings DataFrame has only a few columns, but they are the
# important ones: every row is one rating (out of 5) given by a
# user to a movie, together with the timestamp of that rating.
#
###################################################################
ratings.head(2)

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742


In [9]:
# Print a header, then show the (rows, columns) shape of each frame in the same order.
print('Genres,Ratings,Users,Movies')
tuple(frame.shape for frame in (genres, ratings, users, movies))

Genres,Ratings,Users,Movies


((19, 2), (100000, 4), (943, 5), (1682, 24))

In [10]:
###################################################################
#
# describe() is one of our mandatory first calls on a DataFrame.
# It looks common, but it can really be a lifesaver. Notice the
# parameter include=[object,int]: use `include` to add dtypes that
# are left out of the summary by default, or exclude=[...] to drop
# dtypes from it.
#
# The builtin `object` is used rather than the `np.object` alias:
# that alias was deprecated in NumPy 1.20 and removed in NumPy 1.24,
# where it raises AttributeError.
#
###################################################################

users.describe(include=[object,int]).T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
user_id,943,,,,472.0,272.365,1.0,236.5,472.0,707.5,943.0
age,943,,,,34.052,12.1927,7.0,25.0,31.0,43.0,73.0
sex,943,2.0,M,670.0,,,,,,,
occupation,943,21.0,student,196.0,,,,,,,
zip_code,943,795.0,55414,9.0,,,,,,,


In [11]:
# Summary of `users` with the `int` columns left out via `exclude`,
# transposed so that each described column becomes a row.
users.describe(exclude=[int]).T

Unnamed: 0,count,unique,top,freq
sex,943,2,M,670
occupation,943,21,student,196
zip_code,943,795,55414,9


## Start preprocessing the data

In [12]:
###################################################################
#
# Per-user rating statistics (number of ratings and mean rating)
# merged with the users frame, so the ratings summary and the user
# information sit in one place.
#
###################################################################

user_ratings = (
    ratings
    .groupby('user_id', as_index=False)
    .agg({'rating': ['count', 'mean']})
    .flatten_cols()                 # ('rating', 'count') -> 'rating count'
    .merge(users, on='user_id')
)

print("Below DataFrame depicts average ratings as well as total number of ratings by a user\n")

user_ratings.head()

Below DataFrame depicts average ratings as well as total number of ratings by a user



Unnamed: 0,user_id,rating count,rating mean,age,sex,occupation,zip_code
0,1,272,3.610294,24,M,technician,85711
1,2,62,3.709677,53,F,other,94043
2,3,54,2.796296,23,M,writer,32067
3,4,24,4.333333,24,M,technician,43537
4,5,175,2.874286,33,F,other,15213


In [None]:
####################################################################
#
# More insight into movies and ratings from the base DataFrames:
# the number of ratings and the average rating of every movie,
# attached to the movie information.
#
####################################################################

tmp_ratings = (
    ratings
    .groupby('movie_id', as_index=False)
    .agg({'rating': ['count', 'mean']})
    .flatten_cols()                 # ('rating', 'count') -> 'rating count'
)

movie_ratings = pd.merge(movies, tmp_ratings, on='movie_id')

movie_ratings.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,rating count,rating mean
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,452,3.878319
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,131,3.206107
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,90,3.033333
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,209,3.550239
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,86,3.302326


# II. What do we need to do?

Our goal is to factorize the ratings matrix $A$ into the product of a user embedding matrix $U$ and movie embedding matrix $V$, such that $A \approx UV^\top$ with
$U = \begin{bmatrix} u_{1} \\ \hline \vdots \\ \hline u_{N} \end{bmatrix}$ and
$V = \begin{bmatrix} v_{1} \\ \hline \vdots \\ \hline v_{M} \end{bmatrix}$.

Here
- $N$ is the number of users,
- $M$ is the number of movies,
- $A_{ij}$ is the rating of the $j$th movie by the $i$th user,
- each row $U_i$ is a $d$-dimensional vector (embedding) representing user $i$,
- each row $V_j$ is a $d$-dimensional vector (embedding) representing movie $j$,
- the prediction of the model for the $(i, j)$ pair is the dot product $\langle U_i, V_j \rangle$.



# Encode Users and Movies as integer indices

In [13]:
####################################################################
#
# Collect the unique user ids (in order of first appearance) and
# build two lookups: user_id -> index and index -> user_id.
#
####################################################################

user_ids = ratings['user_id'].unique().tolist()
user2user_encoded = dict(zip(user_ids, range(len(user_ids))))
userencoded2user = dict(enumerate(user_ids))

In [18]:
# Show the first ten entries of both lookups side by side:
# "<user_id> :  <index>" followed by "<index> :  <user_id>".
for index, user_id in enumerate(user_ids[:10]):
  print(f"{user_id} :  {index}    {index} :  {user_id}")

196 :  0    0 :  196
186 :  1    1 :  186
22 :  2    2 :  22
244 :  3    3 :  244
166 :  4    4 :  166
298 :  5    5 :  298
115 :  6    6 :  115
253 :  7    7 :  253
305 :  8    8 :  305
6 :  9    9 :  6


In [19]:
####################################################################
#
# Collect the unique movie ids (in order of first appearance) and
# build two lookups: movie_id -> index and index -> movie_id.
#
####################################################################

movie_ids = ratings['movie_id'].unique().tolist()
movie2movie_encoded = dict(zip(movie_ids, range(len(movie_ids))))
movieencoded2movie = dict(enumerate(movie_ids))

In [20]:
# Show the first ten entries of both lookups side by side:
# "<movie_id> :  <index>" followed by "<index> :  <movie_id>".
for index, movie_id in enumerate(movie_ids[:10]):
  print(f"{movie_id} :  {index}    {index} :  {movie_id}")

242 :  0    0 :  242
302 :  1    1 :  302
377 :  2    2 :  377
51 :  3    3 :  51
346 :  4    4 :  346
474 :  5    5 :  474
265 :  6    6 :  265
465 :  7    7 :  465
451 :  8    8 :  451
86 :  9    9 :  86


In [22]:
####################################################################
#
# First ten ratings before user_id and movie_id are mapped to their
# integer encodings
#
####################################################################

ratings.head(10)

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
5,298,474,4,884182806
6,115,265,2,881171488
7,253,465,5,891628467
8,305,451,3,886324817
9,6,86,3,883603013


In [25]:
####################################################################
#
# Map user_id and movie_id through the lookups built earlier and
# store the results in the new 'user' and 'movie' columns. Also
# record the number of unique users and movies, and the min and max
# rating, which are needed to normalise the rating column.
#
####################################################################

n_users = len(user2user_encoded)
n_movies = len(movieencoded2movie)

ratings['user'] = ratings['user_id'].map(user2user_encoded)
ratings['movie'] = ratings['movie_id'].map(movie2movie_encoded)

# float32 copy of 'rating', stored under the plural column name 'ratings'.
ratings['ratings'] = ratings['rating'].astype(np.float32)

min_rating = min(ratings['rating'])
max_rating = max(ratings['rating'])


In [27]:
####################################################################
#
# First ten ratings after user_id and movie_id have been mapped to
# their integer encodings
#
####################################################################

ratings.head(10)

Unnamed: 0,user_id,movie_id,rating,timestamp,user,movie,ratings
0,196,242,3,881250949,0,0,3.0
1,186,302,3,891717742,1,1,3.0
2,22,377,1,878887116,2,2,1.0
3,244,51,2,880606923,3,3,2.0
4,166,346,1,886397596,4,4,1.0
5,298,474,4,884182806,5,5,4.0
6,115,265,2,881171488,6,6,2.0
7,253,465,5,891628467,7,7,5.0
8,305,451,3,886324817,8,8,3.0
9,6,86,3,883603013,9,9,3.0


In [28]:
####################################################################
#
# Shuffle the data, build the model inputs (encoded user and movie
# indices) and the labels, then split both into a training part
# (first 90% of the shuffled rows) and a validation part (the rest).
#
####################################################################

# Fixed random_state so the shuffle, and hence the split, is reproducible.
ratings = ratings.sample(frac=1,random_state=42)

x = ratings[['user','movie']].values

# Min-max scale the ratings with vectorised column arithmetic instead of a
# per-row Python lambda; min_rating / max_rating come from the same column.
y = ((ratings['rating'] - min_rating) / (max_rating - min_rating)).values

train_idx = int(0.9*ratings.shape[0])
x_train, x_val = x[:train_idx], x[train_idx:]
y_train, y_val = y[:train_idx], y[train_idx:]

In [29]:
####################################################################
#
# Shapes of the training and validation inputs, followed by the
# number of unique users and movies.
#
####################################################################

x_train.shape,x_val.shape,n_users,n_movies

((90000, 2), (10000, 2), 943, 1682)

In [None]:
####################################################################
#
# Set the embedding size, i.e. the length of the latent vector
# learned for every user and every movie, then create the model.
#
####################################################################

EMBEDDING_SIZE = 75

class RecommenderNet(tf.keras.Model):
  """Matrix-factorisation recommender with per-user and per-movie biases.

  Every user and every movie gets an ``embedding_size``-dimensional vector
  plus a scalar bias. The score of a (user, movie) pair is
  ``sigmoid(dot(user_vector, movie_vector) + user_bias + movie_bias)``.

  Args:
    num_users: Number of rows in the user embedding tables.
    num_movies: Number of rows in the movie embedding tables.
    embedding_size: Length of each user / movie embedding vector.
    **kwargs: Forwarded to ``tf.keras.Model.__init__``.
  """

  def __init__(self,num_users,num_movies,embedding_size,**kwargs):
    super().__init__(**kwargs)
    self.num_users = num_users
    self.num_movies = num_movies
    self.embedding_size = embedding_size
    self.user_embedding = tf.keras.layers.Embedding(
        num_users,
        embedding_size,
        embeddings_initializer="he_normal",
        embeddings_regularizer=tf.keras.regularizers.l2(1e-6)
    )
    # One scalar bias per user.
    self.user_bias = tf.keras.layers.Embedding(num_users,1)
    self.movie_embedding = tf.keras.layers.Embedding(
        num_movies,
        embedding_size,
        embeddings_initializer="he_normal",
        embeddings_regularizer=tf.keras.regularizers.l2(1e-6)
    )
    # One scalar bias per movie.
    self.movie_bias = tf.keras.layers.Embedding(num_movies,1)

  def call(self,inputs):
    """Score each (user, movie) pair in the batch.

    Args:
      inputs: Integer tensor indexed as ``inputs[:, 0]`` (encoded user) and
        ``inputs[:, 1]`` (encoded movie).

    Returns:
      Tensor of shape ``(batch, 1)`` with the sigmoid of the dot product of
      the user and movie vectors plus both biases.
    """
    user_vector = self.user_embedding(inputs[:,0])
    user_bias = self.user_bias(inputs[:,0])
    movie_vector = self.movie_embedding(inputs[:,1])
    movie_bias = self.movie_bias(inputs[:,1])

    # Row-wise dot product: one value per example, shape (batch, 1).
    # tf.tensordot(user_vector, movie_vector, 2) would contract the batch axis
    # as well as the embedding axis, producing a single scalar that is shared
    # by every example in the batch.
    dot_user_movie = tf.reduce_sum(user_vector * movie_vector, axis=1, keepdims=True)

    x = dot_user_movie + user_bias + movie_bias

    return tf.nn.sigmoid(x)

model = RecommenderNet(n_users, n_movies, EMBEDDING_SIZE)

# `learning_rate` is the supported keyword for the optimizer; the older `lr`
# alias is deprecated and is rejected by newer Keras releases.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001)
)

In [None]:
####################################################################
#
# With everything defined above, train the model on the training
# split and evaluate it on the validation split after every epoch.
#
####################################################################
model.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=10,
    validation_data=(x_val, y_val),
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fb1fb5289b0>

In [None]:
# Print the model's layers and their parameter counts.
model.summary()

Model: "recommender_net_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_28 (Embedding)     multiple                  70725     
_________________________________________________________________
embedding_29 (Embedding)     multiple                  943       
_________________________________________________________________
embedding_30 (Embedding)     multiple                  126150    
_________________________________________________________________
embedding_31 (Embedding)     multiple                  1682      
Total params: 199,500
Trainable params: 199,500
Non-trainable params: 0
_________________________________________________________________


In [30]:
####################################################################
#
#                             REFERENCES
#
# Thanks to respective articles and tutorials:
# 1) https://keras.io/examples/structured_data/collaborative_filtering_movielens/
# 2) https://developers.google.com/machine-learning/recommendation
# 
#
####################################################################