# Content-Based Filtering based on Deep Learning: Movie Recommender System


In [1]:
# Libraries
import numpy as np
import pandas as pd
import tensorflow as tf 
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

## Where's the data from?
As in the original lab, the dataset is [MovieLens ml-latest-small](https://grouplens.org/datasets/movielens/latest/). 
As quoted: 
>This dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from MovieLens, a movie recommendation service. It contains 100836 ratings and 3683 tag applications across 9742 movies. These data were created by 610 users between March 29, 1996 and September 24, 2018. This dataset was generated on September 26, 2018.

For more info, check **./MovieLens/README.txt**

In [2]:
# Load the raw MovieLens tables: movie metadata and the individual user ratings.
movies = pd.read_csv('./MovieLens/movies.csv')
ratings = pd.read_csv('./MovieLens/ratings.csv')

# Release year = first four-digit number found in parentheses in the title,
# e.g. "Toy Story (1995)" -> 1995. expand=False yields a Series for the single capture group.
# Titles without such a group would give NaN, and a NaN in the year field destroys the
# neural network, so they fall back to DEFAULT_YEAR.
# The column is stored as int (not str) so it is a genuine numeric feature for the scaler.
DEFAULT_YEAR = 2000
movies['year'] = (
    pd.to_numeric(movies['title'].str.extract(r'\((\d{4})\)', expand=False))
    .fillna(DEFAULT_YEAR)
    .astype(int)
)

print(f"Movies dataframe shape: {movies.shape} Ratings shape: {ratings.shape}")

Movies dataframe shape: (9742, 4) Ratings shape: (100836, 4)


In [3]:
# Attach every rating to its movie row. The left join keeps movies that have no ratings.
merged_df = movies.merge(ratings, how='left', on='movieId')

# Per-title summary built with named aggregation: how many ratings the title received,
# their mean, and the (first) release year seen for that title.
movie_stats = merged_df.groupby('title').agg(
    rating_count=('rating', 'count'),
    average_rating=('rating', 'mean'),
    year=('year', 'first'),
)

# The five most-rated titles.
movie_stats.reset_index().sort_values(by='rating_count', ascending=False).head(5)

Unnamed: 0,title,rating_count,average_rating,year
3164,Forrest Gump (1994),329,4.164134,1994
7609,"Shawshank Redemption, The (1994)",317,4.429022,1994
6878,Pulp Fiction (1994),307,4.197068,1994
7696,"Silence of the Lambs, The (1991)",279,4.16129,1991
5521,"Matrix, The (1999)",278,4.192446,1999


## Which features do we have?
**Movie features:** so far, release year, one-hot encoded genres and average rating.<br>
TODO: Think of new interesting features! Duration, country, budget... We also have tags...<br>
**User features:** so far, the user's average rating per genre. <br>
TODO: Add rating count and rating average, per-country average and so on. There's a lot of feature engineering to be done here... <br>

## How will we structure the model training?
We will use a data structure based on three dataframes, one containing information about the users, one containing information about the movies and one containing the ratings. <br>
For example, if entry #99 of each dataframe is: <br>
|Index  |Movie  |User   |Y |
| ---   |   --- | ---   |---|
|99     |Shrek 2|#793   |3.5|

That would mean that User 793 rated 'Shrek 2' 3.5 stars.


In [4]:
# Build the per-rating movie table: one row for every entry of `ratings`, in the same order,
# holding the rated movie's metadata plus that movie's mean rating over all of its ratings.
# Vectorised: the mean is computed once per movie and the metadata is joined in a single
# merge, instead of re-filtering `movies` and `ratings` for every single rating.

# Mean rating of each movie, broadcast back onto every rating row (same index as `ratings`).
movie_mean_rating = ratings.groupby('movieId')['rating'].transform('mean')

# One metadata row per movieId (first occurrence wins) so the join cannot duplicate ratings.
movie_info = movies[['movieId', 'title', 'year', 'genres']].drop_duplicates(subset='movieId')

# Better safe than sorry: ignore ratings that point at a movie missing from `movies`.
has_movie = ratings['movieId'].isin(movie_info['movieId'])

# A left merge keeps the rows in the order of the left frame, i.e. the order of `ratings`,
# and returns a fresh 0..n-1 index.
movies_df = (
    ratings.loc[has_movie, ['movieId']]
    .assign(average_rating=movie_mean_rating)  # aligned on the ratings index
    .merge(movie_info, on='movieId', how='left')
    [['movieId', 'title', 'year', 'genres', 'average_rating']]
)

# Later cells pair this table with `ratings` row by row (positional concat / split),
# so a dropped rating would silently shift every following row.
assert len(movies_df) == len(ratings), (
    f"{len(ratings) - len(movies_df)} ratings reference a movieId missing from movies.csv; "
    "the movie, user and target tables would no longer be row-aligned"
)

movies_df.head(10)

Unnamed: 0,movieId,title,year,genres,average_rating
0,1,Toy Story (1995),1995,Adventure|Animation|Children|Comedy|Fantasy,3.92093
1,3,Grumpier Old Men (1995),1995,Comedy|Romance,3.259615
2,6,Heat (1995),1995,Action|Crime|Thriller,3.946078
3,47,Seven (a.k.a. Se7en) (1995),1995,Mystery|Thriller,3.975369
4,50,"Usual Suspects, The (1995)",1995,Crime|Mystery|Thriller,4.237745
5,70,From Dusk Till Dawn (1996),1996,Action|Comedy|Horror|Thriller,3.509091
6,101,Bottle Rocket (1996),1996,Adventure|Comedy|Crime|Romance,3.782609
7,110,Braveheart (1995),1995,Action|Drama|War,4.031646
8,151,Rob Roy (1995),1995,Action|Drama|Romance|War,3.545455
9,157,Canadian Bacon (1995),1995,Comedy|War,2.863636


In [5]:
# Size check: movies_df has one row per rating (not per distinct movie) and five columns.
movies_df.shape

(100836, 5)

Keep in mind that each entry here corresponds to a rating, not to a movie! There are over 100,000 entries in this dataframe.

In [6]:
# Split the genres for one-hot encoding and add them to the dataframe
genres_one_hot = movies_df['genres'].str.get_dummies(sep='|')
movies_df_onehot = pd.concat([movies_df, genres_one_hot], axis=1)

# Drop the original 'genres' column, as well as '(no genres listed)', which will just be 0 in every one-hot column
movies_df_onehot.drop(columns=['genres', '(no genres listed)'], inplace=True)

movies_df_onehot.head(5)

Unnamed: 0,movieId,title,year,average_rating,Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),1995,3.92093,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,3,Grumpier Old Men (1995),1995,3.259615,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
2,6,Heat (1995),1995,3.946078,1,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,47,Seven (a.k.a. Se7en) (1995),1995,3.975369,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,50,"Usual Suspects, The (1995)",1995,4.237745,0,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0


Now the movies dataframe is ready for the models. Let's create the users dataframe now.

In [7]:
# < -- Legacy version, wildly inefficient. Don't use.
# users_data = []
# merged_df = pd.concat([ratings[['userId', 'rating']], movies_df_onehot], axis=1) # Put together the two dataframes

# # Iterate over each review
# for index, rating in merged_df.iterrows():
#     # For each review, get the user that posted it
#     user_id = rating['userId']
#     genre_ratings = {}
#     # Iterate over the genres
#     for genre in movies_df_onehot.columns[4:]:
#         # Compute the average rating by the user of all entries within each genre
#         mask = (merged_df['userId'] == user_id) & (merged_df[genre] == 1) 
#         genre_avg_rating = merged_df.loc[mask, 'rating'].mean()
#         genre_ratings[genre + '_rating'] = genre_avg_rating

#     # Add the data to the user dataframe
#     users_data.append({'userId': user_id, **genre_ratings})

# users_df = pd.DataFrame(users_data)
# # When a user has not reviewed any movies of a specific genre, set 0 as the mean rating.
# users_df.fillna(0, inplace=True)

# users_df.head(10)


In [8]:
# Optimized replacement for the legacy cell above: compute each user's mean rating per genre
# once, then pick the appropriate per-user row for every rating
# (the author reports roughly a 300x speed-up over the legacy version).
# NOTE(review): the averages are computed over every rating, including rows that later end up
# in the cv/test splits. Confirm that this leakage into the user features is acceptable.

# Side-by-side view of each rating and the one-hot features of the rated movie.
# The two frames are matched on their row index.
merged_df = pd.concat([ratings[['userId', 'rating']], movies_df_onehot], axis=1)

# The first four columns of movies_df_onehot are movieId, title, year and average_rating;
# every column after them is a genre indicator.
genre_columns = movies_df_onehot.columns[4:]


def _mean_rating_per_genre(user_ratings):
    """Map '<genre>_rating' to the mean rating over the rows flagged with that genre.

    `user_ratings` holds the rows of `merged_df` that belong to one user. A genre the
    user never rated yields NaN (mean of an empty selection).
    """
    return {
        genre + '_rating': user_ratings.loc[user_ratings[genre] == 1, 'rating'].mean()
        for genre in genre_columns
    }


# userId -> {genre feature name -> mean rating}, one entry per user (groupby yields one group per user).
user_genre_ratings = {
    user_id: _mean_rating_per_genre(user_ratings)
    for user_id, user_ratings in merged_df.groupby('userId')
}

# The dict-of-dicts constructor puts users in columns, so transpose to get one row per user,
# turn the index into a 'userId' column, and use 0 for genres a user never rated.
user_ratings_df = (
    pd.DataFrame(user_genre_ratings)
    .T
    .rename_axis('userId')
    .reset_index()
    .fillna(0)
)

# Broadcast the per-user features onto every rating, keeping the row order of `ratings`.
users_df = ratings[['userId']].merge(user_ratings_df, on='userId', how='left')

users_df


Unnamed: 0,userId,Action_rating,Adventure_rating,Animation_rating,Children_rating,Comedy_rating,Crime_rating,Documentary_rating,Drama_rating,Fantasy_rating,Film-Noir_rating,Horror_rating,IMAX_rating,Musical_rating,Mystery_rating,Romance_rating,Sci-Fi_rating,Thriller_rating,War_rating,Western_rating
0,1,4.322222,4.388235,4.689655,4.547619,4.277108,4.355556,0.0,4.529412,4.297872,5.00,3.470588,0.000000,4.681818,4.166667,4.307692,4.225000,4.145455,4.500000,4.285714
1,1,4.322222,4.388235,4.689655,4.547619,4.277108,4.355556,0.0,4.529412,4.297872,5.00,3.470588,0.000000,4.681818,4.166667,4.307692,4.225000,4.145455,4.500000,4.285714
2,1,4.322222,4.388235,4.689655,4.547619,4.277108,4.355556,0.0,4.529412,4.297872,5.00,3.470588,0.000000,4.681818,4.166667,4.307692,4.225000,4.145455,4.500000,4.285714
3,1,4.322222,4.388235,4.689655,4.547619,4.277108,4.355556,0.0,4.529412,4.297872,5.00,3.470588,0.000000,4.681818,4.166667,4.307692,4.225000,4.145455,4.500000,4.285714
4,1,4.322222,4.388235,4.689655,4.547619,4.277108,4.355556,0.0,4.529412,4.297872,5.00,3.470588,0.000000,4.681818,4.166667,4.307692,4.225000,4.145455,4.500000,4.285714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,610,3.600580,3.705993,3.901515,3.651786,3.731144,3.800366,4.2,3.874739,3.592715,4.35,3.506601,3.628049,3.928571,3.766667,3.731092,3.659363,3.573529,3.776596,3.742424
100832,610,3.600580,3.705993,3.901515,3.651786,3.731144,3.800366,4.2,3.874739,3.592715,4.35,3.506601,3.628049,3.928571,3.766667,3.731092,3.659363,3.573529,3.776596,3.742424
100833,610,3.600580,3.705993,3.901515,3.651786,3.731144,3.800366,4.2,3.874739,3.592715,4.35,3.506601,3.628049,3.928571,3.766667,3.731092,3.659363,3.573529,3.776596,3.742424
100834,610,3.600580,3.705993,3.901515,3.651786,3.731144,3.800366,4.2,3.874739,3.592715,4.35,3.506601,3.628049,3.928571,3.766667,3.731092,3.659363,3.573529,3.776596,3.742424


Keep in mind that each entry here also corresponds to one rating! Now let's create the easiest dataframe, Y, containing the rating for each movie-user pair.

In [9]:
# Target vector: the rating of every row of `ratings`, copied into a standalone NumPy array.
y = ratings['rating'].to_numpy(copy=True)
y

array([4., 4., 4., ..., 5., 5., 3.])

Putting everything together, we can check that the three structures line up. Choose an index below and compare the printed sentence with the raw ratings entry.

In [10]:
index = 47005 # <--- Choose an index here

# Pick the column first and the row second: a whole-row lookup (`.iloc[index]`) on an
# all-numeric frame returns a float row, which is why the user used to print as "307.0".
movie = movies_df_onehot['title'].iloc[index]
movie_id = movies_df_onehot['movieId'].iloc[index]
user_id = users_df['userId'].iloc[index]
rating = y[index]

# The movie table, the user table and the target must all describe the same ratings.csv row.
# Assert it instead of relying on eyeballing the two printouts below.
assert movie_id == ratings['movieId'].iloc[index], "movies_df_onehot is not row-aligned with ratings"
assert user_id == ratings['userId'].iloc[index], "users_df is not row-aligned with ratings"
assert rating == ratings['rating'].iloc[index], "y is not row-aligned with ratings"

print(f"User {user_id} rated {movie} (with ID {movie_id}) a {rating}\n")

print(f"Ratings.csv entry: \n{ratings.iloc[index]}")



User 307.0 rated Can't Hardly Wait (1998) (with ID 1895) a 2.0

Ratings.csv entry: 
userId       3.070000e+02
movieId      1.895000e+03
rating       2.000000e+00
timestamp    1.186088e+09
Name: 47005, dtype: float64


If the dataframes hold the same info as the ratings.csv entry, it is working!
## Preprocessing for the model

In [11]:
# Columns fed to the networks: everything except the identifier/label columns.
movie_features = movies_df_onehot.drop(columns=['movieId', 'title'], errors='ignore').columns.tolist()
user_features = users_df.drop(columns=['userId'], errors='ignore').columns.tolist()

print(f"We'll use {movie_features} for movies \nand {user_features} for users")

We'll use ['year', 'average_rating', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'] for movies 
and ['Action_rating', 'Adventure_rating', 'Animation_rating', 'Children_rating', 'Comedy_rating', 'Crime_rating', 'Documentary_rating', 'Drama_rating', 'Fantasy_rating', 'Film-Noir_rating', 'Horror_rating', 'IMAX_rating', 'Musical_rating', 'Mystery_rating', 'Romance_rating', 'Sci-Fi_rating', 'Thriller_rating', 'War_rating', 'Western_rating'] for users


In [12]:
# Train / cross-validation / test partitions. The movie features, user features and
# targets are split together so that row i of each still describes the same rating.
SPLIT_SEED = 47005

# Step 1: 70% for training, 30% held out.
(movie_train, movie_,
 user_train, user_,
 y_train, y_) = train_test_split(
    movies_df_onehot[movie_features],
    users_df[user_features],
    y,
    test_size=.3,
    random_state=SPLIT_SEED,
)

# Step 2: cut the held-out part in half -> cross-validation and test.
(movie_cv, movie_test,
 user_cv, user_test,
 y_cv, y_test) = train_test_split(
    movie_,
    user_,
    y_,
    test_size=.5,
    random_state=SPLIT_SEED,
)

In [13]:
# Scale the data. For now, I'm going with the scalings used in the lab
# (standardised features, target min-max scaled to [-1, 1]); other scalings are worth trying.

# Keep handles on the unscaled partitions: the names below are re-bound to new, scaled
# arrays further down, while these keep pointing at the original objects.
movie_train_unscaled, user_train_unscaled, y_train_unscaled = movie_train, user_train, y_train
movie_cv_unscaled, user_cv_unscaled, y_cv_unscaled = movie_cv, user_cv, y_cv
movie_test_unscaled, user_test_unscaled, y_test_unscaled = movie_test, user_test, y_test

# The cv user features used to be saved under the misspelled name `user_v_unscaled`;
# the old name is kept as an alias so any code that relies on it keeps working.
user_v_unscaled = user_cv_unscaled

# Every scaler is fitted on the training partition only and then applied unchanged to the
# cv and test partitions, so no statistics from held-out rows enter the scaling itself.
scalerMovie = StandardScaler()
movie_train = scalerMovie.fit_transform(movie_train)
movie_cv = scalerMovie.transform(movie_cv)
movie_test = scalerMovie.transform(movie_test)

scalerUser = StandardScaler()
user_train = scalerUser.fit_transform(user_train)
user_cv = scalerUser.transform(user_cv)
user_test = scalerUser.transform(user_test)

# Scalers expect 2-D input, hence the reshape of the 1-D target to a single column.
scalerTarget = MinMaxScaler((-1, 1))
y_train = scalerTarget.fit_transform(y_train.reshape(-1, 1))
y_cv = scalerTarget.transform(y_cv.reshape(-1, 1))
y_test = scalerTarget.transform(y_test.reshape(-1, 1))

## Neural Network

The neural network architecture will have two networks combined by a dot product.

In [14]:
# Seed the Python, NumPy and TensorFlow random generators before any layer is created so
# that weight initialisation (and therefore training) is repeatable across runs.
MODEL_SEED = 47005
tf.keras.utils.set_random_seed(MODEL_SEED)

num_outputs = 32  # size of the embedding produced by each network


def build_tower(embedding_size):
    """Return a fresh MLP: Dense(256, relu) -> Dense(128, relu) -> linear Dense(embedding_size).

    The input width is not fixed here; it is inferred when the network is first called.
    """
    return tf.keras.models.Sequential([
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(embedding_size)
    ])


# Two independent networks with the same architecture: one embeds users, one embeds movies.
# The user network is created first, which keeps the auto-generated layer names as before.
user_network = build_tower(num_outputs)
movie_network = build_tower(num_outputs)

In [15]:
# Two-tower model: each tower embeds its input, both embeddings are L2-normalised, and the
# prediction is their dot product (the scaled target lives in the same [-1, 1] range).

# `shape` must be a tuple. `(n)` is just the integer n, so the trailing comma is required.
input_user = tf.keras.layers.Input(shape=(len(user_features),))
vu = user_network(input_user)
vu = tf.linalg.l2_normalize(vu, axis=1)  # unit-length user embedding

input_movie = tf.keras.layers.Input(shape=(len(movie_features),))
vm = movie_network(input_movie)
vm = tf.linalg.l2_normalize(vm, axis=1)  # unit-length movie embedding

# Row-wise dot product of the two embeddings -> one predicted (scaled) rating per pair.
output = tf.keras.layers.Dot(axes=1)([vu, vm])

# Input order is [user, movie]; fit/evaluate must pass their arrays in the same order.
model = tf.keras.Model([input_user, input_movie], output)

model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 19)]                 0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 21)]                 0         []                            
                                                                                                  
 sequential (Sequential)     (None, 32)                   42144     ['input_1[0][0]']             
                                                                                                  
 sequential_1 (Sequential)   (None, 32)                   42656     ['input_2[0][0]']             
                                                                                              

For reference ;)
> A latent representation, in the context of machine learning and neural networks, refers to a compressed, lower-dimensional representation of data that captures the most important features or characteristics of the input data.

In [20]:
# Adam from the legacy optimizer namespace (the author is running on an M1),
# minimising the mean squared error between predicted and scaled ratings.
opt = tf.keras.optimizers.legacy.Adam(learning_rate=0.01)
cost_fn = tf.keras.losses.MeanSquaredError()
model.compile(loss=cost_fn, optimizer=opt)

In [21]:
# A NaN in the inputs destroys the neural network, so verify every array handed to
# model.fit (movie features, user features and targets), not just the movie features.
for array_name, array in (
    ("movie_train", movie_train),
    ("user_train", user_train),
    ("y_train", y_train),
):
    assert not np.isnan(array).any(), f"{array_name} contains NaN values"

In [22]:
# Train for 20 epochs, reporting the loss on the cross-validation split after each epoch.
# Inputs are passed in the model's [user, movie] order.
model.fit(
    x=[user_train, movie_train],
    y=y_train,
    epochs=20,
    validation_data=([user_cv, movie_cv], y_cv),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x16b610ad0>

In [19]:
# Loss on the held-out test split, measured on the scaled target.
# NOTE(review): the saved execution count of this cell (19) is lower than that of the
# compile/fit cells (20-22), so the stored output may predate the last training run.
# Re-run after fitting to refresh it.
model.evaluate(x=[user_test, movie_test], y=y_test)



0.12019629031419754

# Similar movies: computing a matrix of distances

In [27]:
def sq_dist(a, b):
    """Return the squared Euclidean distance between `a` and `b`.

    Args:
        a: NumPy array (or anything supporting `a - b`).
        b: NumPy array compatible with `a` for element-wise subtraction.

    Returns:
        The sum of the squared element-wise differences.
    """
    return np.sum(np.square(a - b))

In [28]:
# Stand-alone movie-embedding model: it reuses the movie network (and its weights) from the
# rating model and outputs the L2-normalised embedding of a movie feature vector.

# `shape` must be a tuple. `(n)` is just the integer n, so the trailing comma is required.
input_movie_m = tf.keras.layers.Input(shape=(len(movie_features),))
vm_m = movie_network(input_movie_m)
vm_m = tf.linalg.l2_normalize(vm_m, axis=1)  # unit-length embedding
model_m = tf.keras.Model(input_movie_m, vm_m)
model_m.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 21)]              0         
                                                                 
 sequential_1 (Sequential)   (None, 32)                42656     
                                                                 
 tf.math.l2_normalize_2 (TF  (None, 32)                0         
 OpLambda)                                                       
                                                                 
Total params: 42656 (166.62 KB)
Trainable params: 42656 (166.62 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


_Based on the Machine Learning Specialization lab. Thanks to Stanford Online, Coursera, and DeepLearningAI. It's been really fun!_