# Import Libraries

In [1]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from keras.models import Model
from keras.layers import Embedding, Input, Flatten, Dot, Dense, Dropout, Concatenate
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
import re
from keras.models import Model, model_from_json
from google.colab import files

# Load Datasets

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
movies = pd.read_csv('/content/drive/MyDrive/Recommender Dataset/Project/ml-latest-small/movies.csv')
ratings = pd.read_csv('/content/drive/MyDrive/Recommender Dataset/Project/ml-latest-small/ratings.csv')

# Data Preprocessing

In [5]:
print(movies[movies['title'].str.contains("The")])

      movieId                                              title  \
10         11                     American President, The (1995)   
26         27                                Now and Then (1995)   
28         29  City of Lost Children, The (Cité des enfants p...   
46         50                         Usual Suspects, The (1995)   
49         54                              Big Green, The (1995)   
...       ...                                                ...   
9717   188833              The Man Who Killed Don Quixote (2018)   
9724   190183                           The Darkest Minds (2018)   
9732   193565                          Gintama: The Movie (2010)   
9733   193567  anohana: The Flower We Saw That Day - The Movi...   
9735   193573            Love Live! The School Idol Movie (2015)   

                                      genres  
10                      Comedy|Drama|Romance  
26                            Children|Drama  
28    Adventure|Drama|Fantasy|Mystery|Sci-

In [6]:
def correct_movie_titles(movies):
    def correct_title(title):
        # Regex to handle titles ending with ", The" followed by any characters
        new_title = re.sub(r'^(.*), The (.*)$', r'The \1 \2', title)
        return new_title

    movies['title'] = movies['title'].apply(correct_title)
    return movies

In [7]:
movies = correct_movie_titles(movies)

In [8]:
# Display a few samples to check if the titles are corrected
print(movies[movies['title'].str.contains("The")])

      movieId                                              title  \
10         11                      The American President (1995)   
26         27                                Now and Then (1995)   
28         29  The City of Lost Children (Cité des enfants pe...   
46         50                          The Usual Suspects (1995)   
49         54                               The Big Green (1995)   
...       ...                                                ...   
9717   188833              The Man Who Killed Don Quixote (2018)   
9724   190183                           The Darkest Minds (2018)   
9732   193565                          Gintama: The Movie (2010)   
9733   193567  anohana: The Flower We Saw That Day - The Movi...   
9735   193573            Love Live! The School Idol Movie (2015)   

                                      genres  
10                      Comedy|Drama|Romance  
26                            Children|Drama  
28    Adventure|Drama|Fantasy|Mystery|Sci-

In [9]:
has_duplicates = movies.duplicated().any()
has_duplicates

False

In [10]:
# Get the maximum user and movie IDs
max_userid = ratings['userId'].drop_duplicates().max()
max_movieid = ratings['movieId'].drop_duplicates().max()

In [11]:
# Shuffle the data to ensure randomness
shuffled_ratings = ratings.sample(frac=1., random_state=42)

In [12]:
# Extract users, movies, and ratings arrays
Users = shuffled_ratings['userId'].values
Movies = shuffled_ratings['movieId'].values
Ratings = shuffled_ratings['rating'].values

# Model

In [13]:
# Define constants
K_FACTORS = 100  # The number of latent factors for embeddings
TEST_USER = 1

In [14]:
user_input = Input(shape=(1,), name='user_input')
item_input = Input(shape=(1,), name='item_input')

In [15]:
# Define embeddings
user_embedding = Embedding(input_dim=max_userid+1, output_dim=K_FACTORS, input_length=1, name='user_embedding')(user_input)
item_embedding = Embedding(input_dim=max_movieid+1, output_dim=K_FACTORS, input_length=1, name='item_embedding')(item_input)

In [16]:
# Flatten embeddings
user_vec = Flatten(name='flatten_users')(user_embedding)
item_vec = Flatten(name='flatten_items')(item_embedding)

In [17]:
# Compute dot product of user and item vectors
y = Dot(axes=1, name='dot_product')([user_vec, item_vec])

In [26]:
# Define the model
cf_model = Model(inputs=[user_input, item_input], outputs=y)

In [27]:
# Compile the model using MSE as the loss function and the Adam optimizer
cf_model.compile(optimizer='adam', loss='mse')

In [20]:
# Callbacks for early stopping and saving the best model
callbacks = [
    EarlyStopping(patience=2, monitor='val_loss'),
    ModelCheckpoint('best_cf_model.h5', save_best_only=True, monitor='val_loss', mode='min')
]

In [28]:
# Train the model
cf_model.fit([Users, Movies], Ratings, epochs=5, batch_size=64,
          validation_split=0.2, callbacks=callbacks)

Epoch 1/5

  saving_api.save_model(


Epoch 2/5
Epoch 3/5
Epoch 4/5


<keras.src.callbacks.History at 0x7fad8c1e3280>

In [22]:
# Save the model in HDF5 format after training
cf_model.save('cf_model_final.h5')

Model saved in HDF5 format.


In [23]:
model_json = cf_model.to_json()

# Write the model architecture to a JSON file
with open('cf_model_architecture.json', 'w') as json_file:
    json_file.write(model_json)

Model architecture saved to JSON.


In [None]:
# # Define the custom model
# class CFModel(Model):
#     def __init__(self, num_users, num_items, embedding_size, **kwargs):
#         super(CFModel, self).__init__(**kwargs)
#         self.num_users = num_users
#         self.num_items = num_items
#         self.embedding_size = embedding_size

#         user_input = Input(shape=(1,))
#         user_embedding = Embedding(num_users, embedding_size, input_length=1)(user_input)
#         user_vec = Flatten()(user_embedding)

#         item_input = Input(shape=(1,))
#         item_embedding = Embedding(num_items, embedding_size, input_length=1)(item_input)
#         item_vec = Flatten()(item_embedding)

#         dot = Dot(axes=1)([user_vec, item_vec])
#         self.model = Model(inputs=[user_input, item_input], outputs=dot)

#     def call(self, inputs):
#         return self.model(inputs)

#     def get_config(self):
#         return {
#             'num_users': self.num_users,
#             'num_items': self.num_items,
#             'embedding_size': self.embedding_size
#         }

In [None]:
# # Instantiate and compile the model
# max_userid = ratings['userId'].drop_duplicates().max()
# max_movieid = ratings['movieId'].drop_duplicates().max()
# cf_model = CFModel(num_users=max_userid+1, num_items=max_movieid+1, embedding_size=K_FACTORS)

In [None]:
# # Compile the model using MSE as the loss function and the Adam optimizer
# cf_model.compile(loss='mse', optimizer='adam')

In [None]:
# # Callbacks
# callbacks = [
#     EarlyStopping(patience=2),
#     ModelCheckpoint('cfModel_weights', save_best_only=True, save_format='tf')
# ]

In [None]:
# # Train the model
# cf_model.fit([Users, Movies], Ratings, epochs=5, batch_size=64,
#              validation_split=0.2, callbacks=callbacks)

Epoch 1/5
Epoch 2/5
Epoch 3/5


<keras.src.callbacks.History at 0x7f6f98289300>

In [None]:
# cf_model.save('cf_model_final.tf')

# # Save the architecture of the model to a JSON file
# model_json = cf_model.to_json()
# with open("cf_model_final.json", "w") as json_file:
#     json_file.write(model_json)
# print("Model architecture saved to JSON.")

In [24]:
# Function to get top-N recommendations for a user
def get_user_based_recommendations(user_id, num_recommendations=10):
    all_movie_ids = np.array([movie_id for movie_id in range(1, max_movieid+1)])
    predicted_ratings = cf_model.predict([np.array([user_id] * len(all_movie_ids)), all_movie_ids])
    top_indices = np.argsort(predicted_ratings.flatten())[::-1][:num_recommendations]
    recommended_movie_ids = all_movie_ids[top_indices]
    recommended_movies = movies[movies['movieId'].isin(recommended_movie_ids)]
    return recommended_movies

In [29]:
# For test
user_id = 3
recommendations = get_user_based_recommendations(user_id, num_recommendations=10)
print(recommendations)

      movieId                                              title  \
1190     1587                         Conan the Barbarian (1982)   
2280     3024                                     Piranha (1978)   
2765     3703                The Road Warrior (Mad Max 2) (1981)   
3734     5181                                   Hangar 18 (1980)   
4045     5746                    Galaxy of Terror (Quest) (1981)   
4050     5764                                      Looker (1981)   
4122     5919                                     Android (1982)   
5052     7899  Master of the Flying Guillotine (Du bi quan wa...   
5504    26409                           The Clonus Horror (1979)   
7114    70946                                     Troll 2 (1990)   

                                genres  
1190          Action|Adventure|Fantasy  
2280                     Horror|Sci-Fi  
2765  Action|Adventure|Sci-Fi|Thriller  
3734            Action|Sci-Fi|Thriller  
4045      Action|Horror|Mystery|Sci-Fi  
4050 

# Evaluation

In [30]:
# Split the data into training and testing sets
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

# Prepare the test data for evaluation
test_users = test_data['userId'].values
test_movies = test_data['movieId'].values
test_ratings = test_data['rating'].values

In [31]:
# Predict ratings for the test set
predicted_test_ratings = cf_model.predict([test_users, test_movies])

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(test_ratings, predicted_test_ratings))
rmse



0.4302523436503343

# Deeper Model

In [32]:
# Instantiate the deeper model
num_users = max_userid + 1
num_items = max_movieid + 1

In [None]:
# Define constants
K_FACTORS = 100

In [None]:
# Define the model
user_input = Input(shape=(1,), name='user_input')
user_embedding = Embedding(num_users, K_FACTORS, input_length=1, name='user_embedding')(user_input)
user_vec = Flatten(name='user_flatten')(user_embedding)
item_input = Input(shape=(1,), name='item_input')
item_embedding = Embedding(num_items, K_FACTORS, input_length=1, name='item_embedding')(item_input)
item_vec = Flatten(name='item_flatten')(item_embedding)
concat = Concatenate()([user_vec, item_vec])
dense = Dense(128, activation='relu')(concat)
dropout = Dropout(0.5)(dense)
dense = Dense(64, activation='relu')(dropout)
dropout = Dropout(0.5)(dense)
dense = Dense(32, activation='relu')(dropout)
dropout = Dropout(0.5)(dense)
output = Dense(1, name='output')(dropout)

deep_cf_model = Model(inputs=[user_input, item_input], outputs=output)
deep_cf_model.compile(optimizer='adam', loss='mse')

In [33]:
# Train the model
deep_cf_model.fit([Users, Movies], Ratings, epochs=10, batch_size=64, validation_split=0.2,
                  callbacks=[EarlyStopping(patience=2), ModelCheckpoint('deep_model.h5', save_best_only=True)])

# Save the model architecture to a JSON file
model_json = deep_cf_model.to_json()
with open("user_based_recommendation_model.json", "w") as json_file:
    json_file.write(model_json)

# Save the model weights to an HDF5 file
deep_cf_model.save_weights("user_based_recommendation_model_weights.h5")

# Download the model architecture and weights files
files.download('user_based_recommendation_model.json')
files.download('user_based_recommendation_model_weights.h5')

# # Function to get top-N recommendations for a user
# def get_user_based_recommendations(user_id, num_recommendations=10, model=deep_cf_model):
#     all_movie_ids = np.array([movie_id for movie_id in range(1, max_movieid+1)])
#     predicted_ratings = model.predict([np.array([user_id] * len(all_movie_ids)), all_movie_ids])
#     top_indices = np.argsort(predicted_ratings.flatten())[::-1][:num_recommendations]
#     recommended_movie_ids = all_movie_ids[top_indices]
#     recommended_movies = movies[movies['movieId'].isin(recommended_movie_ids)]
#     return recommended_movies


# user_id = 1
# recommendations = get_user_based_recommendations(user_id, num_recommendations=10)
# print(recommendations)

Epoch 1/10

  saving_api.save_model(


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

      movieId                                              title  \
87         99               Heidi Fleiss: Hollywood Madam (1995)   
1649     2202                                    Lifeboat (1944)   
3608     4956                               The Stunt Man (1980)   
4396     6460                      The Trial (Procès, Le) (1962)   
4504     6666  The Discreet Charm of the Bourgeoisie (Charme ...   
4782     7121                                  Adam's Rib (1949)   
5489    26326     The Holy Mountain (Montaña sagrada, La) (1973)   
5580    26810                               Bad Boy Bubby (1993)   
5848    32582          The Wild Parrots of Telegraph Hill (2003)   
9514   171495                                             Cosmos   

                                              genres  
87                                       Documentary  
1649                                       Drama|War  
3608  Action|Adventure|Comedy|Drama|Romance|Thriller  
4396                           

In [34]:
# Function to get top-N recommendations for a user
def get_user_based_recommendations(user_id, num_recommendations=10):
    all_movie_ids = np.array([movie_id for movie_id in range(1, max_movieid+1)])
    predicted_ratings = deep_cf_model.predict([np.array([user_id] * len(all_movie_ids)), all_movie_ids])
    top_indices = np.argsort(predicted_ratings.flatten())[::-1][:num_recommendations]
    recommended_movie_ids = all_movie_ids[top_indices]
    recommended_movies = movies[movies['movieId'].isin(recommended_movie_ids)]
    return recommended_movies

In [35]:
user_id = 3
recommendations = get_user_based_recommendations(user_id, num_recommendations=10)
print(recommendations)

      movieId                                              title  \
87         99               Heidi Fleiss: Hollywood Madam (1995)   
2206     2931        Time of the Gypsies (Dom za vesanje) (1989)   
2453     3266  Man Bites Dog (C'est arrivé près de chez vous)...   
4050     5764                                      Looker (1981)   
4396     6460                      The Trial (Procès, Le) (1962)   
5489    26326     The Holy Mountain (Montaña sagrada, La) (1973)   
5580    26810                               Bad Boy Bubby (1993)   
5848    32582          The Wild Parrots of Telegraph Hill (2003)   
6954    65642             Timecrimes (Cronocrímenes, Los) (2007)   
9301   158966                           Captain Fantastic (2016)   

                            genres  
87                     Documentary  
2206    Comedy|Crime|Drama|Fantasy  
2453   Comedy|Crime|Drama|Thriller  
4050  Drama|Horror|Sci-Fi|Thriller  
4396                         Drama  
5489                         

In [36]:
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

In [37]:
# Prepare the test data for evaluation
test_users = test_data['userId'].values
test_movies = test_data['movieId'].values
test_ratings = test_data['rating'].values

In [38]:
predicted_test_ratings = deep_cf_model.predict([test_users, test_movies])



In [39]:
rmse = np.sqrt(mean_squared_error(test_ratings, predicted_test_ratings))
print(f'RMSE: {rmse}')

RMSE: 0.7162030846048772
