# Import Libraries

In [1]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from keras.models import Model
from keras.layers import Embedding, Input, Flatten, Dot, Dense, Dropout, Concatenate
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
import re
from keras.models import Model, model_from_json
from google.colab import files

# Load Datasets

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
movies = pd.read_csv('/content/drive/MyDrive/Recommender Dataset/Project/ml-latest-small/movies.csv')
ratings = pd.read_csv('/content/drive/MyDrive/Recommender Dataset/Project/ml-latest-small/ratings.csv')

# Data Preprocessing

In [26]:
print(movies[movies['title'].str.contains("The")])

      movieId                                              title  \
10         11                            American President, The   
26         27                                       Now and Then   
28         29  City of Lost Children, The (Cité des enfants p...   
46         50                                Usual Suspects, The   
49         54                                     Big Green, The   
...       ...                                                ...   
9717   188833                     The Man Who Killed Don Quixote   
9724   190183                                  The Darkest Minds   
9732   193565                                 Gintama: The Movie   
9733   193567    anohana: The Flower We Saw That Day - The Movie   
9735   193573                   Love Live! The School Idol Movie   

                                      genres  
10                      Comedy|Drama|Romance  
26                            Children|Drama  
28    Adventure|Drama|Fantasy|Mystery|Sci-

In [27]:
def correct_movie_titles(movies):
    def correct_title(title):
        # Regex to handle titles ending with ", The" followed by any characters
        new_title = re.sub(r'^(.), The (.)$', r'The \1 \2', title)
        new_title = re.sub(r'^(.), A (.)$', r'A \1 \2', new_title)
        new_title = re.sub(r'\s\(\d{4}\)$', '', new_title)
        return new_title

    movies['title'] = movies['title'].apply(correct_title)
    return movies

In [28]:
movies = correct_movie_titles(movies)

In [None]:
# Display a few samples to check if the titles are corrected
print(movies[movies['title'].str.contains("The")])

      movieId                                              title  \
10         11                      The American President (1995)   
26         27                                Now and Then (1995)   
28         29  The City of Lost Children (Cité des enfants pe...   
46         50                          The Usual Suspects (1995)   
49         54                               The Big Green (1995)   
...       ...                                                ...   
9717   188833              The Man Who Killed Don Quixote (2018)   
9724   190183                           The Darkest Minds (2018)   
9732   193565                          Gintama: The Movie (2010)   
9733   193567  anohana: The Flower We Saw That Day - The Movi...   
9735   193573            Love Live! The School Idol Movie (2015)   

                                      genres  
10                      Comedy|Drama|Romance  
26                            Children|Drama  
28    Adventure|Drama|Fantasy|Mystery|Sci-

In [8]:
has_duplicates = movies.duplicated().any()
has_duplicates

False

In [9]:
# Get the maximum user and movie IDs
max_userid = ratings['userId'].drop_duplicates().max()
max_movieid = ratings['movieId'].drop_duplicates().max()

In [10]:
def leave_one_out_split(ratings):
    """Split the ratings into training and test sets using Leave-One-Out strategy.
       For each user, hold out one interaction for testing.
    """
    train_data = []
    test_data = []

    for user_id in ratings['userId'].unique():
        user_ratings = ratings[ratings['userId'] == user_id]

        # Randomly sample one rating per user for the test set
        test_sample = user_ratings.sample(n=1, random_state=42)
        test_data.append(test_sample)

        # The remaining ratings go into the training set
        train_data.append(user_ratings.drop(test_sample.index))

    # Combine the training and test data
    train_data = pd.concat(train_data)
    test_data = pd.concat(test_data)

    return train_data, test_data

In [11]:
train_data, test_data = leave_one_out_split(ratings)

In [12]:
# Shuffle the data to ensure randomness
shuffled_ratings = train_data.sample(frac=1., random_state=42)

In [13]:
# Extract users, movies, and ratings arrays
Users = shuffled_ratings['userId'].values
Movies = shuffled_ratings['movieId'].values
Ratings = shuffled_ratings['rating'].values

# Model

In [14]:
# Define constants
K_FACTORS = 100  # The number of latent factors for embeddings
TEST_USER = 1

In [15]:
user_input = Input(shape=(1,), name='user_input')
item_input = Input(shape=(1,), name='item_input')

In [16]:
# Define embeddings
user_embedding = Embedding(input_dim=max_userid+1, output_dim=K_FACTORS, input_length=1, name='user_embedding')(user_input)
item_embedding = Embedding(input_dim=max_movieid+1, output_dim=K_FACTORS, input_length=1, name='item_embedding')(item_input)



In [17]:
# Flatten embeddings
user_vec = Flatten(name='flatten_users')(user_embedding)
item_vec = Flatten(name='flatten_items')(item_embedding)

In [18]:
# Compute dot product of user and item vectors
y = Dot(axes=1, name='dot_product')([user_vec, item_vec])

In [19]:
# Define the model
cf_model = Model(inputs=[user_input, item_input], outputs=y)

In [20]:
# Compile the model using MSE as the loss function and the Adam optimizer
cf_model.compile(optimizer='adam', loss='mse')

In [21]:
# Callbacks for early stopping and saving the best model
callbacks = [
    EarlyStopping(patience=2, monitor='val_loss'),
    ModelCheckpoint('best_cf_model.keras', save_best_only=True, monitor='val_loss', mode='min')
]

In [22]:
# Train the model
cf_model.fit([Users, Movies], Ratings, epochs=5, batch_size=64,
          validation_split=0.2, callbacks=callbacks)

Epoch 1/5
[1m1253/1253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 10ms/step - loss: 12.2983 - val_loss: 2.9369
Epoch 2/5
[1m1253/1253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 11ms/step - loss: 2.1001 - val_loss: 1.5040
Epoch 3/5
[1m1253/1253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 8ms/step - loss: 1.0216 - val_loss: 1.2712
Epoch 4/5
[1m1253/1253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 11ms/step - loss: 0.7273 - val_loss: 1.2130
Epoch 5/5
[1m1253/1253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 7ms/step - loss: 0.5657 - val_loss: 1.2028


<keras.src.callbacks.history.History at 0x7f55f82cee30>

In [23]:
# Save the model in HDF5 format after training
cf_model.save('cf_model_final.keras')

In [None]:
model_json = cf_model.to_json()

# Write the model architecture to a JSON file
with open('cf_model_architecture.json', 'w') as json_file:
    json_file.write(model_json)

Model architecture saved to JSON.


In [24]:
# Function to get top-N recommendations for a user
def get_user_based_recommendations(user_id, num_recommendations=10):
    all_movie_ids = np.array([movie_id for movie_id in range(1, max_movieid+1)])
    predicted_ratings = cf_model.predict([np.array([user_id] * len(all_movie_ids)), all_movie_ids])
    top_indices = np.argsort(predicted_ratings.flatten())[::-1][:num_recommendations]
    recommended_movie_ids = all_movie_ids[top_indices]
    recommended_movies = movies[movies['movieId'].isin(recommended_movie_ids)]
    return recommended_movies

In [None]:
# To Check
user_id = 3
recommendations = get_user_based_recommendations(user_id, num_recommendations=10)
print(recommendations)

      movieId                                              title  \
1190     1587                         Conan the Barbarian (1982)   
2280     3024                                     Piranha (1978)   
2765     3703                The Road Warrior (Mad Max 2) (1981)   
3734     5181                                   Hangar 18 (1980)   
4045     5746                    Galaxy of Terror (Quest) (1981)   
4050     5764                                      Looker (1981)   
4122     5919                                     Android (1982)   
5052     7899  Master of the Flying Guillotine (Du bi quan wa...   
5504    26409                           The Clonus Horror (1979)   
7114    70946                                     Troll 2 (1990)   

                                genres  
1190          Action|Adventure|Fantasy  
2280                     Horror|Sci-Fi  
2765  Action|Adventure|Sci-Fi|Thriller  
3734            Action|Sci-Fi|Thriller  
4045      Action|Horror|Mystery|Sci-Fi  
4050 

# Evaluation

In [30]:
# Prepare the test data for evaluation
test_users = test_data['userId'].values
test_movies = test_data['movieId'].values
test_ratings = test_data['rating'].values

In [31]:
# Predict ratings for the test set
predicted_test_ratings = cf_model.predict([test_users, test_movies])

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(test_ratings, predicted_test_ratings))
rmse

[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step


1.0900596410737795

# Deeper Model

In [35]:
# Instantiate the deeper model
num_users = max_userid + 1
num_items = max_movieid + 1

In [36]:
# Define constants
K_FACTORS = 100

In [37]:
# Define the model
user_input = Input(shape=(1,), name='user_input')
user_embedding = Embedding(num_users, K_FACTORS, input_length=1, name='user_embedding')(user_input)
user_vec = Flatten(name='user_flatten')(user_embedding)
item_input = Input(shape=(1,), name='item_input')
item_embedding = Embedding(num_items, K_FACTORS, input_length=1, name='item_embedding')(item_input)
item_vec = Flatten(name='item_flatten')(item_embedding)
concat = Concatenate()([user_vec, item_vec])
dense = Dense(128, activation='relu')(concat)
dropout = Dropout(0.5)(dense)
dense = Dense(64, activation='relu')(dropout)
dropout = Dropout(0.5)(dense)
dense = Dense(32, activation='relu')(dropout)
dropout = Dropout(0.5)(dense)
output = Dense(1, name='output')(dropout)

deep_cf_model = Model(inputs=[user_input, item_input], outputs=output)
deep_cf_model.compile(optimizer='adam', loss='mse')



In [38]:
# Train the model
deep_cf_model.fit([Users, Movies], Ratings, epochs=10, batch_size=64, validation_split=0.2,
                  callbacks=[EarlyStopping(patience=2), ModelCheckpoint('deep_model.keras', save_best_only=True)])

Epoch 1/10
[1m1253/1253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 10ms/step - loss: 2.9939 - val_loss: 0.8701
Epoch 2/10
[1m1253/1253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 8ms/step - loss: 1.2976 - val_loss: 0.8352
Epoch 3/10
[1m1253/1253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 8ms/step - loss: 1.0267 - val_loss: 0.7705
Epoch 4/10
[1m1253/1253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 8ms/step - loss: 0.8608 - val_loss: 0.7657
Epoch 5/10
[1m1253/1253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 8ms/step - loss: 0.7558 - val_loss: 0.7618
Epoch 6/10
[1m1253/1253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 6ms/step - loss: 0.6932 - val_loss: 0.7623
Epoch 7/10
[1m1253/1253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 9ms/step - loss: 0.6441 - val_loss: 0.7585
Epoch 8/10
[1m1253/1253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 7ms/step - loss: 0.6068 - val_loss: 0.7670
Epoch 9/10
[1m1

<keras.src.callbacks.history.History at 0x7f5595323d90>

In [None]:
# Save the model architecture to a JSON file
model_json = deep_cf_model.to_json()
with open("cf_model_final_user_based.json", "w") as json_file:
    json_file.write(model_json)

In [None]:
# Save the model weights to an HDF5 file
deep_cf_model.save_weights("cf_model_final_user_based.keras")

In [None]:
# Download the model architecture and weights files
files.download('cf_model_final_user_based.json')
files.download('cf_model_final_user_based.keras')

In [None]:
user_id = 3
recommendations = get_user_based_recommendations(user_id, num_recommendations=10)
print(recommendations)

      movieId                                              title  \
87         99               Heidi Fleiss: Hollywood Madam (1995)   
2206     2931        Time of the Gypsies (Dom za vesanje) (1989)   
2453     3266  Man Bites Dog (C'est arrivé près de chez vous)...   
4050     5764                                      Looker (1981)   
4396     6460                      The Trial (Procès, Le) (1962)   
5489    26326     The Holy Mountain (Montaña sagrada, La) (1973)   
5580    26810                               Bad Boy Bubby (1993)   
5848    32582          The Wild Parrots of Telegraph Hill (2003)   
6954    65642             Timecrimes (Cronocrímenes, Los) (2007)   
9301   158966                           Captain Fantastic (2016)   

                            genres  
87                     Documentary  
2206    Comedy|Crime|Drama|Fantasy  
2453   Comedy|Crime|Drama|Thriller  
4050  Drama|Horror|Sci-Fi|Thriller  
4396                         Drama  
5489                         

In [40]:
predicted_test_ratings = deep_cf_model.predict([test_users, test_movies])

[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step


In [None]:
rmse = np.sqrt(mean_squared_error(test_ratings, predicted_test_ratings))
print(f'RMSE: {rmse}')

RMSE: 0.7162030846048772
