In [24]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Model, Sequential
from pathlib import Path
import matplotlib.pyplot as plt

In [37]:
ratings_file = pd.read_csv('./ratings.csv', nrows=100000)
movies = pd.read_csv('./movies.csv')

In [38]:

# Merge the DataFrames on the 'movieId' column, including the 'title' column from the movies DataFrame
merged_df = pd.merge(ratings_file, movies[['movieId', 'title']], on='movieId')

# Select the relevant columns
ratings_file = merged_df[['userId', 'movieId', 'title', 'rating', 'timestamp']]
print(ratings_file)

       userId  movieId                                             title  \
0           1      296                               Pulp Fiction (1994)   
1           1      306  Three Colors: Red (Trois couleurs: Rouge) (1994)   
2           1      307  Three Colors: Blue (Trois couleurs: Bleu) (1993)   
3           1      665                                Underground (1995)   
4           1      899                        Singin' in the Rain (1952)   
...       ...      ...                                               ...   
99995     757     2115       Indiana Jones and the Temple of Doom (1984)   
99996     757     2117                1984 (Nineteen Eighty-Four) (1984)   
99997     757     2118                             Dead Zone, The (1983)   
99998     757     2124                         Addams Family, The (1991)   
99999     757     2132            Who's Afraid of Virginia Woolf? (1966)   

       rating   timestamp  
0         5.0  1147880044  
1         3.5  1147868817  
2  

In [39]:
user_ids = ratings_file["userId"].unique().tolist() 
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}
movie_ids = ratings_file["movieId"].unique().tolist()
movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}
movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}
ratings_file["user"] = ratings_file["userId"].map(user2user_encoded)
ratings_file["movie"] = ratings_file["movieId"].map(movie2movie_encoded)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings_file["user"] = ratings_file["userId"].map(user2user_encoded)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings_file["movie"] = ratings_file["movieId"].map(movie2movie_encoded)


In [40]:
movie2movie_encoded

{296: 0,
 306: 1,
 307: 2,
 665: 3,
 899: 4,
 1088: 5,
 1175: 6,
 1217: 7,
 1237: 8,
 1250: 9,
 1260: 10,
 1653: 11,
 2011: 12,
 2012: 13,
 2068: 14,
 2161: 15,
 2351: 16,
 2573: 17,
 2632: 18,
 2692: 19,
 2843: 20,
 3448: 21,
 3569: 22,
 3949: 23,
 4144: 24,
 4308: 25,
 4325: 26,
 4422: 27,
 4703: 28,
 4973: 29,
 5147: 30,
 5269: 31,
 5684: 32,
 5767: 33,
 5878: 34,
 5912: 35,
 5952: 36,
 6016: 37,
 6370: 38,
 6377: 39,
 6539: 40,
 6711: 41,
 6954: 42,
 7209: 43,
 7234: 44,
 7318: 45,
 7323: 46,
 7327: 47,
 7361: 48,
 7365: 49,
 7820: 50,
 7937: 51,
 7938: 52,
 7939: 53,
 7940: 54,
 8014: 55,
 8154: 56,
 8327: 57,
 8360: 58,
 8405: 59,
 8685: 60,
 8729: 61,
 8786: 62,
 8873: 63,
 8973: 64,
 27193: 65,
 27266: 66,
 27721: 67,
 31956: 68,
 32591: 69,
 1: 70,
 62: 71,
 110: 72,
 150: 73,
 151: 74,
 236: 75,
 260: 76,
 261: 77,
 266: 78,
 318: 79,
 333: 80,
 349: 81,
 356: 82,
 364: 83,
 380: 84,
 457: 85,
 480: 86,
 497: 87,
 524: 88,
 527: 89,
 534: 90,
 553: 91,
 588: 92,
 589: 93,
 65

In [41]:
ratings_file.head()

Unnamed: 0,userId,movieId,title,rating,timestamp,user,movie
0,1,296,Pulp Fiction (1994),5.0,1147880044,0,0
1,1,306,Three Colors: Red (Trois couleurs: Rouge) (1994),3.5,1147868817,0,1
2,1,307,Three Colors: Blue (Trois couleurs: Bleu) (1993),5.0,1147868828,0,2
3,1,665,Underground (1995),5.0,1147878820,0,3
4,1,899,Singin' in the Rain (1952),3.5,1147868510,0,4


In [42]:
num_users = len(user2user_encoded)
num_movies = len(movie2movie_encoded)
ratings_file["rating"] = ratings_file["rating"].values.astype(np.float32) # to convert rating value to float points

min_rating = min(ratings_file["rating"])
max_rating = max(ratings_file["rating"])

print("num_users",num_users)
print("num_movies",num_movies)
print("min_rating",min_rating)
print("max_rating",max_rating)



num_users 757
num_movies 9786
min_rating 0.5
max_rating 5.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings_file["rating"] = ratings_file["rating"].values.astype(np.float32) # to convert rating value to float points


In [43]:
from sklearn.model_selection import train_test_split



x = ratings_file[["user", "movie"]].values
y = ratings_file["rating"].apply(lambda x: (x - min_rating) / (max_rating - min_rating)).values

# Split the data into training and validation sets (90% training, 10% validation)
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.1, random_state=42, shuffle=True)


print("x_train shape:", x_train.shape)
print("x_val shape:", x_val.shape)
print("y_train shape:", y_train.shape)
print("y_val shape:", y_val.shape)

x_train shape: (90000, 2)
x_val shape: (10000, 2)
y_train shape: (90000,)
y_val shape: (10000,)


In [52]:
#model
embedding_size = 50

user_ips= layers.Input(shape=[1])
user_embedding = layers.Embedding(num_users,embedding_size,embeddings_initializer="he_normal",embeddings_regularizer=keras.regularizers.l2(1e-6))(user_ips)
# Embedding layer will create 610 vectors look-up table of 50 dimension each
#These layers flatten the output of the embedding layers into a 1-dimensional vector.
user_vect= layers.Flatten()(user_embedding)

movie_ips= layers.Input(shape=[1])

#Embedding layers: Convert user and movie indices into dense vectors of fixed size (embedding_size).
#These layers learn representations for users and movies.
movie_embedding = layers.Embedding(num_movies, embedding_size, embeddings_initializer="he_normal",embeddings_regularizer=keras.regularizers.l2(1e-6))(movie_ips)

#Flatten layers: Flatten the output of the embedding layers.
movie_vect= layers.Flatten()(movie_embedding)

#Dot layer: Computes the dot product of user and movie vectors.
prod = layers.Dot(axes=1)([user_vect, movie_vect])
#Dense layers: Fully connected layers with ReLU activation functions.
#Output layer: Single neuron with ReLU activation function, predicting the rating.

dense1= layers.Dense(150, activation='relu', kernel_initializer="he_normal")(prod)
dense2= layers.Dense(50, activation='relu',kernel_initializer="he_normal")(dense1)
dense3= layers.Dense(1,activation='relu')(dense2)

model = Model([user_ips, movie_ips], dense3)
model.compile(optimizer='adam',loss='mean_squared_error')



In [53]:
history = model.fit([x_train[:, 0], x_train[:, 1]], y_train, batch_size=64, epochs=10, verbose=1)


Epoch 1/10
[1m1407/1407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 11ms/step - loss: 0.0799
Epoch 2/10
[1m1407/1407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 9ms/step - loss: 0.0456
Epoch 3/10
[1m1407/1407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 9ms/step - loss: 0.0339
Epoch 4/10
[1m1407/1407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 9ms/step - loss: 0.0301
Epoch 5/10
[1m1407/1407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 9ms/step - loss: 0.0241
Epoch 6/10
[1m1407/1407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 9ms/step - loss: 0.0228
Epoch 7/10
[1m1407/1407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 9ms/step - loss: 0.0203
Epoch 8/10
[1m1407/1407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 9ms/step - loss: 0.0193
Epoch 9/10
[1m1407/1407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 12ms/step - loss: 0.0177
Epoch 10/10
[1m1407/1407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

In [54]:
pred=model.predict([x_train[4:5,0], x_train[4:5,1]])
pred

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 447ms/step


array([[0.98899555]], dtype=float32)

In [76]:
model.save('movie_recommendation_model.h5')



In [57]:
import pickle

with open('user2user_encoded.pkl', 'wb') as file:
    pickle.dump(user2user_encoded, file)

In [58]:
with open('movie2movie_encoded.pkl', 'wb') as file:
    pickle.dump(movie2movie_encoded, file)
    

In [59]:
with open('movie_encoded2movie.pkl', 'wb') as file:
    pickle.dump(movie_encoded2movie, file)