In [1]:
! pip install tensorflow



In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense, Dropout
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, precision_score, recall_score, f1_score


# Load dataset

In [4]:
ratings = pd.read_csv('/content/dataset/ratings.csv')
movies = pd.read_csv('/content/dataset/movies.csv')

# Prepare the data

In [5]:
user_ids = ratings['userId'].unique().tolist()
movie_ids = ratings['movieId'].unique().tolist()

In [6]:
# Create mappings
user_user_encoded = {x: i for i, x in enumerate(user_ids)}
movie_movie_encoded = {x: i for i, x in enumerate(movie_ids)}

user_encoded_user = {i: x for x, i in user_user_encoded.items()}
movie_encoded_movie = {i: x for x, i in movie_movie_encoded.items()}

In [7]:
# Encode userId and movieId
ratings['user'] = ratings['userId'].map(user_user_encoded)
ratings['movie'] = ratings['movieId'].map(movie_movie_encoded)

In [8]:
# Process genres
all_genres = set()
for genres in movies['genres'].str.split('|'):
    all_genres.update(genres)
all_genres = sorted(all_genres)

# Create genre mappings
genre_genre_encoded = {x: i for i, x in enumerate(all_genres)}
genre_encoded_genre = {i: x for x, i in genre_genre_encoded.items()}

# Encode genres
def encode_genres(genres):
    encoded = np.zeros(len(all_genres))
    for genre in genres.split('|'):
        if genre in genre_genre_encoded:
            encoded[genre_genre_encoded[genre]] = 1
    return encoded

movies['genre_encoded'] = movies['genres'].apply(encode_genres)

# Merge ratings with movies to get genre information
ratings = ratings.merge(movies[['movieId', 'genre_encoded']], on='movieId', how='left')


In [9]:
num_users = len(user_user_encoded)
num_movies = len(movie_movie_encoded)
num_genres = len(all_genres)

# Sort ratings by user and timestamp
ratings = ratings.sort_values(['userId', 'timestamp'])

In [10]:
# Prepare training and test data
train_data = ratings.groupby('userId').apply(lambda x: x.iloc[:-1]).reset_index(drop=True)
test_data = ratings.groupby('userId').apply(lambda x: x.iloc[-1]).reset_index(drop=True)

X_train = train_data[['user', 'movie']].values
genres_train = np.stack(train_data['genre_encoded'].values)
y_train = train_data['rating'].values

X_test = test_data[['user', 'movie']].values
genres_test = np.stack(test_data['genre_encoded'].values)
y_test = test_data['rating'].values

In [11]:
embedding_size = 50

# User and Movie input layers
user_input = Input(shape=(1,), name='user_input')
user_embedding = Embedding(num_users, embedding_size, name='user_embedding')(user_input)
user_vec = Flatten(name='user_flatten')(user_embedding)

movie_input = Input(shape=(1,), name='movie_input')
movie_embedding = Embedding(num_movies, embedding_size, name='movie_embedding')(movie_input)
movie_vec = Flatten(name='movie_flatten')(movie_embedding)

# Genre input layer
genre_input = Input(shape=(num_genres,), name='genre_input')

# Concatenate user, movie, and genre embeddings
concat = Concatenate()([user_vec, movie_vec, genre_input])

# Fully connected layers
layer1 = Dense(256, activation='relu')(concat)
dropout1 = Dropout(0.3)(layer1)
layer2 = Dense(128, activation='relu')(dropout1)
dropout2 = Dropout(0.3)(layer2)
layer3 = Dense(64, activation='relu')(dropout2)
dropout3 = Dropout(0.3)(layer3)
layer4 = Dense(32, activation='relu')(dropout3)
# Output layer
output = Dense(1)(layer3)

# Build the model
model = Model([user_input, movie_input, genre_input], output)

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Summary of the model
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 user_input (InputLayer)     [(None, 1)]                  0         []                            
                                                                                                  
 movie_input (InputLayer)    [(None, 1)]                  0         []                            
                                                                                                  
 user_embedding (Embedding)  (None, 1, 50)                30500     ['user_input[0][0]']          
                                                                                                  
 movie_embedding (Embedding  (None, 1, 50)                486200    ['movie_input[0][0]']         
 )                                                                                            

In [12]:
# Train the model
history = model.fit([X_train[:, 0], X_train[:, 1], genres_train], y_train, epochs=20, batch_size=64)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [13]:
# Predict ratings for the test data
predicted_ratings = model.predict([X_test[:, 0], X_test[:, 1], genres_test])

# Calculate Mean Squared Error
mse = np.mean((predicted_ratings.flatten() - y_test) ** 2)
print(f'Test MSE: {mse}')

Test MSE: 1.0696835575538624


In [14]:
def predict_top_n(user_id, n=10):
    user_encoded = user_user_encoded[user_id]
    movie_ids = movies['movieId'].values

    # Filter out movie_ids that are not in movie2movie_encoded
    valid_movie_ids = [movie_id for movie_id in movie_ids if movie_id in movie_movie_encoded]
    valid_movie_encoded = [movie_movie_encoded[movie_id] for movie_id in valid_movie_ids]
    genre_encoded = np.stack(movies[movies['movieId'].isin(valid_movie_ids)]['genre_encoded'].values)

    user_array = np.array([user_encoded] * len(valid_movie_encoded))

    predictions = model.predict([user_array, np.array(valid_movie_encoded), genre_encoded])
    predictions = predictions.flatten()

    top_n_indices = predictions.argsort()[-n:][::-1]
    top_n_movie_ids = [movie_encoded_movie[valid_movie_encoded[i]] for i in top_n_indices]
    top_n_predictions = predictions[top_n_indices]

    return top_n_movie_ids, top_n_predictions


In [15]:
# Predict top 10 movies for user with ID 1
user_id = 200
top_n = 10

top_n_movie_ids, top_n_predictions = predict_top_n(user_id, top_n)

# Create a DataFrame for the recommendations
recommendations_df = pd.DataFrame({
    'Rank': range(1, top_n + 1),
    # 'Movie ID': top_n_movie_ids,
    'Movie_Title': [movies[movies['movieId'] == movie_id]['title'].values[0] for movie_id in top_n_movie_ids],
    'Genres': [movies[movies['movieId'] == movie_id]['genres'].values[0] for movie_id in top_n_movie_ids],
    'Predicted_Rating': top_n_predictions
}).set_index('Rank')

print(f'Top {top_n} recommendations for User {user_id}:\n')
recommendations_df


Top 10 recommendations for User 200:



Unnamed: 0_level_0,Movie_Title,Genres,Predicted_Rating
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,De platte jungle (1978),Documentary,5.286945
2,Watermark (2014),Documentary,5.234839
3,Connections (1978),Documentary,5.170949
4,Zeitgeist: Moving Forward (2011),Documentary,5.137667
5,Bitter Lake (2015),Documentary,5.129382
6,Strictly Sexual (2008),Comedy|Drama|Romance,5.11802
7,Nasu: Summer in Andalusia (2003),Animation,5.063992
8,Blue Planet II (2017),Documentary,5.031184
9,Jonah Who Will Be 25 in the Year 2000 (Jonas q...,Comedy,4.977284
10,"Mystery of the Third Planet, The (Tayna tretey...",Adventure|Animation|Sci-Fi,4.925653
