In [1]:
import ast
import os
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import mean_squared_error, precision_score, recall_score, accuracy_score,r2_score
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout
from sklearn.preprocessing import LabelEncoder


In [2]:
# Location of the merged dataset CSV, given relative to the working directory.
# NOTE(review): the `drive/MyDrive` prefix looks like a mounted Google Drive
# (Colab) — confirm the mount step happens before this cell runs.
merged_df_loc = r"drive/MyDrive/MSUoA/merged_dataset.csv"

In [3]:
# Load the merged dataset; the first CSV column is used as the DataFrame index.
merged_df = pd.read_csv(merged_df_loc, index_col=0)

In [4]:
# Preview the first rows to sanity-check the load and the column layout.
merged_df.head()

Unnamed: 0,user_id,movie_ids,user_ratings,imdb_rating,numVotes,primaryTitle,genres,directors_list,writers_list,user_reviews_padded
0,ur186072342,tt0015384,0.0,0.571429,0.000722,Diagonal Symphony,"['Animation', 'Music']",['nm0250873'],[],[ 2.6374e-01 -7.0189e-02 1.7799e-01 1.8658e-...
1,ur186072342,tt0019422,0.888889,0.766234,0.005652,Steamboat Willie,"['Animation', 'Comedy', 'Family']","['nm0412650', 'nm0000370']","['nm0000370', 'nm0412650']",[-1.5262e-01 2.0243e-01 7.3813e-01 1.9402e-...
2,ur186072342,tt0029583,0.666667,0.779221,0.106084,Snow White and the Seven Dwarfs,"['Adventure', 'Animation', 'Family']","['nm0183183', 'nm0359457', 'nm0414144', 'nm060...","['nm0342278', 'nm0342303', 'nm0780799', 'nm018...",[-0.01072 -0.37331 0.16924 0.15325 -...
3,ur186072342,tt0032910,1.0,0.766234,0.078725,Pinocchio,"['Adventure', 'Animation', 'Comedy']","['nm0272568', 'nm0373429', 'nm0414144', 'nm045...","['nm0172830', 'nm0780799', 'nm0257481', 'nm081...",[-0.043972 0.28374 0.13589 0.57326 ...
4,ur186072342,tt0040580,0.555556,0.584416,0.003482,Melody Time,"['Animation', 'Comedy', 'Family']","['nm0314671', 'nm0414144', 'nm0455741', 'nm052...","['nm0382548', 'nm0672093', 'nm0716206', 'nm010...",[ 0.54501 0.83112 0.45529 0.43781 -...


In [5]:
# One binarizer per multi-valued categorical column. They are kept at module
# level because the context-vector helpers reuse the fitted vocabularies.
mlb_genres = MultiLabelBinarizer()
mlb_writers = MultiLabelBinarizer()
mlb_directors = MultiLabelBinarizer()

# The 'genres', 'writers_list' and 'directors_list' columns come back from the
# CSV as string representations of Python lists (e.g. "['Animation', 'Music']").
# Parse them with ast.literal_eval rather than eval: literal_eval only accepts
# Python literals, so a crafted cell in the CSV cannot execute arbitrary code.
# Values that are not strings are passed through unchanged.
for _list_col in ('genres', 'writers_list', 'directors_list'):
    merged_df[_list_col] = merged_df[_list_col].apply(
        lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
    )
def convert_string_to_array(x):
    """Coerce one stored embedding cell into a NumPy array.

    A string such as ``"[0.1 0.2 0.3]"`` has its surrounding square brackets
    stripped and is then parsed as space-separated numbers. Any value that is
    not a string is handed to ``np.array`` unchanged.
    """
    if not isinstance(x, str):
        return np.array(x)
    numbers_only = x.strip('[]')
    return np.fromstring(numbers_only, sep=' ')

# Turn every stored review-embedding cell into a numeric vector.
merged_df['user_reviews_padded'] = merged_df['user_reviews_padded'].apply(convert_string_to_array)

# Fit each binarizer on its own column; the learned vocabularies stay on the
# mlb_* objects so the same encoding can be applied again later.
genres_encoded = mlb_genres.fit_transform(merged_df['genres'])
writers_encoded = mlb_writers.fit_transform(merged_df['writers_list'])
directors_encoded = mlb_directors.fit_transform(merged_df['directors_list'])

# Column layout of X, left to right:
#   review embedding | genres | directors | writers
feature_groups = (
    np.vstack(merged_df['user_reviews_padded'].values),  # Padded user review embeddings
    genres_encoded,
    directors_encoded,
    writers_encoded,
)
X = np.concatenate(feature_groups, axis=1)


In [6]:
def get_cold_start_context(fav_genres, fav_writers, fav_directors, review_dim=768):
    """Build the context feature row for a user who has no review history.

    Args:
        fav_genres: Iterable of genre labels, or None/empty for "no preference".
        fav_writers: Iterable of writer ids, or None/empty.
        fav_directors: Iterable of director ids, or None/empty.
        review_dim: Width of the all-zero placeholder that stands in for the
            review embedding. Defaults to 768, the value previously hard-coded
            here (the original comment assumed a 768-dim BERT embedding).

    Returns:
        A ``(1, review_dim + n_genres + n_directors + n_writers)`` array laid
        out as ``reviews | genres | directors | writers``, the same column
        order used when the feature matrix ``X`` is assembled.
    """
    def _multi_hot(binarizer, labels):
        # Nothing selected -> an all-zero row of the binarizer's fitted width.
        if not labels:
            return np.zeros((1, len(binarizer.classes_)))
        return binarizer.transform([labels])

    genre_vector = _multi_hot(mlb_genres, fav_genres)
    writer_vector = _multi_hot(mlb_writers, fav_writers)
    director_vector = _multi_hot(mlb_directors, fav_directors)

    # Placeholder review embedding: a cold-start user has written no reviews.
    dummy_user_reviews = np.zeros((1, review_dim))

    # Directors must come BEFORE writers to mirror X. The previous
    # writers-then-directors order produced a row of the right width whose
    # writer and director columns were swapped relative to X.
    context_vector = np.concatenate(
        [dummy_user_reviews, genre_vector, director_vector, writer_vector], axis=1
    )

    return context_vector

def get_movie_context(fav_genres, fav_writers, fav_directors, review_dim=768):
    """Build the context feature row for a movie that has no reviews yet.

    Args:
        fav_genres: Iterable of the movie's genre labels, or None/empty.
        fav_writers: Iterable of the movie's writer ids, or None/empty.
        fav_directors: Iterable of the movie's director ids, or None/empty.
        review_dim: Width of the all-zero placeholder that stands in for the
            review embedding. Defaults to 768, the value previously hard-coded
            here (the original comment assumed a 768-dim BERT embedding).

    Returns:
        A ``(1, review_dim + n_genres + n_directors + n_writers)`` array laid
        out as ``reviews | genres | directors | writers``, the same column
        order used when the feature matrix ``X`` is assembled.
    """
    def _multi_hot(binarizer, labels):
        # Nothing supplied -> an all-zero row of the binarizer's fitted width.
        if not labels:
            return np.zeros((1, len(binarizer.classes_)))
        return binarizer.transform([labels])

    genre_vector = _multi_hot(mlb_genres, fav_genres)
    writer_vector = _multi_hot(mlb_writers, fav_writers)
    director_vector = _multi_hot(mlb_directors, fav_directors)

    # Placeholder review embedding (zeros since it's a new movie).
    dummy_movie_reviews = np.zeros((1, review_dim))

    # Directors must come BEFORE writers to mirror X. The previous
    # writers-then-directors order produced a row of the right width whose
    # writer and director columns were swapped relative to X.
    context_vector = np.concatenate(
        [dummy_movie_reviews, genre_vector, director_vector, writer_vector], axis=1
    )

    return context_vector

In [7]:
# for movie recommendation

# Encode raw movie ids as contiguous integers so they can index the embedding.
movie_encoder = LabelEncoder()
merged_df['movie_id_encoded'] = movie_encoder.fit_transform(merged_df['movie_ids'])

# Movie Input and Embedding.
# Built exactly once: this trio used to be defined twice back-to-back, which
# left the first set of layers orphaned and unused by the model.
movie_input = Input(shape=(1,), name='movie_input')
movie_embedding = Embedding(input_dim=len(movie_encoder.classes_), output_dim=50, name='movie_embedding')(movie_input)
movie_vec = Flatten(name='movie_flatten')(movie_embedding)

# Contextual Features Input (one row of the same width as X)
context_input = Input(shape=(X.shape[1],), name='context_input')

# Concatenate movie embedding with context input
concat = Concatenate()([movie_vec, context_input])

# Dense Layers
x = Dense(512, activation='relu')(concat)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(128, activation='relu')(x)
output = Dense(1, activation='linear')(x)

# Define Cold Start Movie Recommendation Model
# NOTE(review): no `fit` call on this model appears in the cells visible here.
# Unless it is trained elsewhere, predictions come from freshly initialised
# weights — confirm before relying on the rankings.
cold_start_movie_model = Model(inputs=[movie_input, context_input], outputs=output)
cold_start_movie_model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Map each raw movie id (the `movie_ids` column, not the encoded integer) to
# its primaryTitle.
movie_id_to_title = dict(zip(merged_df['movie_ids'], merged_df['primaryTitle']))

def recommend_cold_start_movies(fav_genres=None, fav_writers=None, fav_directors=None, top_n=10):
    """Rank every known movie for a new user described only by preferences.

    Args:
        fav_genres: Preferred genre labels, or None.
        fav_writers: Preferred writer ids, or None.
        fav_directors: Preferred director ids, or None.
        top_n: Maximum number of recommendations to return. Zero or a
            negative value yields an empty list.

    Returns:
        A list of ``(title, predicted_rating)`` tuples, best first. Movies
        whose id has no entry in ``movie_id_to_title`` are reported as
        "Unknown Movie".
    """
    num_movies = len(movie_encoder.classes_)
    # Column vector of encoded ids, matching the (1,)-shaped `movie_input`
    # and the way recommend_top_n_users feeds its id input.
    all_movies = np.arange(num_movies).reshape(-1, 1)

    # The same preference context is scored against every movie.
    context_features = get_cold_start_context(fav_genres, fav_writers, fav_directors)
    context_repeated = np.tile(context_features, (num_movies, 1))

    # Predict ratings
    predictions = cold_start_movie_model.predict([all_movies, context_repeated]).flatten()

    # Sort descending first, then truncate. The earlier `[-top_n:]` slice
    # returned *every* movie for top_n == 0, because -0 == 0.
    top_indices = predictions.argsort()[::-1][:max(top_n, 0)]
    if top_indices.size == 0:
        return []

    recommended_movie_ids = movie_encoder.inverse_transform(top_indices)
    return [
        (movie_id_to_title.get(movie_id, "Unknown Movie"), predictions[idx])
        for idx, movie_id in zip(top_indices, recommended_movie_ids)
    ]


# Top 5 Movies for Cold-Start User
# The preferences are collected in one place and unpacked into the call.
cold_start_preferences = {
    'fav_genres': ['Action', 'Adventure'],
    'fav_writers': ['nm0522871'],
    'fav_directors': ['nm0412650'],
}
top_cold_start_movies = recommend_cold_start_movies(top_n=5, **cold_start_preferences)

print("Top 5 Cold Start Movie Recommendations:")
for movie, score in top_cold_start_movies:
    print(f"Movie Title: {movie}, Predicted Rating: {score:.2f}")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 302ms/step
Top 5 Cold Start Movie Recommendations:
Movie Title: Pinocchio, Predicted Rating: -0.00
Movie Title: Melody Time, Predicted Rating: -0.00
Movie Title: Steamboat Willie, Predicted Rating: -0.01
Movie Title: Snow White and the Seven Dwarfs, Predicted Rating: -0.01
Movie Title: Diagonal Symphony, Predicted Rating: -0.01


In [8]:
# User Input and Embedding
# Encode raw user ids as contiguous integers (only user ids are encoded in
# this cell).
user_encoder = LabelEncoder()
merged_df['user_id_encoded'] = user_encoder.fit_transform(merged_df['user_id'])

num_known_users = len(user_encoder.classes_)
user_input = Input(shape=(1,), name='user_input')
user_embedding = Embedding(input_dim=num_known_users, output_dim=50, name='user_embedding')(user_input)
user_vec = Flatten(name='user_flatten')(user_embedding)

# Movie Context Input: one row of the same width as X
movie_context_input = Input(shape=(X.shape[1],), name='movie_context_input')

# Join the user embedding with the movie context features
concat_user_movie = Concatenate()([user_vec, movie_context_input])

# Dense tower: 512 -> dropout -> 256 -> dropout -> 128 -> single linear output.
# Layers are created in the same order as before.
x = concat_user_movie
for hidden_units in (512, 256):
    x = Dense(hidden_units, activation='relu')(x)
    x = Dropout(0.3)(x)
x = Dense(128, activation='relu')(x)
output = Dense(1, activation='linear')(x)

# Define Top-N User Recommendation Model
top_n_user_model = Model(inputs=[user_input, movie_context_input], outputs=output)
top_n_user_model.compile(optimizer='adam', loss='mse', metrics=['mae'])

def recommend_top_n_users(fav_genres=None, fav_writers=None, fav_directors=None, top_n=10):
    """Rank every known user by predicted rating for a movie with no reviews.

    Args:
        fav_genres: The movie's genre labels, or None.
        fav_writers: The movie's writer ids, or None.
        fav_directors: The movie's director ids, or None.
        top_n: Maximum number of users to return. Zero or a negative value
            yields an empty list.

    Returns:
        A list of ``(user_id, predicted_rating)`` tuples, best first.
    """
    movie_context = get_movie_context(fav_genres, fav_writers, fav_directors)

    num_users = len(user_encoder.classes_)
    # Column vector of encoded user ids, matching the (1,)-shaped `user_input`.
    user_ids = np.arange(num_users, dtype=np.int32).reshape(-1, 1)
    # The same movie context is scored against every user.
    movie_context_repeated = np.tile(movie_context, (num_users, 1))

    # Predict ratings
    predictions = top_n_user_model.predict([user_ids, movie_context_repeated]).flatten()

    # Sort descending first, then truncate. The earlier `[-top_n:]` slice
    # returned *every* user for top_n == 0, because -0 == 0.
    top_indices = predictions.argsort()[::-1][:max(top_n, 0)]
    if top_indices.size == 0:
        return []

    top_user_ids = user_encoder.inverse_transform(top_indices)
    return [(user_id, predictions[i]) for i, user_id in zip(top_indices, top_user_ids)]

# Describe the new movie once, then unpack the description into the call.
new_movie_profile = {
    'fav_genres': ['Drama', 'Thriller'],
    'fav_writers': ['nm0522871'],
    'fav_directors': ['nm0412650'],
}
top_users = recommend_top_n_users(top_n=5, **new_movie_profile)

print("Top 5 Users Likely to Enjoy This Movie:")
for user_id, score in top_users:
    print(f"User ID: {user_id}, Predicted Rating: {score:.2f}")




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 224ms/step
Top 5 Users Likely to Enjoy This Movie:
User ID: ur186072342, Predicted Rating: 0.02


In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

def calculate_rmse(y_true, y_pred):
    """Return the root-mean-squared error between true and predicted values."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def precision_at_k(y_true, y_pred, k):
    """Calculate Precision@K.

    Args:
        y_true: 1-D array-like of binary relevance labels (1 = relevant).
            Lists are accepted as well as arrays.
        y_pred: 1-D array-like of predicted scores, aligned with ``y_true``.
        k: Number of top-scored items to inspect.

    Returns:
        The fraction of the ``k`` highest-scored items that are relevant, as
        a float. Returns 0.0 when ``k`` is not positive.
    """
    # k == 0 used to select *every* item (`[-0:]` is the whole array) and then
    # divide by zero; treat a non-positive k as "nothing retrieved".
    if k <= 0:
        return 0.0

    # Coerce to arrays so `y_true == 1` is element-wise even for plain lists
    # (a list compared to 1 is a single False, which silently gave 0 hits).
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Indices of the k highest scores, best first.
    top_k_indices = np.argsort(y_pred)[::-1][:k]
    relevant_indices = np.where(y_true == 1)[0]
    hits = np.isin(top_k_indices, relevant_indices).sum()
    return float(hits / k)


def recall_at_k(y_true, y_pred, k):
    """Calculate Recall@K.

    Args:
        y_true: 1-D array-like of binary relevance labels (1 = relevant).
            Lists are accepted as well as arrays.
        y_pred: 1-D array-like of predicted scores, aligned with ``y_true``.
        k: Number of top-scored items to inspect.

    Returns:
        The fraction of all relevant items that appear among the ``k``
        highest-scored items, as a float. Returns 0.0 when there are no
        relevant items or when ``k`` is not positive.
    """
    # Coerce to arrays so `y_true == 1` is element-wise even for plain lists.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Count relevant items with the same `== 1` rule used for membership, so
    # numerator and denominator always agree.
    relevant_indices = np.where(y_true == 1)[0]
    total_relevant = relevant_indices.size

    # No relevant items -> avoid division by zero. A non-positive k retrieves
    # nothing; previously k == 0 selected the whole array and reported 1.0.
    if total_relevant == 0 or k <= 0:
        return 0.0

    # Indices of the k highest scores, best first.
    top_k_indices = np.argsort(y_pred)[::-1][:k]
    hits = np.isin(top_k_indices, relevant_indices).sum()
    return float(hits / total_relevant)

# Score the held-out (movie, context) pairs with the cold start movie model.
# NOTE(review): X_movies_test, X_context_test and y_test are not defined in
# any cell visible in this notebook — confirm where the train/test split is
# created, otherwise this cell fails on a fresh kernel.
test_inputs = [X_movies_test, X_context_test]
predictions = cold_start_movie_model.predict(test_inputs).flatten()

# RMSE against the actual test ratings
rmse_score = calculate_rmse(y_test, predictions)
print(f"Model RMSE: {rmse_score:.4f}")

# Binary relevance for the ranking metrics: a rating >= 4.0 counts as relevant.
# NOTE(review): the `user_ratings` values in the DataFrame preview all lie in
# [0, 1]. If y_test is on that scale nothing can reach 4.0, and both metrics
# below will be 0 — confirm the rating scale and threshold.
is_relevant = y_test >= 4.0
y_true_binary = is_relevant.astype(int)

# Precision@K and Recall@K for K=10
k = 10
precision, recall = (
    precision_at_k(y_true_binary, predictions, k),
    recall_at_k(y_true_binary, predictions, k),
)
print(f"Precision@{k}: {precision:.4f}")
print(f"Recall@{k}: {recall:.4f}")
