In [1]:
import ast
import os
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import mean_squared_error, precision_score, recall_score, accuracy_score,r2_score
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [2]:
merged_df_loc = r"drive/MyDrive/MSUoA/merged_dataset.csv"
merged_df = pd.read_csv(merged_df_loc, index_col=0).head(10000)

In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, MinMaxScaler
import ast

# Safely convert string lists to Python lists using ast.literal_eval
def safe_eval_list(x):
    if isinstance(x, str):
        return ast.literal_eval(x)
    return x

# Convert string lists into proper lists
merged_df['genres'] = merged_df['genres'].apply(safe_eval_list)
merged_df['directors_list'] = merged_df['directors_list'].apply(safe_eval_list)
merged_df['writers_list'] = merged_df['writers_list'].apply(safe_eval_list)

# Convert string representations of embeddings to numpy arrays
def convert_string_to_array(x):
    if isinstance(x, str):
        return np.fromstring(x.strip('[]'), sep=' ')
    return np.array(x)

# Apply conversion across the DataFrame
merged_df['user_reviews_padded'] = merged_df['user_reviews_padded'].apply(convert_string_to_array)

# Encode categorical variables
mlb_genres = MultiLabelBinarizer()
mlb_writers = MultiLabelBinarizer()
mlb_directors = MultiLabelBinarizer()

genres_encoded = mlb_genres.fit_transform(merged_df['genres'])
writers_encoded = mlb_writers.fit_transform(merged_df['writers_list'])
directors_encoded = mlb_directors.fit_transform(merged_df['directors_list'])

# Normalize numerical features
scaler_numeric = MinMaxScaler()
numeric_features = scaler_numeric.fit_transform(merged_df[['imdb_rating', 'numVotes']])

# Consistent user_reviews_padded embeddings
embedding_dim = len(merged_df['user_reviews_padded'].iloc[0])
merged_df['user_reviews_padded'] = merged_df['user_reviews_padded'].apply(
    lambda x: x if len(x) == embedding_dim else np.zeros(embedding_dim)
)

# Concatenate all features into X
X = np.concatenate([
    np.vstack(merged_df['user_reviews_padded'].values),
    numeric_features,
    genres_encoded,
    writers_encoded,
    directors_encoded
], axis=1)

# Define y (scaled user ratings)
y = merged_df['user_ratings'].values


  return np.fromstring(x.strip('[]'), sep=' ')


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Dropout, Concatenate
from tensorflow.keras.models import Model

# Encode user IDs and movie IDs
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()
merged_df['user_id_encoded'] = user_encoder.fit_transform(merged_df['user_id'])
merged_df['movie_id_encoded'] = movie_encoder.fit_transform(merged_df['movie_ids'])

# Train-test split
scaler = MinMaxScaler(feature_range=(0, 1))
y_scaled = scaler.fit_transform(y.reshape(-1, 1)).flatten() # y_scaled will be same because user_ratings are already scaled

# Splitting both features and movie IDs
X_train, X_test, y_train, y_test, movie_train, movie_test = train_test_split(
    X, y_scaled, merged_df['movie_id_encoded'].values, test_size=0.25, random_state=42
)
# Check the shapes of training data
print("X_train shape:", X_train.shape)
print("movie_train shape:", movie_train.shape)

movie_input = Input(shape=(1,), name='movie_input')
movie_embedding = Embedding(input_dim=len(movie_encoder.classes_), output_dim=50, name='movie_embedding')(movie_input)
movie_vec = Flatten(name='movie_flatten')(movie_embedding)

# Contextual Features Input
context_input = Input(shape=(X_train.shape[1],), name='context_input')

# Concatenate movie embedding with context input
concat = Concatenate()([movie_vec, context_input])

# Dense Layers for cold-start movie recommendation
x = Dense(512, activation='relu')(concat)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(128, activation='relu')(x)
output = Dense(1, activation='linear')(x)

# Define Cold Start Movie Recommendation Model
cold_start_movie_model = Model(inputs=[movie_input, context_input], outputs=output)
cold_start_movie_model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Model summary
print(cold_start_movie_model.summary())

# Train the model
history_cold_start = cold_start_movie_model.fit(
    [movie_train, X_train], y_train,  # Using only training data
    epochs=5,
    batch_size=64,
    validation_split=0.2
)

X_train shape: (7500, 17543)
movie_train shape: (7500,)


None
Epoch 1/5
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 273ms/step - loss: 0.1366 - mae: 0.2845 - val_loss: 0.0622 - val_mae: 0.1986
Epoch 2/5
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 266ms/step - loss: 0.0565 - mae: 0.1827 - val_loss: 0.0621 - val_mae: 0.1984
Epoch 3/5
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 366ms/step - loss: 0.0454 - mae: 0.1547 - val_loss: 0.0710 - val_mae: 0.2145
Epoch 4/5
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 378ms/step - loss: 0.0418 - mae: 0.1399 - val_loss: 0.0690 - val_mae: 0.2101
Epoch 5/5
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 277ms/step - loss: 0.0310 - mae: 0.1159 - val_loss: 0.0646 - val_mae: 0.2011


In [23]:
from sklearn.metrics import mean_squared_error
import numpy as np

def manual_minmax_descale(scaled_ratings, original_min=1, original_max=10):
    """
    Converts MinMax scaled ratings back to their original scale.

    Args:
        scaled_ratings (np.array): The scaled ratings between 0 and 1.
        original_min (float): The minimum value of the original scale.
        original_max (float): The maximum value of the original scale.

    Returns:
        np.array: Descaled ratings in the original scale.
    """
    return scaled_ratings * (original_max - original_min) + original_min


def calculate_rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Function to Calculate Precision@K
def precision_at_k(y_true, y_pred, k):
    """Calculate Precision@K"""
    top_k_indices = np.argsort(y_pred)[-k:][::-1]  # Get top-k indices
    relevant = np.isin(top_k_indices, np.where(y_true == 1)[0]).sum()  # Count relevant hits
    return relevant / k

# Function to Calculate Recall@K
def recall_at_k(y_true, y_pred, k):
    """Calculate Recall@K"""
    top_k_indices = np.argsort(y_pred)[-k:][::-1]
    total_relevant = np.sum(y_true)
    if total_relevant == 0:
        return 0.0  # Avoid division by zero
    relevant = np.isin(top_k_indices, np.where(y_true == 1)[0]).sum()
    return relevant / total_relevant

# Predict on the test set
predictions = cold_start_movie_model.predict([movie_test, X_test]).flatten()

# Calculate RMSE
y_test_original = y_test.reshape(-1, 1).flatten()
rmse_score = calculate_rmse(y_test_original, predictions)
print("RMSE:", rmse_score)

# Convert True Ratings to Binary Relevance (e.g., ratings >= 4.0 are relevant)
y_true_binary = (y_test_original >= 0.4).astype(int)  # Adjust threshold since ratings are scaled between 0-1

# Calculate Precision@K and Recall@K for K=10
k = 10
k = min(k, len(y_true_binary))  # Adjust K if necessary

precision = precision_at_k(y_true_binary, predictions, k)
recall = recall_at_k(y_true_binary, predictions, k)

# Print Metrics
print(f"Precision@{k}: {precision:.4f}")
print(f"Recall@{k}: {recall:.4f}")


[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 63ms/step
RMSE: 0.2649399876642649
Precision@10: 0.9000
Recall@10: 0.0042


What Do These Results Indicate?
RMSE = 0.2649 (on scaled ratings)

Since your ratings were scaled between [0, 1], this corresponds to approximately:
0.2649
×
(
10
−
1
)
≈
2.38
0.2649×(10−1)≈2.38
A descaled RMSE of ~2.38 suggests that your model predicts fairly well but has some room for improvement.
Precision@10 = 0.9000

Out of the top 10 recommendations, 9 out of 10 movies are relevant (i.e., ratings ≥ 4.0).
This is a great result! It means your top recommendations are highly accurate.
Recall@10 = 0.0042

Only 0.42% of all relevant movies are being captured in the top 10 recommendations.
This suggests that while your model is good at recommending a few relevant movies (high precision), it misses most of the relevant movies (low recall).

RMSE table

0 - 1.0	Excellent (almost perfect predictions)

1.0 - 2.0	Good (strong predictive power)

2.0 - 3.0	Fair (room for improvement)

greater than 3.0	Poor (model is not performing well)


How to Improve RMSE Further
Here are some strategies to potentially improve your RMSE:

Feature Engineering:
Add more contextual features (e.g., release year, cast, user demographics).

Model Improvements:
Use deeper neural networks or advanced architectures like Neural Collaborative Filtering (NCF).

Hyperparameter Tuning:
Tune learning rates, batch sizes, or layer sizes.

Increase Epochs:
Train for more epochs with early stopping to prevent overfitting.

Regularization Techniques:
Add L2 regularization to prevent overfitting.

In [16]:
# Create a mapping from movie_id_encoded to primaryTitle
movie_id_to_title = dict(zip(merged_df['movie_ids'], merged_df['primaryTitle']))

# Generate binary encodings for genres, writers, and directors based on user preferences
def get_cold_start_context(fav_genres, fav_writers, fav_directors):
    # Encode genres, writers, and directors
    genre_vector = mlb_genres.transform([fav_genres]) if fav_genres else np.zeros((1, len(mlb_genres.classes_)))
    writer_vector = mlb_writers.transform([fav_writers]) if fav_writers else np.zeros((1, len(mlb_writers.classes_)))
    director_vector = mlb_directors.transform([fav_directors]) if fav_directors else np.zeros((1, len(mlb_directors.classes_)))

    # Generate dummy numerical features (imdb_rating and numVotes)
    dummy_numerical_features = np.zeros((1, 2))  # Placeholder for numerical features

    # Generate dummy user review embeddings
    embedding_dim = 17543 - (2 + len(mlb_genres.classes_) + len(mlb_writers.classes_) + len(mlb_directors.classes_))
    dummy_user_reviews = np.zeros((1, embedding_dim))

    # Concatenate all features to match the expected input size
    context_vector = np.concatenate([
        dummy_user_reviews,
        dummy_numerical_features,
        genre_vector,
        writer_vector,
        director_vector
    ], axis=1)

    return context_vector


# Function to recommend movies for cold-start users
def recommend_cold_start_movies(fav_genres=None, fav_writers=None, fav_directors=None, top_n=10):
    # Generate movie indices and ensure proper dtype
    all_movies = np.arange(len(movie_encoder.classes_)).reshape(-1, 1).astype(np.float32)  # Convert to float32

    # Generate context vector based on user preferences
    context_features = get_cold_start_context(fav_genres, fav_writers, fav_directors)

    # Repeat context for all movies and ensure proper dtype
    context_repeated = np.tile(context_features, (len(all_movies), 1)).astype(np.float32)  # Convert to float32

    # Check input shapes and types before prediction
    print("all_movies shape:", all_movies.shape, all_movies.dtype)
    print("context_repeated shape:", context_repeated.shape, context_repeated.dtype)

    # Predict ratings for all movies
    predictions = cold_start_movie_model.predict([all_movies, context_repeated]).flatten()

    # Descend the predicted ratings back to the original scale (1-10)
    descaled_predictions = manual_minmax_descale(predictions)

    # Retrieve top N movie indices
    top_indices = descaled_predictions.argsort()[-top_n:][::-1]
    recommended_movie_ids = movie_encoder.inverse_transform(top_indices)

    # Map movie IDs to movie titles
    recommended_movies = [movie_id_to_title.get(movie_id, "Unknown Movie") for movie_id in recommended_movie_ids]

    # Return the list of recommended movies with predicted ratings
    return [(title, descaled_predictions[i]) for i, title in zip(top_indices, recommended_movies)]



In [26]:
# Recommend top 5 movies for a cold-start user with provided preferences
top_cold_start_movies = recommend_cold_start_movies(
    fav_genres=['Action', 'Drama'],
    fav_writers=['nm0522871'],
    fav_directors=['nm0412650'],
    top_n=5
)

# Print the recommendations
print("Top 5 Cold-Start Movie Recommendations:")
for movie, score in top_cold_start_movies:
    print(f"Movie Title: {movie}, Predicted Rating: {score:.2f}")



all_movies shape: (7200, 1) float32
context_repeated shape: (7200, 17543) float32
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 29ms/step
Top 5 Cold-Start Movie Recommendations:
Movie Title: Champagne for Caesar, Predicted Rating: 7.68
Movie Title: The Last Mile, Predicted Rating: 7.53
Movie Title: Spring in Park Lane, Predicted Rating: 7.46
Movie Title: Pink Flamingos, Predicted Rating: 7.41
Movie Title: Outer Space Jitters, Predicted Rating: 7.38


In [27]:
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Dropout, Concatenate
from tensorflow.keras.models import Model

# Encode user IDs
user_encoder = LabelEncoder()
merged_df['user_id_encoded'] = user_encoder.fit_transform(merged_df['user_id'])

# Train-test split
X_train, X_test, y_train, y_test, user_train, user_test = train_test_split(
    X, y_scaled, merged_df['user_id_encoded'].values, test_size=0.25, random_state=42
)

# Check data shapes
print("X_train shape:", X_train.shape)
print("user_train shape:", user_train.shape)

# User Input and Embedding
user_input = Input(shape=(1,), name='user_input')
user_embedding = Embedding(input_dim=len(user_encoder.classes_), output_dim=50, name='user_embedding')(user_input)
user_vec = Flatten(name='user_flatten')(user_embedding)

# Movie Context Input
movie_context_input = Input(shape=(X_train.shape[1],), name='movie_context_input')

# Concatenate user embedding with movie context
concat_user_movie = Concatenate()([user_vec, movie_context_input])

# Dense Layers
x = Dense(512, activation='relu')(concat_user_movie)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(128, activation='relu')(x)
output = Dense(1, activation='linear')(x)

# Define Top-N User Recommendation Model
top_n_user_model = Model(inputs=[user_input, movie_context_input], outputs=output)
top_n_user_model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Model summary
print(top_n_user_model.summary())

# Train the model
history = top_n_user_model.fit(
    [user_train, X_train], y_train,  # Using only training data
    epochs=5,
    batch_size=64,
    validation_split=0.2
)


X_train shape: (7500, 17543)
user_train shape: (7500,)


None
Epoch 1/5
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 273ms/step - loss: 0.1304 - mae: 0.2827 - val_loss: 0.0676 - val_mae: 0.2041
Epoch 2/5
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 247ms/step - loss: 0.0625 - mae: 0.1839 - val_loss: 0.0670 - val_mae: 0.2070
Epoch 3/5
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 274ms/step - loss: 0.0409 - mae: 0.1489 - val_loss: 0.0671 - val_mae: 0.2049
Epoch 4/5
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 275ms/step - loss: 0.0440 - mae: 0.1308 - val_loss: 0.0682 - val_mae: 0.2078
Epoch 5/5
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 423ms/step - loss: 0.0444 - mae: 0.1170 - val_loss: 0.0649 - val_mae: 0.2028


In [29]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Function to Descale Predictions
def manual_minmax_descale(scaled_ratings, original_min=1, original_max=10):
    return scaled_ratings * (original_max - original_min) + original_min

# Function to Calculate RMSE
def calculate_rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Function to Calculate Precision@K
def precision_at_k(y_true, y_pred, k):
    """Calculate Precision@K"""
    top_k_indices = np.argsort(y_pred)[-k:][::-1]  # Get top-k indices
    relevant = np.isin(top_k_indices, np.where(y_true == 1)[0]).sum()  # Count relevant hits
    return relevant / k

# Function to Calculate Recall@K
def recall_at_k(y_true, y_pred, k):
    """Calculate Recall@K"""
    top_k_indices = np.argsort(y_pred)[-k:][::-1]
    total_relevant = np.sum(y_true)
    if total_relevant == 0:
        return 0.0  # Avoid division by zero
    relevant = np.isin(top_k_indices, np.where(y_true == 1)[0]).sum()
    return relevant / total_relevant

# Predict on the Test Set
predictions = top_n_user_model.predict([user_test, X_test]).flatten()

# # Descale Predictions Back to Original Rating Scale (1-10)
# descaled_predictions = manual_minmax_descale(predictions)

# Calculate RMSE
# Transform scaled y_test back to original ratings
y_test_original = y_test.reshape(-1, 1).flatten()
rmse_score = calculate_rmse(y_test_original, predictions)
print(f"RMSE: {rmse_score:.4f}")

# Convert True Ratings to Binary Relevance (ratings >= 4.0 are relevant)
y_true_binary = (y_test_original >= 0.4).astype(int)

# Calculate Precision@K and Recall@K for K=10
k = 10
k = min(k, len(y_true_binary))  # Adjust if fewer samples are present

precision = precision_at_k(y_true_binary, predictions, k)
recall = recall_at_k(y_true_binary, predictions, k)

# Print Evaluation Metrics
print(f"Precision@{k}: {precision:.4f}")
print(f"Recall@{k}: {recall:.4f}")



[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step
RMSE: 0.2635
Precision@10: 0.8000
Recall@10: 0.0038


What Do These Metrics Indicate?
RMSE = 0.2635 (Scaled)

When converted back to the original rating scale:
0.2635
×
(
10
−
1
)
≈
2.37
0.2635×(10−1)≈2.37
An RMSE of ~2.37 is acceptable for a movie recommendation system, though there’s still room for improvement.

Precision@10 = 0.8000

Out of the top 10 recommended users, 8 out of 10 were relevant (ratings ≥ 4.0).
This suggests the model is good at identifying highly relevant users for a given movie.

Recall@10 = 0.0038

The model is retrieving only 0.38% of all relevant users.
This indicates that while the recommendations are accurate (high precision), the model is missing most relevant users (low recall).

Why Is Recall So Low?
Imbalanced Data: There could be far more irrelevant users than relevant ones.
Top-10 Limitation: You might have many relevant users, but you're only retrieving 10 recommendations.
Overfitting on Specific Features: The model might be overfitting on dominant features and missing diverse user preferences.
🔧 How to Improve Recall Without Sacrificing Precision
✅ Increase K Value

Evaluate for higher K values (e.g., 20, 50) to see if recall improves.
✅ Add Regularization

Introduce L2 regularization to avoid overfitting.
✅ Incorporate More User Features

Add additional user metadata like watch frequency, historical preferences, or interaction history.
✅ Weighted Loss Function

Penalize missing relevant users more heavily.
✅ Train Longer with Early Stopping

Increase the number of training epochs with early stopping to avoid overfitting.
Visualize Precision@K and Recall@K for Different K Values
Generate a plot to show how Precision@K and Recall@K change for different values of K (e.g., 5, 10, 20, 50)? This can help assess the ideal number of recommendations to provide for users.

In [49]:
# ✅ Generate binary encodings for genres, writers, and directors
def get_movie_context(fav_genres, fav_writers, fav_directors):
    # Encode genres, writers, and directors using MultiLabelBinarizer
    genre_vector = mlb_genres.transform([fav_genres]) if fav_genres else np.zeros((1, len(mlb_genres.classes_)))
    writer_vector = mlb_writers.transform([fav_writers]) if fav_writers else np.zeros((1, len(mlb_writers.classes_)))
    director_vector = mlb_directors.transform([fav_directors]) if fav_directors else np.zeros((1, len(mlb_directors.classes_)))

    # Generate dummy numerical features (imdb_rating and numVotes)
    dummy_numerical_features = np.zeros((1, 2))  # Assuming these were part of the original features

    # Generate dummy user review embeddings
    embedding_dim = X_train.shape[1] - (2 + len(mlb_genres.classes_) + len(mlb_writers.classes_) + len(mlb_directors.classes_))
    dummy_user_reviews = np.zeros((1, embedding_dim))

    # Concatenate all features to match the expected input size
    context_vector = np.concatenate([
        dummy_user_reviews,
        dummy_numerical_features,
        genre_vector,
        writer_vector,
        director_vector
    ], axis=1)

    return context_vector

# ✅ Create a mapping from user_id_encoded to user_id
user_id_to_original = dict(zip(range(len(user_encoder.classes_)), user_encoder.classes_))


# ✅ Function to recommend users for a movie based on context features
def recommend_top_n_users(fav_genres=None, fav_writers=None, fav_directors=None, top_n=10):
    # Generate movie context features
    movie_context = get_movie_context(fav_genres, fav_writers, fav_directors)

    # Prepare inputs for all users
    num_users = len(user_encoder.classes_)
    user_ids = np.arange(num_users).astype(np.int32).reshape(-1, 1)
    movie_context_repeated = np.tile(movie_context, (num_users, 1))

    # Predict ratings using the trained model
    predictions = top_n_user_model.predict([user_ids, movie_context_repeated])
    predictions = predictions.flatten()

    # Descend predictions back to the original rating scale (1-10)
    descaled_predictions = manual_minmax_descale(predictions)

    # Get top N user indices
    top_indices = descaled_predictions.argsort()[-top_n:][::-1]

    # Directly use integer user IDs to access the mapping
    recommended_users = []
    for i in top_indices:
        if i in user_id_to_original:
            original_user_id = user_id_to_original[i]
            recommended_users.append((original_user_id, descaled_predictions[i]))
        else:
            print(f"Warning: Encoded User ID {i} not found in mapping.")

    return recommended_users




In [53]:
# Recommend top 5 users for a movie with provided genres, writers, and directors
top_users = recommend_top_n_users(
    fav_genres=['Drama', 'Thriller'],
    fav_writers=['nm0522871'],  # IMDb writer IDs
    fav_directors=['nm0412650'],  # IMDb director IDs
    top_n=5
)

# ✅ Print the top recommended users
print("Top 5 Users Likely to Enjoy This Movie:")
for user_id, score in top_users:
    print(f"User ID: {user_id}, Predicted Rating: {score:.2f}")


[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step
Top 5 Users Likely to Enjoy This Movie:
User ID: ur75042462, Predicted Rating: 8.66
User ID: ur48096051, Predicted Rating: 8.55
User ID: ur0762667, Predicted Rating: 8.49
User ID: ur0231836, Predicted Rating: 8.48
User ID: ur4453273, Predicted Rating: 8.44


# Chunk model to train model 10k rows.

In [None]:
merged_df = pd.read_csv(merged_df_loc, index_col=0).head(20000)

import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, MinMaxScaler
import ast

# Safely convert string lists to Python lists using ast.literal_eval
def safe_eval_list(x):
    if isinstance(x, str):
        return ast.literal_eval(x)
    return x

# Convert string lists into proper lists
merged_df['genres'] = merged_df['genres'].apply(safe_eval_list)
merged_df['directors_list'] = merged_df['directors_list'].apply(safe_eval_list)
merged_df['writers_list'] = merged_df['writers_list'].apply(safe_eval_list)

# Convert string representations of embeddings to numpy arrays
def convert_string_to_array(x):
    if isinstance(x, str):
        return np.fromstring(x.strip('[]'), sep=' ')
    return np.array(x)

# Apply conversion across the DataFrame
merged_df['user_reviews_padded'] = merged_df['user_reviews_padded'].apply(convert_string_to_array)

# Encode categorical variables
mlb_genres = MultiLabelBinarizer()
mlb_writers = MultiLabelBinarizer()
mlb_directors = MultiLabelBinarizer()

genres_encoded = mlb_genres.fit_transform(merged_df['genres'])
writers_encoded = mlb_writers.fit_transform(merged_df['writers_list'])
directors_encoded = mlb_directors.fit_transform(merged_df['directors_list'])

# Normalize numerical features
scaler_numeric = MinMaxScaler()
numeric_features = scaler_numeric.fit_transform(merged_df[['imdb_rating', 'numVotes']])

# Consistent user_reviews_padded embeddings
embedding_dim = len(merged_df['user_reviews_padded'].iloc[0])
merged_df['user_reviews_padded'] = merged_df['user_reviews_padded'].apply(
    lambda x: x if len(x) == embedding_dim else np.zeros(embedding_dim)
)

# Concatenate all features into X
X = np.concatenate([
    np.vstack(merged_df['user_reviews_padded'].values),
    numeric_features,
    genres_encoded,
    writers_encoded,
    directors_encoded
], axis=1)

# Define y (scaled user ratings)
y = merged_df['user_ratings'].values

In [4]:
import numpy as np
import tensorflow.keras.backend as K
import gc


# Function to manually split X, y, and user IDs into chunks
def split_data_into_chunks(X, y, user_ids, chunk_size=10000):
    total_size = X.shape[0]
    chunks = []
    for i in range(0, total_size, chunk_size):
        X_chunk = X[i:i + chunk_size]
        y_chunk = y[i:i + chunk_size]
        user_ids_chunk = user_ids[i:i + chunk_size]
        chunks.append(([user_ids_chunk, X_chunk], y_chunk))
    return chunks

from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Dropout, Concatenate
from tensorflow.keras.models import Model

def create_top_n_user_model():
    # ✅ User Input and Embedding
    user_input = Input(shape=(1,), name='user_input')
    user_embedding = Embedding(
        input_dim=len(user_encoder.classes_),  # Number of unique users
        output_dim=50,  # Embedding size for users
        name='user_embedding'
    )(user_input)
    user_vec = Flatten(name='user_flatten')(user_embedding)

    # ✅ Movie Context Input (e.g., genres, writers, directors, user reviews, IMDb ratings)
    movie_context_input = Input(shape=(X_train.shape[1],), name='movie_context_input')

    # ✅ Concatenate user embeddings with movie context
    concat_user_movie = Concatenate()([user_vec, movie_context_input])

    # ✅ Dense Layers for Deep Feature Learning
    x = Dense(512, activation='relu')(concat_user_movie)
    x = Dropout(0.3)(x)  # Dropout layer for regularization
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = Dense(128, activation='relu')(x)

    # ✅ Output Layer: Predict user rating
    output = Dense(1, activation='linear')(x)  # Regression output

    # ✅ Define and Compile the Model
    model = Model(inputs=[user_input, movie_context_input], outputs=output)
    model.compile(
        optimizer='adam',
        loss='mse',  # Mean Squared Error for regression
        metrics=['mae']  # Mean Absolute Error for evaluation
    )

    return model



# Split the dataset into chunks
# Verify chunk sizes before training
user_encoder = LabelEncoder()
merged_df['user_id_encoded'] = user_encoder.fit_transform(merged_df['user_id'])

# Train-test split
X_train, X_test, y_train, y_test, user_train, user_test = train_test_split(
    X, y, merged_df['user_id_encoded'].values, test_size=0.25, random_state=42
)
data_chunks = split_data_into_chunks(X_train, y_train, user_train, chunk_size=10000)

print(f"Total training samples: {X_train.shape[0]}")
for idx, (input_data, y_chunk) in enumerate(data_chunks):
    print(f"Chunk {idx + 1}:")
    print(f"  User IDs shape: {input_data[0].shape}")
    print(f"  X_chunk shape: {input_data[1].shape}")
    print(f"  y_chunk shape: {y_chunk.shape}")

# Initialize the model
# Train the model incrementally with memory clearing
K.clear_session()
gc.collect()

chunk_model = create_top_n_user_model()
chunk_count = 0
loss_history = []

for (input_data, y_chunk) in data_chunks:
    user_ids_chunk, X_chunk = input_data

    print(f"Training on chunk {chunk_count + 1}/{len(data_chunks)}")
    print(f"User IDs shape: {user_ids_chunk.shape}, X_chunk shape: {X_chunk.shape}, y_chunk shape: {y_chunk.shape}")

    # Train the model on the current chunk
    history = chunk_model.fit(
        [user_ids_chunk, X_chunk],
        y_chunk,
        epochs=5,
        batch_size=16,
        verbose=1
    )

    # Store loss history
    loss_history.append(history.history['loss'])

    # Clear TensorFlow session and garbage collect memory
    K.clear_session()
    gc.collect()

    chunk_count += 1

print("Loss history for each chunk:", loss_history)



Total training samples: 15000
Chunk 1:
  User IDs shape: (10000,)
  X_chunk shape: (10000, 21634)
  y_chunk shape: (10000,)
Chunk 2:
  User IDs shape: (5000,)
  X_chunk shape: (5000, 21634)
  y_chunk shape: (5000,)
Training on chunk 1/2
User IDs shape: (10000,), X_chunk shape: (10000, 21634), y_chunk shape: (10000,)
Epoch 1/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 325ms/step - loss: 0.1102 - mae: 0.2503
Epoch 2/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m224s[0m 265ms/step - loss: 0.1483 - mae: 0.1907
Epoch 3/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 267ms/step - loss: 0.0550 - mae: 0.1469
Epoch 4/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m198s[0m 262ms/step - loss: 0.0306 - mae: 0.1160
Epoch 5/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 266ms/step - loss: 0.0254 - mae: 0.0969
Training on chunk 2/2
User IDs shape: (5000,), X_chunk shape: (5000, 21634), y_chunk shape: 

In [5]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Function to Descale Predictions
def manual_minmax_descale(scaled_ratings, original_min=1, original_max=10):
    return scaled_ratings * (original_max - original_min) + original_min

# Function to Calculate RMSE
def calculate_rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Function to Calculate Precision@K
def precision_at_k(y_true, y_pred, k):
    """Calculate Precision@K"""
    top_k_indices = np.argsort(y_pred)[-k:][::-1]  # Get top-k indices
    relevant = np.isin(top_k_indices, np.where(y_true == 1)[0]).sum()  # Count relevant hits
    return relevant / k

# Function to Calculate Recall@K
def recall_at_k(y_true, y_pred, k):
    """Calculate Recall@K"""
    top_k_indices = np.argsort(y_pred)[-k:][::-1]
    total_relevant = np.sum(y_true)
    if total_relevant == 0:
        return 0.0  # Avoid division by zero
    relevant = np.isin(top_k_indices, np.where(y_true == 1)[0]).sum()
    return relevant / total_relevant

# Predict on the Test Set
predictions = chunk_model.predict([user_test, X_test]).flatten()

# # Descale Predictions Back to Original Rating Scale (1-10)
# descaled_predictions = manual_minmax_descale(predictions)

# Calculate RMSE
# Transform scaled y_test back to original ratings
y_test_original = y_test.reshape(-1, 1).flatten()
rmse_score = calculate_rmse(y_test_original, predictions)
print(f"RMSE: {rmse_score:.4f}")

# Convert True Ratings to Binary Relevance (ratings >= 4.0 are relevant)
y_true_binary = (y_test_original >= 0.4).astype(int)

# Calculate Precision@K and Recall@K for K=10
k = 10
k = min(k, len(y_true_binary))  # Adjust if fewer samples are present

precision = precision_at_k(y_true_binary, predictions, k)
recall = recall_at_k(y_true_binary, predictions, k)

# Print Evaluation Metrics
print(f"Precision@{k}: {precision:.4f}")
print(f"Recall@{k}: {recall:.4f}")

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 38ms/step
RMSE: 0.2557
Precision@10: 0.8000
Recall@10: 0.0018
