In [None]:
pip install recommenders


In [26]:
import kagglehub 
import pandas as pd 
import numpy as np 
import os 
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf 
from transformers import DistilBertTokenizer, TFDistilBertModel
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense, Dropout,Multiply
from tensorflow.keras.models import Model
from recommenders.datasets.python_splitters import python_chrono_split
from math import sqrt
from tensorflow.keras.callbacks import EarlyStopping
from time import time
from recommenders.models.ncf.ncf_singlenode import NCF
from recommenders.models.ncf.dataset import Dataset as NCFDataset
from recommenders.evaluation.python_evaluation import (
    map, ndcg_at_k, precision_at_k, recall_at_k
)
import random
import json
from tensorflow.keras.callbacks import Callback
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm 
from collections import defaultdict
import warnings

#### STEP 1 : LOADING THE DATASET FROM KAGGLE 

In [None]:
# Sanity check: list the devices TensorFlow can currently see.
devices = tf.config.get_visible_devices()
print(devices)

In [28]:
# Create a mirrored distribution strategy.
# NOTE(review): in the cells visible here `strategy` is only referenced from
# commented-out code — confirm it is still needed.
strategy = tf.distribute.MirroredStrategy()

In [None]:
# Download the Letterboxd ratings dataset through kagglehub and keep the
# returned location in `path`; the CSV paths below are built from it.
path = kagglehub.dataset_download("samlearner/letterboxd-movie-ratings-data")

print("Path to dataset files:", path)

In [30]:
# File names of the three CSV exports shipped with the dataset.
movie_data_filename, ratings_filename, users_filename = (
    'movie_data.csv',
    'ratings_export.csv',
    'users_export.csv',
)

# Locations of those files underneath the download directory `path`.
movie_data_path, ratings_path, users_path = (
    os.path.join(path, csv_name)
    for csv_name in (movie_data_filename, ratings_filename, users_filename)
)

In [31]:
# movie_data_df = pd.read_csv(movie_data_path,engine='python')
# ratings_df = pd.read_csv(ratings_path,engine='python')
# users_df = pd.read_csv(users_path,engine = 'python')

In [32]:
#df = pd.read_parquet('movie_data.parquet', engine='pyarrow')
# Load the movie, rating and user tables from Parquet files using the pyarrow
# engine (the CSV-based loading in the previous cell is commented out).
# NOTE(review): the paths are absolute and Kaggle-specific, so this cell only
# works where that input dataset is mounted.
movie_data_df = pd.read_parquet('/kaggle/input/parquet-dataset/movie_data.parquet',engine='pyarrow')
ratings_df = pd.read_parquet('/kaggle/input/parquet-dataset/ratings_data.parquet',engine='pyarrow')
users_df = pd.read_parquet('/kaggle/input/parquet-dataset/users_dt.parquet',engine='pyarrow')

In [None]:
# Peek at the first rows of the movie metadata table.
print(movie_data_df.head())


Unnecessary columns need to be removed: image URL, IMDb ID, IMDb link (perhaps needed to enhance the model with IMDb data?), TMDB ID, TMDB link (download that dataset and enhance with it?), production countries.

In [None]:
# Column dtypes and non-null counts for the movie table.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() also printed a stray "None".
movie_data_df.info()


In [None]:
# Peek at the first rows of the ratings table.
print(ratings_df.head())

In [None]:
# Column dtypes and non-null counts for the ratings table.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() also printed a stray "None".
ratings_df.info()

In [None]:
# Report how many distinct values each object-typed movie column holds.
for col in movie_data_df.select_dtypes(include='object').columns:
    print(f"{col}: {movie_data_df[col].nunique()} unique values")


In [38]:
#movie_data_df.to_parquet('movie_data.parquet', engine='pyarrow')
#ratings_df.to_parquet('ratings_data.parquet', engine = 'pyarrow')
#users_df.to_parquet('users_dt.parquet',engine = 'pyarrow')
# Load from Parquet file



#### STEP 2: EXPLORATORY DATA ANALYSIS 

In [None]:
# Drop movies without a release date: further down the release date is
# attached to every rating as its `timestamp` for the chronological split.
movie_data_df = movie_data_df.dropna(subset=['release_date'])
print(movie_data_df)

In [None]:
# Parse release dates (YYYY-MM-DD) into datetimes and order the movies
# chronologically; built as one chained expression instead of in-place edits.
movie_data_df = (
    movie_data_df
    .assign(release_date=pd.to_datetime(movie_data_df['release_date'], format='%Y-%m-%d'))
    .sort_values('release_date')
)
print(movie_data_df.head())

In [41]:
# Cast the `_id` columns of both tables to strings.
# NOTE(review): the merge below joins on `movie_id`, and the ratings `_id`
# column is dropped later, so this cast may be unnecessary — confirm.
ratings_df['_id'] = ratings_df['_id'].astype(str)
movie_data_df['_id'] = movie_data_df['_id'].astype(str)

In [42]:
# Attach each movie's release date to its ratings by joining on `movie_id`.
merged_df = ratings_df.merge(
    movie_data_df[['movie_id', 'release_date']],  
    on='movie_id', 
    how='inner'  # keep only ratings whose movie exists in movie_data_df; 'left' would keep every rating
)

In [None]:
# Use the release date as the `timestamp` column so the data can be split
# chronologically later on, and discard the `_id` column.
merged_df = (
    merged_df
    .rename(columns={'release_date': 'timestamp'})
    .drop(columns=['_id'])
)
# merged_df now holds the remaining ratings columns plus `timestamp`.
print(merged_df.head())

In [44]:
# Rename the columns to the userID / itemID / rating names that the later
# cells (chronological split and NCF CSV files) refer to.
merged_df = merged_df.rename(
    columns={
        "movie_id": "itemID",
        "rating_val": "rating",
        "user_id": "userID"
    }
)

In [45]:
# Order the interactions by user and then by timestamp, with a fresh 0..n-1 index.
merged_df = merged_df.sort_values(['userID', 'timestamp']).reset_index(drop=True)

#### STEP 3: DEFINING THE TASK, EVALUATION METRICS, CONSTRUCTING MODEL 

In [None]:
# Load the pretrained DistilBERT tokenizer and TensorFlow encoder.
# NOTE(review): in the cells visible here both are only used by commented-out
# code, and the name `model` is rebound to the recommenders NCF model later.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')


In [None]:
#model.load_weights('./checkpoints/ncf_model')

In [None]:
# texts = movie_data_df['overview'].fillna('').tolist()
# inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="tf", max_length=128)
# with strategy.scope():
#     outputs = model(**inputs).last_hidden_state.mean(dim=1)

# # Get embeddings (use outputs.last_hidden_state or outputs.pooler_output)
# movie_df['embedding'] = embeddings.numpy().tolist()

In [47]:
# Keep only ratings that have a user, a movie and a rating value.
ratings_df = ratings_df.dropna(subset=['user_id', 'movie_id', 'rating_val'])

In [None]:
# Preview the ratings after removing incomplete rows.
ratings_df.head()

In [None]:
# (rows, columns) of the ratings table at this point.
ratings_df.shape

# Non-integer fields must be mapped to unique numeric values — the movie ID and user ID in this case.

In [50]:
# Assign every distinct user / movie a consecutive integer index, in order of
# first appearance, so they can be fed to embedding layers.
distinct_users = ratings_df['user_id'].unique()
distinct_movies = ratings_df['movie_id'].unique()
user_mapping = dict(zip(distinct_users, range(len(distinct_users))))
item_mapping = dict(zip(distinct_movies, range(len(distinct_movies))))


In [None]:
# Show the first ten (original user id, integer index) pairs as a sanity check.
# Despite its name, `sample_items` holds entries of the *user* mapping.
sample_items = list(user_mapping.items())[:10]
print(sample_items)

In [None]:
# Replace the raw user / movie identifiers with their integer indices.
# NOTE: not idempotent — running this cell a second time looks the already
# mapped integers up in the original-id dictionaries.
ratings_df['user_id'] = ratings_df['user_id'].map(user_mapping)
ratings_df['movie_id'] = ratings_df['movie_id'].map(item_mapping)
ratings_df.head()

In [None]:
# Drop the `_id` column from the ratings frame.
ratings_df = ratings_df.drop(['_id'],axis = 1)
ratings_df.head()

In [54]:
# Random split in two stages: first hold out 20% of all ratings as `val`, then
# split the remaining 80% again into 80/20, i.e. roughly 64% train, 16% test
# and 20% validation of the full data.
train, val = train_test_split(ratings_df, test_size=0.2, random_state=42)
train, test = train_test_split(train, test_size = 0.2, random_state = 42)

In [None]:
# Vocabulary sizes for the user and item embedding layers.
n_users = len(user_mapping)
n_items = len(item_mapping)
print(f'No. users : {n_users}, no. items : {n_items}')

# It is easiest to construct the model with the Keras functional API, since the network has multiple inputs. Three dense layers of sizes 64, 32 and 16 follow the embedding layers.

### PART 1: MLP ONLY MODEL 

In [None]:
# MLP-only collaborative-filtering model: user and item indices are embedded,
# the two embeddings are concatenated and passed through a stack of dense
# layers that regresses the rating value.
embedding_dim = 32  # Latent factor size
mlp_layer_sizes = [64, 32, 16]  # Fully connected layers

# 1d input for user and item 
user_input = Input(shape=(1,), name='user_input')
item_input = Input(shape=(1,), name='item_input')

# Embedding layers (one row per user index / item index)
user_embedding = Embedding(n_users, embedding_dim, name='user_embedding')(user_input)
item_embedding = Embedding(n_items, embedding_dim, name='item_embedding')(item_input)

# Flatten embeddings
user_vec = Flatten()(user_embedding)
item_vec = Flatten()(item_embedding)

# Concatenate embeddings
concat_vec = Concatenate()([user_vec, item_vec])
# MLP layers: each ReLU dense layer is followed by 20% dropout
mlp = concat_vec
for size in mlp_layer_sizes:
    mlp = Dense(size, activation='relu')(mlp)
    mlp = Dropout(0.2)(mlp)

# Output layer (e.g., single rating prediction); linear because the target is
# the raw rating value
output = Dense(1, activation='linear', name='output')(mlp)

# Build and compile the model: MSE loss, MAE reported as an extra metric
ncf_model = Model(inputs=[user_input, item_input], outputs=output)
ncf_model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

ncf_model.summary()


In [None]:
# Re-check which devices TensorFlow sees before training starts.
tf.config.get_visible_devices()

In [58]:

# For every user index, collect the set of movie indices they rated in the
# training split. The same name is rebuilt as a plain dict in the sampled
# evaluation cell further down.
train_dict = defaultdict(set)

# Assuming your train DataFrame has columns 'user_id' and 'movie_id':
for user, item in zip(train['user_id'], train['movie_id']):
    train_dict[user].add(item)

In [None]:
def _inputs_and_target(frame):
    """Return ([user indices, movie indices], ratings) arrays for one split."""
    return [frame['user_id'].values, frame['movie_id'].values], frame['rating_val'].values


x_train, y_train = _inputs_and_target(train)
x_val, y_val = _inputs_and_target(val)
x_test, y_test = _inputs_and_target(test)
print(y_test.shape)
print(y_val.shape)

# Stop as soon as the validation loss fails to improve (patience 0) and roll
# back to the best weights; all remaining options keep their Keras defaults.
early_stop_callback = EarlyStopping(
    monitor='val_loss',
    patience=0,
    restore_best_weights=True,
)

# Train the model
history = ncf_model.fit(
    x_train,
    y_train,
    batch_size=256,
    epochs=5,
    validation_data=(x_val, y_val),
    verbose=1,
    callbacks=[early_stop_callback],
)

In [None]:
#model.save_weights('/kaggle/input/parquet-dataset/ncf_model')


In [None]:
# Loss (MSE) and MAE on the held-out test split.
loss, mae = ncf_model.evaluate(x_test, y_test)
print(f"Test Loss: {loss:.4f}, Test MAE: {mae:.4f}")

# RMSE, computed from the per-row prediction errors.
predictions = ncf_model.predict(x_test)
residuals = predictions.flatten() - y_test
rmse = sqrt(np.mean(np.square(residuals)))
print(f"Test RMSE: {rmse:.4f}")

In [None]:
# Produce the ten highest-scoring movies for a single user.
user_id = 'jay'  # Replace with a valid user_id
user_idx = user_mapping.get(user_id)
print(user_idx)
if user_idx is None:
    # Fail with a clear message instead of handing an array of None to Keras.
    raise KeyError(f"user_id {user_id!r} is not present in user_mapping")

# Predict ratings for all items
all_items = np.arange(n_items)
user_tensor = np.full(n_items, user_idx)
predicted_ratings = ncf_model.predict([user_tensor, all_items])

# Get top 10 recommendations
top_items = np.argsort(predicted_ratings.flatten())[::-1][:10]
# Invert the item mapping once; the previous list(...).index(idx) lookup
# rebuilt two lists and scanned them for every recommended item.
index_to_movie_id = {idx: movie_id for movie_id, idx in item_mapping.items()}
recommended_movie_ids = [index_to_movie_id[int(idx)] for idx in top_items]
print(f"Recommended movies for user {user_id}: {recommended_movie_ids}")

In [None]:
# `rating_val` in the test split is the ground truth, so expose it as `rating`
# (the label column the ranking metrics expect). It was previously renamed to
# `prediction`, which left `test` without a `rating` column for the metrics.
test = test.rename(columns={'rating_val': 'rating'})

In [None]:
# Inspect the test split after the column rename.
print(test)

In [None]:
# all_user_ids = list(user_mapping.keys())       # e.g. original user IDs
# all_user_idxs = list(user_mapping.values())    # integer indices
# BATCH_SIZE = 512
# all_items = np.arange(n_items)

# predictions_list = []
# for i in range(0, len(all_user_idxs), BATCH_SIZE):
#     # Take a chunk of users
#     user_idx_batch = all_user_idxs[i : i + BATCH_SIZE]
#     user_id_batch = all_user_ids[i : i + BATCH_SIZE]

#     # Repeat items for each user in the batch
#     # shape: (#users_in_batch * n_items,)
#     tile_users = np.repeat(user_idx_batch, n_items)
#     tile_items = np.tile(all_items, len(user_idx_batch))

#     # Model predict on that entire chunk
#     batch_preds = ncf_model.predict([tile_users, tile_items])  

#     # Now we map predictions back to (user, item) pairs
#     # We'll build a DataFrame
#     df_chunk = pd.DataFrame({
#         "userID": np.repeat(user_id_batch, n_items),
#         "itemID": tile_items,
#         "prediction": batch_preds
#     })
#     predictions_list.append(df_chunk)

# predictions_df = pd.concat(predictions_list, ignore_index=True)
# ### 1 HOUR 45 MINS FOR ENTIRE DATASET 

In [None]:
#print(predictions_df)

In [None]:
# Align the held-out frame with the userID / itemID column names used by the
# ranking-metric calls in the next cell.
test.rename(
    columns={
        "movie_id": "itemID",
        "user_id": "userID",
        "rating_val": "rating"  # this is your ground truth rating (no effect if `rating_val` was already renamed earlier)
    },
    inplace=True,
)

# And ensure predictions_df has the same userID/itemID columns plus 'prediction'
# NOTE(review): the only visible assignment to `predictions_df` is inside a
# commented-out cell, so on a fresh kernel this call raises NameError unless
# that cell is restored — confirm.
predictions_df.rename(
    columns={
        "movie_id": "itemID",  # if you had that column
        "user_id": "userID"
        # 'prediction' can stay as 'prediction'
    },
    inplace=True,
)

In [None]:
# Ranking cutoff for the @K metrics. It was not assigned in any cell visible
# here, so the calls below failed with NameError; 10 matches the `k` used by
# the sampled evaluation later in the notebook.
TOP_K = 10

# Evaluate predictions
# NOTE(review): `predictions_df` must hold one row per scored (userID, itemID)
# pair with a `prediction` column; its only visible definition is commented
# out — confirm it exists before running this cell.
#eval_map = map(test, predictions_df, col_prediction='prediction', k=TOP_K)
eval_ndcg = ndcg_at_k(test, predictions_df, col_prediction='prediction', k=TOP_K)
print("reaches this")
eval_precision = precision_at_k(test, predictions_df, col_prediction='prediction', k=TOP_K)
print("reaches this")
eval_recall = recall_at_k(test, predictions_df, col_prediction='prediction', k=TOP_K)
print("reaches this")
print(
    #f"MAP:\t{eval_map:.6f}",
    f"NDCG:\t{eval_ndcg:.6f}",
    f"Precision@K:\t{eval_precision:.6f}",
    f"Recall@K:\t{eval_recall:.6f}",
    sep='\n'
)

## THE COMPLEXITY OF PREDICTING FOR ALL USERS AND ITEMS IS (no_users * no_items) -> VERY LARGE.
Use negative sampling instead: take 1 item the user interacted with, plus 50 or 100 items the user did NOT interact with (did not rate), and rank the positive item among them.

# With manual testing for different users, the model basically outputs the highest-rated shows and movies that the user has not rated. How can this be improved? -> BERT tokenization of movie descriptions into embeddings.

In [None]:
# Inspect the test split before the sampled evaluation.
print(test)

In [None]:
# Sampled ranking evaluation: for each evaluated user, one held-out item is
# ranked against randomly drawn items the user did not rate in training, and
# Hit@k / NDCG@k are averaged over the users.

# Items each user rated in the training split (excluded from the negatives).
train_dict = {}
for user_id, subset in train.groupby('user_id'):
    train_dict[user_id] = set(subset['movie_id'].unique())

# Build 'all_items_set' of all unique itemIDs in train + test
all_items_set = set(train['movie_id'].unique()) | set(test['itemID'].unique())

# Number of items to evaluate
k = 10
n_negatives = 99
# Dedicated seeded generator so the sampled negatives, and therefore the
# reported metrics, are reproducible between runs.
rng = random.Random(42)
hits = []
ndcgs = []

unique_user_ids = test['userID'].unique()

# We'll limit the loop to the first 100 users 
for user_id in tqdm(unique_user_ids[:100], desc="Processing users"):
    # Only the first test item of each user is used as the positive.
    test_items = test.loc[test['userID'] == user_id, 'itemID'].values
    pos_item = test_items[0]

    # Build negative set (exclude the user's training items + the positive item).
    # .get() covers users that occur in test but have no training rows, which
    # previously raised KeyError.
    user_train_items = train_dict.get(user_id, set()) | {pos_item}
    possible_negatives = list(all_items_set - user_train_items)

    # Sample up to 99 negatives (fewer only if the candidate pool is smaller,
    # where random.sample would otherwise raise ValueError).
    neg_items = rng.sample(possible_negatives, min(n_negatives, len(possible_negatives)))

    # Combine into a batch
    item_batch = [pos_item] + neg_items
    user_batch = [user_id] * len(item_batch)

    # Get predictions (change ncf_model to your actual model); verbose=0 keeps
    # Keras from printing a progress line for every user.
    scores = ncf_model.predict([np.array(user_batch), np.array(item_batch)], verbose=0)
    # Flatten to 1-D explicitly so indexing also works for a single-row batch.
    scores = np.asarray(scores).reshape(-1)

    # Positive item is at index 0
    pos_score = scores[0]
    rank = np.sum(scores >= pos_score)  # 1-based rank (ties count against the positive)

    if rank <= k:
        hits.append(1)
        ndcgs.append(1 / np.log2(rank + 1))
    else:
        hits.append(0)
        ndcgs.append(0)

# Finally, compute and print
hr = np.mean(hits)
ndcg = np.mean(ndcgs)
print(f"Hit@{k}: {hr:.4f}")
print(f"NDCG@{k}: {ndcg:.4f}")

### PART 1: GMF AND MLP INHOUSE MODELS TRAINED SEPARATELY 

### PART 2 : NEU_MF MODEL - GMF + MLP INHOUSE MODEL 

### PART 3 : NEU_MF MODEL - GMF + MLP FROM RECOMMENDERS LIBRARY

In [None]:
# In-house NeuMF model: a GMF branch (element-wise product of user and item
# embeddings) and an MLP branch (dense layers over the concatenated
# embeddings) are fused by a final sigmoid unit.
# NOTE: this cell rebinds `user_input`, `item_input`, `output` and
# `mlp_layer_sizes`, which the MLP-only model cell also defines.
gmf_embed_dim = 16
mlp_embed_dim = 16
mlp_layer_sizes = [64, 32, 16]

# ----- 1) Define two sets of Embeddings for GMF and MLP -----
user_input = Input(shape=(1,), name='user_input')
item_input = Input(shape=(1,), name='item_input')

# GMF embeddings
gmf_user_embedding = Embedding(n_users, gmf_embed_dim, name='gmf_user_embedding')(user_input)
gmf_item_embedding = Embedding(n_items, gmf_embed_dim, name='gmf_item_embedding')(item_input)
gmf_user_vec = Flatten()(gmf_user_embedding)
gmf_item_vec = Flatten()(gmf_item_embedding)

# Element-wise multiply for GMF part
gmf_out = Multiply()([gmf_user_vec, gmf_item_vec])

# MLP embeddings (separate from the GMF embeddings)
mlp_user_embedding = Embedding(n_users, mlp_embed_dim, name='mlp_user_embedding')(user_input)
mlp_item_embedding = Embedding(n_items, mlp_embed_dim, name='mlp_item_embedding')(item_input)
mlp_user_vec = Flatten()(mlp_user_embedding)
mlp_item_vec = Flatten()(mlp_item_embedding)

# Concatenate for MLP part
mlp_vec = Concatenate()([mlp_user_vec, mlp_item_vec])

# ----- 2) Pass MLP concat through the hidden layers -----
# (no Dropout here, unlike the MLP-only model above)
mlp_out = mlp_vec
for size in mlp_layer_sizes:
    mlp_out = Dense(size, activation='relu')(mlp_out)

# ----- 3) Final NeuMF fusion -----
fusion = Concatenate()([gmf_out, mlp_out])  # combine GMF & MLP
# Sigmoid + binary cross-entropy treats the target as a 0/1 interaction label.
# NOTE(review): no cell visible here fits this model — confirm the labels it
# is meant to be trained on.
output = Dense(1, activation='sigmoid', name='output')(fusion)

neu_mf_model = Model(inputs=[user_input, item_input], outputs=output)
neu_mf_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
neu_mf_model.summary()

The loss function and output layer can be either binary cross-entropy with a sigmoid output (the user interacts or doesn't), or mean squared error with a linear output (rating prediction).

In [None]:
# Inspect the training split.
print(train)

In [None]:
# Inspect the test split.
print(test)

In [None]:
# Give the random train split the userID / itemID / rating column names.
# NOTE: `train_df` is overwritten by the chronological split a few cells below.
train_df = train.rename(
    columns={
        "movie_id": "itemID",
        "rating_val": "rating",
        "user_id": "userID"
    }
)
# Rename a `prediction` column in `test` to `rating`; this has no effect when
# `test` has no column of that name.
test = test.rename(
    columns={
        "prediction": "rating"  
    }
)
# The columns have to be renamed for the recommenders dataset loader.

In [None]:
# Inspect the integer-indexed ratings table.
print(ratings_df)

In [None]:
# Copy of the ratings table with renamed columns.
# NOTE(review): `user_id` becomes "user" here but "userID" everywhere else,
# and `ratings` is not referenced again in the cells visible here — confirm
# whether this cell is still needed.
ratings = ratings_df.rename(
    columns={
        "movie_id": "itemID",
        "rating_val": "rating",
        "user_id": "user"
    }
)

In [None]:
# Chronological split of the merged interactions with a 0.75 train ratio;
# this replaces the `train_df` built from the random split above.
train_df, test_df = python_chrono_split(merged_df, 0.75)

In [None]:
# 1) Keep only test interactions whose user and item both occur in the
#    training split, then order both splits by user and timestamp.
test_df = test_df[test_df["userID"].isin(train_df["userID"].unique())]
test_df = test_df[test_df["itemID"].isin(train_df["itemID"].unique())]
train_df = train_df.sort_values(['userID', 'timestamp']).reset_index(drop=True)
test_df  = test_df.sort_values(['userID', 'timestamp']).reset_index(drop=True)
# 2) Create a leave-one-out test set by taking last row for user
leave_one_out_test = test_df.groupby("userID").last().reset_index()

# 3) Write them to CSV files
train_file = "./train2.csv"
test_file = "./test2.csv"
leave_one_out_test_file = "./leave_one_out_test2.csv"

# Write the chronological splits prepared above. The previous version wrote
# `train` / `test` (the earlier random split), so the saved files did not
# match `leave_one_out_test`.
train_df.to_csv(train_file, index=False)
test_df.to_csv(test_file, index=False)
leave_one_out_test.to_csv(leave_one_out_test_file, index=False)

In [None]:
# Preview the chronological training split.
print(train_df.head())

In [None]:
# a. Convert `userID` to numeric: consecutive indices in order of first appearance
unique_users = merged_df['userID'].unique()
user_map = dict(zip(unique_users, range(len(unique_users))))
merged_df['userID_num'] = merged_df['userID'].map(user_map)

# b. Convert `itemID` to numeric
unique_items = merged_df['itemID'].unique()
item_map = dict(zip(unique_items, range(len(unique_items))))
merged_df['itemID_num'] = merged_df['itemID'].map(item_map)

# Optional: persist both lookups as JSON so the indices can be decoded later
for map_filename, id_map in (('user_map.json', user_map), ('item_map.json', item_map)):
    with open(map_filename, 'w') as f:
        json.dump(id_map, f)

In [None]:
# Coerce `timestamp` to datetime; values that cannot be parsed become NaT.
merged_df['timestamp'] = pd.to_datetime(merged_df['timestamp'], errors='coerce')

# Drop rows with invalid timestamps if any
merged_df = merged_df.dropna(subset=['timestamp'])

# Sort by `userID_num` and `timestamp`
merged_df = merged_df.sort_values(['userID_num', 'timestamp']).reset_index(drop=True)
# Every row must have received a numeric user and item index.
assert merged_df['userID_num'].isnull().sum() == 0, "Some userIDs were not mapped correctly."
assert merged_df['itemID_num'].isnull().sum() == 0, "Some itemIDs were not mapped correctly."

In [None]:
# Chronological split of the interactions that now carry numeric IDs.
train_df, test_df = python_chrono_split(merged_df, 0.75)

# Export only the numeric IDs, rating and timestamp, exposing the numeric IDs
# under the standard userID / itemID names. Selection and rename are chained
# so no column subset is mutated in place.
export_columns = ['userID_num', 'itemID_num', 'rating', 'timestamp']
standard_id_names = {'userID_num': 'userID', 'itemID_num': 'itemID'}
train_df_to_save = train_df[export_columns].rename(columns=standard_id_names)
test_df_to_save = test_df[export_columns].rename(columns=standard_id_names)

# Save to CSV
train_file = 'train_data.csv'
test_file = 'test_data.csv'

train_df_to_save.to_csv(train_file, index=False)
test_df_to_save.to_csv(test_file, index=False)

In [None]:
# Order both splits by numeric user index and timestamp, with a fresh index.
train_df = train_df.sort_values(['userID_num', 'timestamp']).reset_index(drop=True)
test_df = test_df.sort_values(['userID_num', 'timestamp']).reset_index(drop=True)


In [None]:
# Remove the original (non-numeric) ID columns; the `*_num` columns take over
# their names in a later cell.
raw_id_columns = ['itemID', 'userID']
train_df = train_df.drop(columns=raw_id_columns)
test_df = test_df.drop(columns=raw_id_columns)
# not sure whether test needs to be dropping itemID actually

In [None]:
# Preview the training split after dropping the raw ID columns.
print(train_df.head())

In [None]:
# Promote the numeric ID columns to the standard userID / itemID names.
numeric_to_standard = {'userID_num': 'userID', 'itemID_num': 'itemID'}
train_df = train_df.rename(columns=numeric_to_standard)
test_df = test_df.rename(columns=numeric_to_standard)

In [None]:
# Preview the test split after the rename.
print(test_df.head())

In [None]:
# Output CSV names for the numeric-ID train / test / leave-one-out files that
# are written below and passed to the NCF dataset loader.
train_file_fixed = 'train_file_fixed.csv'
test_file_fixed = 'test_file_fixed.csv'
leave_one_out_fixed_file = 'leave_one_out_fixed.csv'

In [None]:
# Sort both splits by user and time, derive a leave-one-out test set with one
# row per user, and write all three frames to CSV.
train_df = train_df.sort_values(['userID', 'timestamp']).reset_index(drop=True)
test_df  = test_df.sort_values(['userID', 'timestamp']).reset_index(drop=True)
# NOTE(review): GroupBy.last() returns the last non-null value of each column
# per group, which equals the last row only when there are no NaNs — confirm.
leave_one_out_fixed = test_df.groupby("userID").last().reset_index()
print(train_df.head())
print(test_df.head())
train_df.to_csv(train_file_fixed, index=False)
test_df.to_csv(test_file_fixed, index=False)
leave_one_out_fixed.to_csv(leave_one_out_fixed_file, index=False)

In [None]:
# (rows, columns) of the training split that was written to disk.
train_df.shape

In [None]:
# Reload the training split written above. pandas has no `load_csv`; the
# reader is `read_csv`, and it needs the file to read. `timestamp` is parsed
# back into datetimes because CSV stores it as text.
train_df = pd.read_csv(train_file_fixed, parse_dates=['timestamp'])

In [None]:
# Build the recommenders NCF dataset from the CSV files written above, using
# the leave-one-out file as the test file. FutureWarnings raised during
# construction are silenced for this block only.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", FutureWarning)
    data = NCFDataset(
        train_file=train_file_fixed,
        test_file=leave_one_out_fixed_file,
        seed=42,
        overwrite_test_file_full=True
    )

In [None]:
# Training hyper-parameters shared by the NCF model and the training loops below.
EPOCHS = 5 
BATCH_SIZE = 256

In [None]:
class TqdmCallback(Callback):
    """Keras callback that shows training progress as a tqdm bar, one tick per epoch."""

    def on_train_begin(self, logs=None):
        """Open a progress bar sized to the number of epochs in `self.params`."""
        total_epochs = self.params['epochs']
        self.epochs = total_epochs
        self.pbar = tqdm(total=total_epochs, desc='Training', unit='epoch')

    def on_epoch_end(self, epoch, logs=None):
        """Advance the bar by one finished epoch."""
        self.pbar.update(1)

    def on_train_end(self, logs=None):
        """Close the bar when training stops."""
        self.pbar.close()


In [None]:
    # Configure the recommenders NeuMF model, sized from the loaded dataset.
    # This rebinds `model`, which held the DistilBERT encoder earlier on.
    # NOTE(review): the whole statement is indented although it is not inside
    # any block; IPython tolerates this, plain Python does not.
    model = NCF(
        n_users=data.n_users, 
        n_items=data.n_items,
        model_type="NeuMF",
        n_factors=4,
        layer_sizes=[16,8,4],
        n_epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        learning_rate=1e-3,
        verbose=1,
        seed=42
    )

In [None]:

# Manual per-batch training loop with a tqdm bar per epoch; it averages the
# per-batch loss and accuracy over the epoch.
# NOTE(review): this relies on `data.train`, `data.get_batches(...)` and
# `model.train_on_batch(...)` returning (loss, accuracy). Confirm that the
# recommenders NCFDataset / NCF objects created above expose these; the next
# cell trains with `model.fit(data)` instead.
for epoch in range(1, EPOCHS + 1):
    epoch_loss = 0
    epoch_accuracy = 0
    num_batches = int(np.ceil(len(data.train) / BATCH_SIZE))
    
    # Initialize tqdm progress bar for batches
    with tqdm(total=num_batches, desc=f'Epoch {epoch}/{EPOCHS}', unit='batch') as pbar:
        for batch in data.get_batches(BATCH_SIZE):
            # Extract inputs and targets from the batch
            user_ids, item_ids, labels = batch
            
            # Train on the current batch
            loss, accuracy = model.train_on_batch(user_ids, item_ids, labels)
            
            # Accumulate metrics
            epoch_loss += loss
            epoch_accuracy += accuracy
            
            # Update progress bar
            pbar.set_postfix({'loss': loss, 'accuracy': accuracy})
            pbar.update(1)
    
    # Calculate average metrics for the epoch
    avg_loss = epoch_loss / num_batches
    avg_accuracy = epoch_accuracy / num_batches
    
    # Display epoch summary
    print(f'Epoch {epoch}/{EPOCHS} - Loss: {avg_loss:.4f} - Accuracy: {avg_accuracy:.4f}')

In [None]:
# Train the NCF model on the dataset; the distribution-strategy scope is
# currently disabled.
#with strategy.scope():
model.fit(data)

In [None]:
@tf.function
def train_step(user_ids, item_ids, labels):
    """Run one gradient update on a batch and feed the running metrics.

    Uses the module-level `model`, `loss_object`, `optimizer`, `train_loss`
    and `train_accuracy` objects.

    Args:
        user_ids: batch of user inputs passed to `model`.
        item_ids: batch of item inputs passed to `model`.
        labels: targets compared against the model output.
    """
    with tf.GradientTape() as grad_tape:
        batch_preds = model([user_ids, item_ids], training=True)
        batch_loss = loss_object(labels, batch_preds)

    # Differentiate the batch loss w.r.t. the trainable weights and apply the update.
    weights = model.trainable_variables
    grads = grad_tape.gradient(batch_loss, weights)
    optimizer.apply_gradients(zip(grads, weights))

    # Accumulate the running loss / accuracy metrics.
    train_loss(batch_loss)
    train_accuracy(labels, batch_preds)


In [None]:
# Running metrics updated by `train_step`: mean loss and binary accuracy.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_accuracy')

In [None]:
# Custom training loop: one tqdm bar per epoch, one `train_step` per batch.
# NOTE(review): `steps_per_epoch` and `batched_dataset` are not assigned in
# any cell visible here — confirm they are defined before running this loop.
for epoch in range(1, EPOCHS + 1):
    # Reset the metrics at the start of the epoch so the reported values
    # describe this epoch only. The reset was announced by the original
    # comment but never performed, so the metrics accumulated across epochs.
    train_loss.reset_state()
    train_accuracy.reset_state()

    # Initialize tqdm progress bar for batches
    with tqdm(total=steps_per_epoch, desc=f'Epoch {epoch}/{EPOCHS}', unit='batch') as pbar:
        for batch in batched_dataset:
            user_ids, item_ids, labels = batch  # Adjust based on your batch structure

            # Perform a training step
            train_step(user_ids, item_ids, labels)

            # Update progress bar with the running metrics of this epoch
            pbar.set_postfix({
                'loss': f"{train_loss.result():.4f}",
                'accuracy': f"{train_accuracy.result():.4f}"
            })
            pbar.update(1)

    # Display epoch metrics
    print(f'Epoch {epoch}/{EPOCHS} - Loss: {train_loss.result():.4f} - Accuracy: {train_accuracy.result():.4f}')
