In [54]:
import kagglehub 
import pandas as pd 
import numpy as np 
import os 
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf 
from transformers import DistilBertTokenizer, TFDistilBertModel
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense, Dropout
from tensorflow.keras.models import Model
from math import sqrt
from tensorflow.keras.callbacks import EarlyStopping

#### STEP 1 : LOADING THE DATASET FROM KAGGLE 

In [None]:
path = kagglehub.dataset_download("samlearner/letterboxd-movie-ratings-data")

print("Path to dataset files:", path)

In [21]:
movie_data_filename = 'movie_data.csv'
ratings_filename = 'ratings_export.csv'
users_filename = 'users_export.csv'
movie_data_path = os.path.join(path,movie_data_filename)
ratings_path = os.path.join(path,ratings_filename)
users_path = os.path.join(path,users_filename)

In [22]:
# movie_data_df = pd.read_csv(movie_data_path,engine='python')
# ratings_df = pd.read_csv(ratings_path,engine='python')
# users_df = pd.read_csv(users_path,engine = 'python')

In [23]:
#df = pd.read_parquet('movie_data.parquet', engine='pyarrow')
movie_data_df = pd.read_parquet('movie_data.parquet',engine='pyarrow')
ratings_df = pd.read_parquet('ratings_data.parquet',engine='pyarrow')
users_df = pd.read_parquet('users_dt.parquet',engine='pyarrow')

In [None]:
print(movie_data_df.head())


unneccessary columns need to be removed - image url, imdb id, imdb, link (perhaps needed to enhance model with imdb data ? ), tmbd id, link, (download that dataset and enhance with it ? )production countries, 

In [None]:
print(movie_data_df.info())


In [36]:
print(ratings_df.head())

                        _id          movie_id  rating_val     user_id
0  5fc57c5d6758f6963451a07f        feast-2014           7  deathproof
1  5fc57c5d6758f6963451a063       loving-2016           7  deathproof
2  5fc57c5d6758f6963451a0ef  scripted-content           7  deathproof
3  5fc57c5d6758f6963451a060        the-future           4  deathproof
4  5fc57c5c6758f69634519398              mank           5  deathproof


In [None]:
print(ratings_df.info())

In [None]:
print(users_df.info())

In [None]:
print(users_df.head())

In [None]:
for col in movie_data_df.select_dtypes(include='object'):
    print(f"{col}: {movie_data_df[col].nunique()} unique values")


In [31]:
#movie_data_df.to_parquet('movie_data.parquet', engine='pyarrow')
#ratings_df.to_parquet('ratings_data.parquet', engine = 'pyarrow')
#users_df.to_parquet('users_dt.parquet',engine = 'pyarrow')
# Load from Parquet file



#### STEP 2: EXPLORATORY DATA ANALYSIS 

#### STEP 3: DEFINING THE TASK, EVALUATION METRICS, CONSTRUCTING MODEL 

In [None]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')


In [None]:
texts = movie_data_df['overview'].fillna('').tolist()
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="tf", max_length=128)
outputs = model(**inputs)

# Get embeddings (use outputs.last_hidden_state or outputs.pooler_output)
embeddings = outputs.last_hidden_state.mean(dim=1)

In [37]:
ratings_df.dropna(subset=['user_id', 'movie_id', 'rating_val'], inplace=True)

In [38]:
ratings_df.head()

Unnamed: 0,_id,movie_id,rating_val,user_id
0,5fc57c5d6758f6963451a07f,feast-2014,7,deathproof
1,5fc57c5d6758f6963451a063,loving-2016,7,deathproof
2,5fc57c5d6758f6963451a0ef,scripted-content,7,deathproof
3,5fc57c5d6758f6963451a060,the-future,4,deathproof
4,5fc57c5c6758f69634519398,mank,5,deathproof


# non integer frames must be mapped to a unique numeric value - movie id and user id in this case

In [39]:
user_mapping = {user: idx for idx, user in enumerate(ratings_df['user_id'].unique())}
item_mapping = {item: idx for idx, item in enumerate(ratings_df['movie_id'].unique())}


In [40]:
ratings_df['user_id'] = ratings_df['user_id'].map(user_mapping)
ratings_df['item_id'] = ratings_df['movie_id'].map(item_mapping)

In [44]:
ratings_df = ratings_df.drop(['_id','movie_id'],axis = 1)
ratings_df.head()

Unnamed: 0,rating_val,user_id,item_id
0,7,0,0
1,7,0,1
2,7,0,2
3,4,0,3
4,5,0,4


In [46]:
train, test = train_test_split(ratings_df, test_size=0.2, random_state=42)

In [48]:
n_users = len(user_mapping)
n_items = len(item_mapping)
print(f'No. users : {n_users}, no. items : {n_items}')

No. users : 7477, no. items : 286070


# easiest to construct model using functional API (multiple input network). 3 dense layers of sizes 64,32,16 respectively follow the embedding layers. the resulting 32 dimensional vectors get flattened, concantenated with each other (as the model documentation requests), then get fed into dense network that outputs a probability for the respective user and item (think of as normalized rating).

In [57]:
embedding_dim = 32  # Latent factor size
mlp_layer_sizes = [64, 32, 16]  # Fully connected layers

# 1d input for user and item 
user_input = Input(shape=(1,), name='user_input')
item_input = Input(shape=(1,), name='item_input')

# Embedding layers
user_embedding = Embedding(n_users, embedding_dim, name='user_embedding')(user_input)
item_embedding = Embedding(n_items, embedding_dim, name='item_embedding')(item_input)

# Flatten embeddings
user_vec = Flatten()(user_embedding)
item_vec = Flatten()(item_embedding)

# Concatenate embeddings - the model needs it like this 
concat_vec = Concatenate()([user_vec, item_vec])
# MLP layers
mlp = concat_vec
for size in mlp_layer_sizes:
    mlp = Dense(size, activation='relu')(mlp)
    mlp = Dropout(0.2)(mlp)

# Output layer (e.g., single rating prediction)
output = Dense(1, activation='linear', name='output')(mlp)

# Build and compile the model
ncf_model = Model(inputs=[user_input, item_input], outputs=output)
ncf_model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

ncf_model.summary()


In [59]:
user_vec[0]

<KerasTensor shape=(32,), dtype=float32, sparse=False, name=keras_tensor_36>

In [56]:
tf.config.get_visible_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]

In [55]:
x_train = [train['user_id'].values, train['item_id'].values]
y_train = train['rating_val'].values

x_test = [test['user_id'].values, test['item_id'].values]
y_test = test['rating_val'].values
early_stop_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=0,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=False,
    start_from_epoch=0
)
# Train the model
history = ncf_model.fit(
    x=x_train,
    y=y_train,
    batch_size=256,
    epochs=5,
    validation_data=(x_test, y_test),
    verbose=1,
    callbacks = [early_stop_callback]
)

Epoch 1/5
[1m  281/34620[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m15:17[0m 27ms/step - loss: 2.2836 - mae: 1.1620

KeyboardInterrupt: 

In [None]:
loss, mae = ncf_model.evaluate(x_test, y_test)
print(f"Test Loss: {loss:.4f}, Test MAE: {mae:.4f}")

# RMSE
predictions = ncf_model.predict(x_test)
rmse = sqrt(np.mean((predictions.flatten() - y_test) ** 2))
print(f"Test RMSE: {rmse:.4f}")