In [27]:
import ast
import os
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import mean_squared_error, precision_score, recall_score, accuracy_score,r2_score
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [28]:
merged_df_loc = r"drive/MyDrive/MSUoA/goodreads/goodreads_users_data_bert_embeddings.csv"
merged_df = pd.read_csv(merged_df_loc, index_col=0)

In [29]:
import numpy as np
import ast
import json
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler


# Function to safely convert string lists to Python lists
def safe_eval_list(x):
    if isinstance(x, str):
        return ast.literal_eval(x)
    return x

# Fill NaN and convert lists
merged_df['genres'] = merged_df['genres'].fillna('[]').apply(safe_eval_list)

# Convert tensor-based embeddings to numpy arrays
def convert_tensor_to_array(x):
    if isinstance(x, str) and x.startswith("tensor("):
        try:
            x = x.replace("tensor(", "").rstrip(")")
            return np.array(json.loads(x.replace("'", "\"")))
        except json.JSONDecodeError:
            return np.zeros(embedding_dim)  # Fallback to zero vector
    return np.array(x)

merged_df['review_text'] = merged_df['review_text'].apply(convert_tensor_to_array)

# Determine embedding dimension
if not merged_df['review_text'].empty:
    embedding_dim = len(merged_df['review_text'].iloc[0])
else:
    embedding_dim = 0  # Handle empty cases

# Ensure embeddings are consistent
merged_df['review_text'] = merged_df['review_text'].apply(
    lambda x: x if len(x) == embedding_dim else np.zeros(embedding_dim)
)

# One-hot encode categorical variables
all_genres = set().union(*merged_df['genres'])

mlb_genres = MultiLabelBinarizer().fit([list(all_genres)])
genres_encoded = mlb_genres.transform(merged_df['genres'])

# Normalize numerical features
merged_df[['goodreads_rating', 'bbeVotes']] = merged_df[['goodreads_rating', 'bbeVotes']].fillna(0)
scaler_numeric = MinMaxScaler()
numeric_features = scaler_numeric.fit_transform(merged_df[['goodreads_rating', 'bbeVotes']])

# Concatenate all features into X
X_list = [
    np.vstack(merged_df['review_text'].values) if embedding_dim > 0 else np.empty((len(merged_df), 0)),
    numeric_features,
    genres_encoded
]

X = np.concatenate(X_list, axis=1)

# Define y (scaled user ratings)
y = merged_df['user_rating'].values

# Display processed X and y shapes
X.shape, y.shape



((40000, 1528), (40000,))

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Dropout, Concatenate
from tensorflow.keras.models import Model

# Encode user IDs and book IDs (instead of movie IDs)
user_encoder = LabelEncoder()
book_encoder = LabelEncoder()

merged_df['user_id_encoded'] = user_encoder.fit_transform(merged_df['user_id'])
merged_df['book_id_encoded'] = book_encoder.fit_transform(merged_df['book_id'])

# Define features (X) and target variable (y)
y_scaled = merged_df['user_rating'].values  # No need to scale user ratings

# Train-test split
X_train, X_test, y_train, y_test, book_train, book_test = train_test_split(
    X, y_scaled, merged_df['book_id_encoded'].values, test_size=0.25, random_state=42
)

# Check the shapes of training data
print("X_train shape:", X_train.shape)
print("book_train shape:", book_train.shape)

# Book ID Input Layer
book_input = Input(shape=(1,), name='book_input')
book_embedding = Embedding(input_dim=len(book_encoder.classes_), output_dim=50, name='book_embedding')(book_input)
book_vec = Flatten(name='book_flatten')(book_embedding)

# Contextual Features Input Layer
context_input = Input(shape=(X_train.shape[1],), name='context_input')

# Concatenate book embedding with context input
concat = Concatenate()([book_vec, context_input])

# Dense Layers for cold-start recommendation
x = Dense(512, activation='relu')(concat)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(128, activation='relu')(x)
output = Dense(1, activation='linear')(x)

# Define the Cold Start Model
cold_start_model = Model(inputs=[book_input, context_input], outputs=output)
cold_start_model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Model Summary
print(cold_start_model.summary())

# Train the Model
history_cold_start = cold_start_model.fit(
    [book_train, X_train], y_train,  # Using only training data
    epochs=5,  # Increased for better learning
    batch_size=64,  # Adjusted for small dataset
    validation_split=0.2
)


X_train shape: (30000, 1528)
book_train shape: (30000,)


None
Epoch 1/5
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - loss: 2.1288 - mae: 1.1360 - val_loss: 3.2205 - val_mae: 1.6084
Epoch 2/5
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.5562 - mae: 0.9707 - val_loss: 2.1505 - val_mae: 1.2550
Epoch 3/5
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.3052 - mae: 0.8679 - val_loss: 2.7505 - val_mae: 1.4535
Epoch 4/5
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.1539 - mae: 0.8126 - val_loss: 2.6044 - val_mae: 1.4020
Epoch 5/5
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.0865 - mae: 0.7831 - val_loss: 2.9221 - val_mae: 1.5006


In [31]:
import tensorflow as tf

# Define the path to save the converted TFLite model
tflite_model_path = "cold_start_model.tflite"

# Convert the model to TensorFlow Lite format
converter = tf.lite.TFLiteConverter.from_keras_model(cold_start_model)

# Apply optimization (quantization for smaller size and faster inference)
converter.optimizations = [tf.lite.Optimize.DEFAULT]

# Convert the model
tflite_model = converter.convert()

# Save the model to disk
with open(tflite_model_path, "wb") as f:
    f.write(tflite_model)

print(f"✅ Model successfully converted and saved as {tflite_model_path}")


Saved artifact at '/tmp/tmpgillfpv8'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): List[TensorSpec(shape=(None, 1), dtype=tf.float32, name='book_input'), TensorSpec(shape=(None, 1528), dtype=tf.float32, name='context_input')]
Output Type:
  TensorSpec(shape=(None, 1), dtype=tf.float32, name=None)
Captures:
  138361776283408: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138361776283600: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138361776282448: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138361776282832: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138361776280528: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138361776281296: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138361776279376: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138361776279952: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138361776278608: TensorSpec(shape=(), dtype=tf.resource, name=None)
✅ Model successfully

In [32]:
import tensorflow as tf
import numpy as np

# Load the TFLite model (corrected file path)
interpreter = tf.lite.Interpreter(model_path="cold_start_model.tflite")
interpreter.allocate_tensors()

# Get input & output tensor details
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Print expected input shapes for debugging
print(f"🔍 Expected Input 0 Shape: {input_details[0]['shape']}, Type: {input_details[0]['dtype']} (Context Features)")
print(f"🔍 Expected Input 1 Shape: {input_details[1]['shape']}, Type: {input_details[1]['dtype']} (Book ID)")

# Function to run inference with TFLite model
def predict_tflite(context_feature, book_id):
    """Runs inference using the TensorFlow Lite model with correctly shaped inputs."""
    # Convert inputs to FLOAT32 and match expected shapes
    context_feature = np.array(context_feature, dtype=np.float32).reshape(1, -1)  # Ensure (1, feature_dim)
    book_id = np.array(book_id, dtype=np.float32).reshape(1, 1)  # Ensure (1, 1)

    # Ensure input shapes match model expectations
    assert context_feature.shape == tuple(input_details[0]['shape']), f"Mismatch in context_feature shape: {context_feature.shape}"
    assert book_id.shape == tuple(input_details[1]['shape']), f"Mismatch in book_id shape: {book_id.shape}"

    # Set model inputs (Swapped order to match TFLite expectations)
    interpreter.set_tensor(input_details[0]['index'], context_feature)
    interpreter.set_tensor(input_details[1]['index'], book_id)

    # Run inference
    interpreter.invoke()

    # Get output prediction
    return interpreter.get_tensor(output_details[0]['index'])[0][0]  # Return scalar prediction

# Test the model with a sample (use 10 test samples)
sample_context_features = X_test[:10]  # Context features (feature_dim per book)
sample_book_ids = book_test[:10].reshape(-1, 1)  # Encoded book IDs

# Get TFLite model predictions for each sample (one-by-one)
tflite_predictions = [predict_tflite(context_feature, book_id)
                      for context_feature, book_id in zip(sample_context_features, sample_book_ids)]

# Convert to NumPy array
tflite_predictions = np.array(tflite_predictions)

print("✅ TFLite Model Inference Completed. Sample Predictions:", tflite_predictions[:5])


🔍 Expected Input 0 Shape: [   1 1528], Type: <class 'numpy.float32'> (Context Features)
🔍 Expected Input 1 Shape: [1 1], Type: <class 'numpy.float32'> (Book ID)
✅ TFLite Model Inference Completed. Sample Predictions: [2.1873565 2.6881132 2.2176178 3.4464824 2.1201553]


In [33]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Get true ratings for the same test samples used for inference
y_true = y_test[:10]

# Compute RMSE for TFLite model predictions
rmse = np.sqrt(mean_squared_error(y_true, tflite_predictions))

# Function to compute Precision@K
def precision_at_k(y_true, y_pred, k=5):
    """Computes Precision@K for Top-K recommendations."""
    top_k_preds = np.argsort(y_pred)[::-1][:k]  # Get top-K predicted indices
    relevant_items = set(np.argsort(y_true)[::-1][:k])  # Get top-K actual relevant indices
    return len(set(top_k_preds) & relevant_items) / k

# Function to compute Recall@K
def recall_at_k(y_true, y_pred, k=5):
    """Computes Recall@K for Top-K recommendations."""
    top_k_preds = np.argsort(y_pred)[::-1][:k]  # Get top-K predicted indices
    relevant_items = set(np.argsort(y_true)[::-1][:k])  # Get top-K actual relevant indices
    return len(set(top_k_preds) & relevant_items) / len(relevant_items)

# Compute Precision@K and Recall@K
k = 5
precision_k = precision_at_k(y_true, tflite_predictions, k)
recall_k = recall_at_k(y_true, tflite_predictions, k)

# Display results
rmse, precision_k, recall_k


(1.5690924616423927, 0.6, 0.6)

In [40]:
# Ensure we have the correct test set for the cold_start_model
num_samples = min(10, len(y_test))

# Predict using the cold_start_model (original Keras model before compression)
cold_start_predictions = cold_start_model.predict([book_test[:num_samples], X_test[:num_samples]]).flatten()

# Compute RMSE for cold_start_model
rmse_cold_start = np.sqrt(mean_squared_error(y_test[:num_samples], cold_start_predictions))

# Convert True Ratings to Binary Relevance (ratings >= 4.0 are relevant)
y_true_binary_cold_start = (y_test[:num_samples] >= 4.0).astype(int)

# Compute Precision@10 and Recall@10 for cold_start_model
k = 10
precision_k_cold_start = precision_at_k(y_true_binary_cold_start, cold_start_predictions, k)
recall_k_cold_start = recall_at_k(y_true_binary_cold_start, cold_start_predictions, k)

# Display results
rmse_cold_start, precision_k_cold_start, recall_k_cold_start


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step


(1.5631697935742908, 1.0, 1.0)

In [34]:
import os
print("Compressed model size\n" , os.stat("cold_start_model.tflite").st_size / (1024 * 1024))

import tensorflow as tf

# Save the original model as an H5 file
original_model_path = "cold_start_model.h5"
cold_start_model.save(original_model_path)

print(f" Original model saved as {original_model_path}")
print(os.stat("cold_start_model.h5").st_size / (1024 * 1024))



Compressed model size
 1.579742431640625
 Original model saved as cold_start_model.h5
18.793838500976562


In [35]:
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Dropout, Concatenate
from tensorflow.keras.models import Model
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Encode user IDs
user_encoder = LabelEncoder()
merged_df['user_id_encoded'] = user_encoder.fit_transform(merged_df['user_id'].astype(str))

# Define target variable (already scaled)
y_scaled = merged_df['user_rating'].values

# Train-test split
X_train, X_test, y_train, y_test, user_train, user_test = train_test_split(
    X, y_scaled, merged_df['user_id_encoded'].values, test_size=0.25, random_state=42
)

# Check data shapes
print("✅ X_train shape:", X_train.shape)
print("✅ user_train shape:", user_train.shape)

# User Input and Embedding
user_input = Input(shape=(1,), name='user_input')
user_embedding = Embedding(input_dim=len(user_encoder.classes_), output_dim=50, name='user_embedding')(user_input)
user_vec = Flatten(name='user_flatten')(user_embedding)

# Movie Context Input
movie_context_input = Input(shape=(X_train.shape[1],), name='movie_context_input')

# Concatenate user embedding with movie context
concat_user_movie = Concatenate()([user_vec, movie_context_input])

# Dense Layers
x = Dense(512, activation='relu')(concat_user_movie)
x = Dropout(0.3)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(128, activation='relu')(x)
output = Dense(1, activation='linear')(x)

# Define Top-N User Recommendation Model
top_n_user_model = Model(inputs=[user_input, movie_context_input], outputs=output)
top_n_user_model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Model summary
print(top_n_user_model.summary())

# Train the model with validation
history = top_n_user_model.fit(
    [user_train, X_train], y_train,
    epochs=5,  # Increased for better learning
    batch_size=64,  # Adjusted for dataset size
    validation_split=0.2
)


✅ X_train shape: (30000, 1528)
✅ user_train shape: (30000,)


None
Epoch 1/5
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - loss: 2.1861 - mae: 1.1322 - val_loss: 1.6889 - val_mae: 1.0949
Epoch 2/5
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.3929 - mae: 0.9180 - val_loss: 2.2251 - val_mae: 1.2995
Epoch 3/5
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.3353 - mae: 0.8923 - val_loss: 2.4716 - val_mae: 1.3820
Epoch 4/5
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.3108 - mae: 0.8829 - val_loss: 2.1790 - val_mae: 1.2799
Epoch 5/5
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.2740 - mae: 0.8624 - val_loss: 2.5435 - val_mae: 1.4024


In [37]:
import numpy as np
import tensorflow as tf
from sklearn.metrics import mean_squared_error

# ✅ Load the TFLite model
interpreter = tf.lite.Interpreter(model_path="top_n_user_model.tflite")
interpreter.allocate_tensors()

# ✅ Get input & output tensor details
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# ✅ Print expected input shapes for debugging
print(f"🔍 Expected User Input Shape: {input_details[0]['shape']}, Type: {input_details[0]['dtype']}")
print(f"🔍 Expected Movie Context Input Shape: {input_details[1]['shape']}, Type: {input_details[1]['dtype']}")

# ✅ Check the actual shape of movie context features (X_test)
print(f"🔍 Actual Feature Shape: {X_test.shape}")  # Check actual feature size

# ✅ Fix feature shape if necessary
expected_feature_dim = input_details[0]['shape'][1]  # Get expected feature size from model
if X_test.shape[1] != expected_feature_dim:
    raise ValueError(f"❌ Shape mismatch: Model expects {expected_feature_dim} features, but got {X_test.shape[1]}")

# ✅ Function to run inference with TFLite model
def predict_tflite(user_id, movie_context):
    """Runs inference using the TensorFlow Lite model with correctly shaped inputs."""
    # Convert inputs to FLOAT32 and reshape to expected dimensions
    movie_context = np.array(movie_context, dtype=np.float32).reshape(1, expected_feature_dim)  # Ensure correct shape
    user_id = np.array(user_id, dtype=np.float32).reshape(1, 1)  # Ensure (1, 1)

    # Set inputs to the interpreter
    interpreter.set_tensor(input_details[0]['index'], movie_context)
    interpreter.set_tensor(input_details[1]['index'], user_id)

    # Run inference
    interpreter.invoke()

    # Get output prediction
    return interpreter.get_tensor(output_details[0]['index'])[0][0]  # Return scalar prediction

# ✅ Run inference on test samples
num_samples = 10
movie_context_samples = X_test[:num_samples]  # Movie context features
user_samples = user_test[:num_samples].reshape(-1, 1)  # User IDs

# ✅ Run predictions
tflite_predictions = np.array([
    predict_tflite(user_id, movie_context)
    for user_id, movie_context in zip(user_samples, movie_context_samples)
])

print("✅ TFLite Model Inference Completed. Sample Predictions:", tflite_predictions[:5])

# ✅ Compute RMSE
y_true = y_test[:num_samples]
rmse = np.sqrt(mean_squared_error(y_true, tflite_predictions))
print(f"📊 RMSE (TFLite Model): {rmse:.4f}")

# ✅ Precision@K and Recall@K Functions
def precision_at_k(y_true, y_pred, k=5):
    """Computes Precision@K for Top-K recommendations."""
    top_k_preds = np.argsort(y_pred)[::-1][:k]
    relevant_items = set(np.argsort(y_true)[::-1][:k])
    return len(set(top_k_preds) & relevant_items) / k

def recall_at_k(y_true, y_pred, k=5):
    """Computes Recall@K for Top-K recommendations."""
    top_k_preds = np.argsort(y_pred)[::-1][:k]
    relevant_items = set(np.argsort(y_true)[::-1][:k])
    return len(set(top_k_preds) & relevant_items) / len(relevant_items)

# ✅ Compute Precision@K and Recall@K
k = 5
precision_k = precision_at_k(y_true, tflite_predictions, k)
recall_k = recall_at_k(y_true, tflite_predictions, k)

print(f"📊 Precision@{k}: {precision_k:.4f}")
print(f"📊 Recall@{k}: {recall_k:.4f}")


🔍 Expected User Input Shape: [   1 1528], Type: <class 'numpy.float32'>
🔍 Expected Movie Context Input Shape: [1 1], Type: <class 'numpy.float32'>
🔍 Actual Feature Shape: (10000, 1528)
✅ TFLite Model Inference Completed. Sample Predictions: [2.0172863 3.1474802 2.9622478 2.3944912 2.844131 ]
📊 RMSE (TFLite Model): 1.4740
📊 Precision@5: 0.8000
📊 Recall@5: 0.8000


In [38]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Function to Calculate RMSE
def calculate_rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Function to Calculate Precision@K
def precision_at_k(y_true, y_pred, k):
    """Calculate Precision@K"""
    top_k_indices = np.argsort(y_pred)[-k:][::-1]  # Get top-K predicted indices
    relevant_indices = np.argsort(y_true)[-k:][::-1]  # Get top-K actual relevant indices
    relevant = len(set(top_k_indices) & set(relevant_indices))  # Intersection count
    return relevant / k

# Function to Calculate Recall@K
def recall_at_k(y_true, y_pred, k):
    """Calculate Recall@K"""
    top_k_indices = np.argsort(y_pred)[-k:][::-1]  # Get top-K predicted indices
    total_relevant = np.sum(y_true)  # Total number of relevant items
    if total_relevant == 0:
        return 0.0  # Avoid division by zero
    relevant = np.isin(top_k_indices, np.where(y_true == 1)[0]).sum()  # Count hits
    return relevant / total_relevant

# ✅ Predict on the Test Set
predictions = top_n_user_model.predict([user_test, X_test]).flatten()

# ✅ Calculate RMSE
rmse_score = calculate_rmse(y_test, predictions)
print(f"📊 RMSE: {rmse_score:.4f}")

# ✅ Convert True Ratings to Binary Relevance (ratings >= 4.0 are relevant)
y_true_binary = (y_test >= 4.0).astype(int)  # Fixed threshold

# ✅ Calculate Precision@K and Recall@K for K=10
k = min(10, len(y_true_binary))  # Adjust if fewer samples are present
precision = precision_at_k(y_true_binary, predictions, k)
recall = recall_at_k(y_true_binary, predictions, k)

# ✅ Print Evaluation Metrics
print(f"📊 Precision@{k}: {precision:.4f}")
print(f"📊 Recall@{k}: {recall:.4f}")


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
📊 RMSE: 1.5851
📊 Precision@10: 0.0000
📊 Recall@10: 0.0015
