In [2]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise import AlgoBase
import numpy as np
from collections import defaultdict
import torch
from sentence_transformers import util

In [3]:
# Load Book Data

book_df = pd.read_csv('Amazon_Books_Scraping/Books_df.csv')

# Optionally, create a 'book_id' column if you haven't:
if 'book_id' not in book_df.columns:
    book_df.insert(0, 'book_id', range(len(book_df)))

# We'll focus on the columns we need
# (You can include more text in "text_for_embedding" if desired.)
def create_metadata_text(row):
    return (
        f"Title: {row['Title']} | "
        f"Author: {row['Author']} | "
        f"Main Genre: {row['Main Genre']} | "
        f"Sub Genre: {row['Sub Genre']}"
    )

book_df['text_for_embedding'] = book_df.apply(create_metadata_text, axis=1)

In [4]:
# Prepare a Ratings DataFrame for Surprise

ratings_df = pd.DataFrame({
    'user_id': 0,
    'item_id': book_df['book_id'],
    'rating': book_df['Rating']
})

# Convert Rating to float (if it's not already)
ratings_df['rating'] = ratings_df['rating'].astype(float)

# Prepare data for Surprise
reader = Reader(rating_scale=(1, 5))  # If your ratings are between 1 and 5
data = Dataset.load_from_df(ratings_df[['user_id', 'item_id', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [5]:
# Generate Embeddings for Each Book
model = SentenceTransformer('all-MiniLM-L6-v2')
book_texts = book_df['text_for_embedding'].tolist()
embeddings = model.encode(book_texts, convert_to_tensor=True)

# We'll store them in a dictionary: book_id -> embedding
book_embeddings = {}
for i, row in book_df.iterrows():
    b_id = row['book_id']
    book_embeddings[b_id] = embeddings[i]

In [6]:
class EmbeddingBased(AlgoBase):
    def __init__(self, book_embeddings, k=10, verbose=False):
        AlgoBase.__init__(self)
        self.book_embeddings = book_embeddings
        self.k = k
        self.verbose = verbose
    
    def fit(self, trainset):
        AlgoBase.fit(self, trainset)
        return self
    
    def estimate(self, u, i):
        # Try mapping internal user/item IDs to raw IDs
        try:
            user_id = self.trainset.to_raw_uid(u)
        except ValueError:
            # Unknown user
            return self.trainset.global_mean

        try:
            item_id = self.trainset.to_raw_iid(i)
        except ValueError:
            # Unknown item
            return self.trainset.global_mean
        
        # If the item isn't in our embedding dictionary, fallback
        if item_id not in self.book_embeddings:
            return self.trainset.global_mean

        # Proceed with embedding-based logic
        user_ratings = self.trainset.ur[u]  # items user has rated in the training set
        if len(user_ratings) == 0:
            # If user never rated anything, fallback to global mean
            return self.trainset.global_mean
        
        target_emb = self.book_embeddings[item_id]
        
        scores_sims = []
        for (j_inner, r_j) in user_ratings:
            # j_inner is the internal item ID
            j_raw = self.trainset.to_raw_iid(j_inner)
            if j_raw not in self.book_embeddings:
                continue
            j_emb = self.book_embeddings[j_raw]
            
            # compute cosine similarity (assuming they're Torch tensors)
            sim_val = float(util.cos_sim(target_emb, j_emb)[0][0])
            scores_sims.append((r_j, sim_val))
        
        if not scores_sims:
            return self.trainset.global_mean

        # Sort descending
        scores_sims.sort(key=lambda x: x[1], reverse=True)
        top_k = scores_sims[: self.k]
        
        numerator = sum(rating_j * sim_j for (rating_j, sim_j) in top_k)
        denominator = sum(sim_j for (_, sim_j) in top_k)
        
        if denominator == 0:
            return self.trainset.global_mean
        return numerator / denominator


In [7]:
# Train & Evaluate

algo = EmbeddingBased(book_embeddings=book_embeddings, k=10)
algo.fit(trainset)
predictions = algo.test(testset)
rmse = accuracy.rmse(predictions)
print("Embedding-Based RMSE:", rmse)

RMSE: 0.8999
Embedding-Based RMSE: 0.8998568097578655


In [8]:
def precision_recall(predictions, k=10, threshold=3.5):
    """
    Evaluate precision and recall at k, given a list of predictions:
      predictions is a list of tuples (uid, iid, true_r, est, details),
      k is the number of top items to consider,
      threshold is the rating threshold above which a user is said to have 'liked' the item.
      
    Returns: (mean_precision, mean_recall)
    """
    # Map each user to a list of (estimated rating, true rating)
    user_est_true = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        user_est_true[uid].append((iid, est, true_r))
    
    precisions = []
    recalls = []
    
    for uid, user_ratings in user_est_true.items():
        # Sort user_ratings by predicted rating 'est' descending
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        
        # Number of relevant items (true rating >= threshold)
        n_rel = sum((true_r >= threshold) for (_, _, true_r) in user_ratings)
        
        # Number of recommended items in top k that are relevant
        n_rec_k = sum((true_r >= threshold) for (_, _, true_r) in user_ratings[:k])
        
        # Precision@k: Proportion of recommended items in top k that are relevant
        if k > 0:
            precision = n_rec_k / k
        else:
            precision = 1
        
        # Recall@k: Proportion of relevant items found in top k
        if n_rel != 0:
            recall = n_rec_k / n_rel
        else:
            recall = 1
        
        precisions.append(precision)
        recalls.append(recall)
    
    mean_precision = np.mean(precisions)
    mean_recall = np.mean(recalls)
    
    return mean_precision, mean_recall

In [9]:
algo = EmbeddingBased(book_embeddings=book_embeddings, k=10)
algo.fit(trainset)

# Generate predictions on the test set
predictions = algo.test(testset)

# Evaluate RMSE (optional)
rmse = accuracy.rmse(predictions)

# Evaluate precision & recall
p, r = precision_recall(predictions, k=10, threshold=3.5)
print(f"Precision@10: {p:.4f}")
print(f"Recall@10:    {r:.4f}")

RMSE: 0.8999
Precision@10: 0.9000
Recall@10:    0.0060


In [10]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import train_test_split
from sentence_transformers import SentenceTransformer, util
from surprise import AlgoBase
from collections import defaultdict

####################################
# 1) PREPARE THE DATA
####################################

book_df = pd.read_csv('Amazon_Books_Scraping/Books_df.csv')

# Ensure we have a 'book_id' column
if 'book_id' not in book_df.columns:
    book_df.insert(0, 'book_id', range(len(book_df)))

# Create a simple text representation for each book (for embeddings)
def create_metadata_text(row):
    return (
        f"Title: {row['Title']} | "
        f"Author: {row['Author']} | "
        f"Main Genre: {row['Main Genre']} | "
        f"Sub Genre: {row['Sub Genre']}"
    )

book_df['text_for_embedding'] = book_df.apply(create_metadata_text, axis=1)

# We'll create a user-item rating DataFrame for Surprise
# Note: This is still "faking" user input. We'll just treat 'user_id=0' for demonstration.
# If you have real user IDs, replace this logic.
ratings_df = pd.DataFrame({
    'user_id': 0,  # single user scenario (fake)
    'item_id': book_df['book_id'],
    'rating': book_df['Rating'].astype(float)
})

reader = Reader(rating_scale=(1, 5))
full_data = Dataset.load_from_df(ratings_df[['user_id', 'item_id', 'rating']], reader)

# Split into train/test
trainset_cold, testset_cold = train_test_split(full_data, test_size=0.2, random_state=21)
test_df_cold = pd.DataFrame(testset_cold, columns=['user_id', 'item_id', 'rating'])

####################################
# 2) INTRODUCE A COLD-START ITEM
####################################

# We'll add a brand new item that does NOT appear in the training set.
# For example, item_id=9999 with "Action & Adventure" rating=4.
# (If you want it to be a brand-new user, you'd change 'user_id' instead.)
cold_item_id = 9999
new_item = [(0, cold_item_id, 4.0)]  
# user_id=0 (since we only have user=0)
# item_id=9999 doesn't exist in training. rating=4.0 is arbitrary for demonstration.

new_item_df = pd.DataFrame(new_item, columns=['user_id', 'item_id', 'rating'])
test_df_cold = pd.concat([test_df_cold, new_item_df], ignore_index=True)

# Convert test_df_cold back into a Surprise testset format: list of (user, item, rating) tuples
testset_new = list(test_df_cold[['user_id', 'item_id', 'rating']].itertuples(index=False, name=None))

####################################
# 3) CREATE EMBEDDINGS
####################################

# We'll use a SentenceTransformer model to embed each book's metadata.
# This is done only for the books that appear in the TRAINING set
# (which is the normal scenario: new items won't have embeddings unless you embed them).
model = SentenceTransformer('all-MiniLM-L6-v2')

# Build a dict: book_id -> embedding for all books that appear in the TRAINING set
train_items = trainset_cold.build_testset()  # all (user, item, rating) in train
train_item_ids = {iid for (_, iid, _) in train_items}

train_book_df = book_df[book_df['book_id'].isin(train_item_ids)].copy()
train_book_df.reset_index(drop=True, inplace=True)

# 2) Encode the text
train_embeddings_tensor = model.encode(
    train_book_df['text_for_embedding'].tolist(), 
    convert_to_tensor=True
)

# 3) Now iterate using the row’s *new* index
for idx, row in train_book_df.iterrows():
    b_id = row['book_id']
    book_embeddings[b_id] = train_embeddings_tensor[idx]

# NOTE: We have NOT embedded item_id=9999 because it's truly "cold" (no metadata).
# If you *did* have metadata, you could embed it here. That wouldn't be a total cold start anymore.
# We'll show how to handle that scenario below.


####################################
# 4) CUSTOM EMBEDDING-BASED ALGORITHM
####################################

class EmbeddingBased(AlgoBase):
    def __init__(self, book_embeddings, k=10):
        AlgoBase.__init__(self)
        self.book_embeddings = book_embeddings  # dict: item_id -> embedding
        self.k = k
    
    def fit(self, trainset):
        AlgoBase.fit(self, trainset)
        return self
    
    def estimate(self, u, i):
        # Convert internal IDs to raw IDs
        try:
            user_id = self.trainset.to_raw_uid(u)
        except ValueError:
            return self.trainset.global_mean
        
        try:
            item_id = self.trainset.to_raw_iid(i)
        except ValueError:
            return self.trainset.global_mean
        
        # If this item is not in our embeddings dictionary => cold start item
        if item_id not in self.book_embeddings:
            # fallback to global mean (or any other strategy)
            return self.trainset.global_mean
        
        # Get all items that user has rated in training
        user_ratings = self.trainset.ur[u]  # list of (inner_item_id, rating)
        if len(user_ratings) == 0:
            # user never rated anything => fallback
            return self.trainset.global_mean
        
        target_emb = self.book_embeddings[item_id]
        scores_sims = []
        
        # We'll use sentence_transformers.util.cos_sim for similarity
        for (j_inner, rating_j) in user_ratings:
            j_raw = self.trainset.to_raw_iid(j_inner)
            # If the user-rated item is in the embedding dictionary
            if j_raw in self.book_embeddings:
                j_emb = self.book_embeddings[j_raw]
                sim_val = float(util.cos_sim(target_emb, j_emb)[0][0])
                scores_sims.append((rating_j, sim_val))
        
        if not scores_sims:
            return self.trainset.global_mean
        
        # Sort descending by similarity
        scores_sims.sort(key=lambda x: x[1], reverse=True)
        top_k = scores_sims[:self.k]
        
        numerator = sum(r * s for (r, s) in top_k)
        denominator = sum(s for (_, s) in top_k)
        
        if denominator == 0:
            return self.trainset.global_mean
        return numerator / denominator

####################################
# 5) TRAIN & TEST
####################################

algo_cold = EmbeddingBased(book_embeddings=book_embeddings, k=10)
algo_cold.fit(trainset_cold)

pred_knn_cold = algo_cold.test(testset_new)
rmse_knn_cold = accuracy.rmse(pred_knn_cold, verbose=True)

####################################
# 6) SPLIT NEW ITEM vs OLD ITEMS
####################################

# Our "cold item" is item_id = 9999
cold_item_ids = {9999}

preds_new = [p for p in pred_knn_cold if p.iid in cold_item_ids]
preds_old = [p for p in pred_knn_cold if p.iid not in cold_item_ids]

# Evaluate separate RMSE
rmse_new = accuracy.rmse(preds_new, verbose=False)
rmse_old = accuracy.rmse(preds_old, verbose=False)

####################################
# 7) PRECISION/RECALL FUNCTION
####################################
def precision_recall(predictions, k=10, threshold=3.5):
    from collections import defaultdict
    user_est_true = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        user_est_true[uid].append((iid, est, true_r))
    
    precisions = []
    recalls = []
    
    for uid, user_ratings in user_est_true.items():
        # Sort by estimated rating descending
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        
        # number of relevant items
        n_rel = sum((true_r >= threshold) for (_, _, true_r) in user_ratings)
        
        # number of recommended items in top k that are relevant
        n_rec_k = sum((true_r >= threshold) for (_, _, true_r) in user_ratings[:k])
        
        precision = n_rec_k / k if k > 0 else 1
        recall = n_rec_k / n_rel if n_rel != 0 else 1
        
        precisions.append(precision)
        recalls.append(recall)
    
    mean_precision = np.mean(precisions)
    mean_recall = np.mean(recalls)
    return mean_precision, mean_recall

p_all_cold, r_all_cold = precision_recall(pred_knn_cold, k=10, threshold=3.5)
p_new, r_new = precision_recall(preds_new, k=10, threshold=3.5)
p_old, r_old = precision_recall(preds_old, k=10, threshold=3.5)

####################################
# 8) REPORT RESULTS
####################################
print("======== Cold-Start Analysis (Embedding-Based) ========")
print(f"Overall RMSE: {rmse_knn_cold:.4f}, Precision: {p_all_cold:.4f}, Recall: {r_all_cold:.4f}")
print(f"New Item RMSE: {rmse_new:.4f}, Precision: {p_new:.4f}, Recall: {r_new:.4f}")
print(f"Old Items RMSE: {rmse_old:.4f}, Precision: {p_old:.4f}, Recall: {r_old:.4f}")


RMSE: 0.8979
Overall RMSE: 0.8979, Precision: 0.9000, Recall: 0.0060
New Item RMSE: 0.2603, Precision: 0.1000, Recall: 1.0000
Old Items RMSE: 0.8982, Precision: 0.9000, Recall: 0.0060


In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise import AlgoBase
from collections import defaultdict

###########################################################
# 1) LOAD THE REAL BOOK DATA (from CSV)
###########################################################

df_books = pd.read_csv("Amazon_Books_Scraping/Books_df.csv")

# Ensure there's a 'book_id' column
if 'book_id' not in df_books.columns:
    df_books.insert(0, 'book_id', range(len(df_books)))

# We'll create a text column for embeddings
def create_metadata_text(row):
    return (
        f"Title: {row['Title']} | "
        f"Author: {row['Author']} | "
        f"Main Genre: {row['Main Genre']} | "
        f"Sub Genre: {row['Sub Genre']}"
    )
df_books['text_for_embedding'] = df_books.apply(create_metadata_text, axis=1)

# (Optional) You can drop any rows with missing or invalid data
# df_books.dropna(subset=["Title", "Author", "Main Genre", "Sub Genre"], inplace=True)

print(f"Loaded {len(df_books)} books from CSV.")

###########################################################
# 2) CREATE EMBEDDINGS FOR EACH BOOK
###########################################################

model = SentenceTransformer("all-MiniLM-L6-v2")
texts = df_books["text_for_embedding"].tolist()
embeddings_tensor = model.encode(texts, convert_to_tensor=True)

book_embeddings = {}
for idx, row in df_books.iterrows():
    b_id = row['book_id']
    book_embeddings[b_id] = embeddings_tensor[idx]

print("Created embeddings for each book.")

###########################################################
# 3) SYNTHETICALLY GENERATE 50 USERS' RATINGS
###########################################################

num_users = 50
prob_rate = 0.6  # each user has a 60% chance to rate any given book

ratings_data = []
np.random.seed(42)  # for reproducibility

# We'll gather all the book_ids in a list
all_book_ids = df_books['book_id'].tolist()

for user_id in range(num_users):
    for book_id in all_book_ids:
        # Decide if this user rates this book
        if np.random.rand() < prob_rate:
            # random rating from 1..5
            rating_val = np.random.randint(1, 6)  # 1 to 5
            ratings_data.append((user_id, book_id, rating_val))

df_ratings = pd.DataFrame(ratings_data, columns=["user_id", "book_id", "rating"])
print(f"Synthetic Ratings: {df_ratings.shape[0]} total ratings from {num_users} users.")

###########################################################
# 4) CREATE A SURPRISE DATASET
###########################################################

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_ratings[["user_id", "book_id", "rating"]], reader)

trainset, testset = train_test_split(data, test_size=0.2, random_state=42)
print("Train/Test Split Done.")

###########################################################
# 5) DEFINE A CUSTOM EMBEDDING-BASED ALGO
###########################################################

class EmbeddingBased(AlgoBase):
    def __init__(self, book_embeddings, k=10):
        AlgoBase.__init__(self)
        self.book_embeddings = book_embeddings
        self.k = k
    
    def fit(self, trainset):
        super().fit(trainset)
        return self
    
    def estimate(self, u, i):
        from sentence_transformers import util
        # Convert internal IDs to raw IDs
        try:
            user_id = self.trainset.to_raw_uid(u)
        except ValueError:
            return self.trainset.global_mean
        
        try:
            book_id = self.trainset.to_raw_iid(i)
        except ValueError:
            return self.trainset.global_mean
        
        # If no embedding for this book => cold start fallback
        if book_id not in self.book_embeddings:
            return self.trainset.global_mean
        
        # Gather user's rated items
        user_ratings = self.trainset.ur[u]  # list of (inner_item_id, rating)
        if not user_ratings:
            return self.trainset.global_mean
        
        target_emb = self.book_embeddings[book_id]
        
        scores_sims = []
        for (j_inner, rating_j) in user_ratings:
            j_raw = self.trainset.to_raw_iid(j_inner)
            if j_raw in self.book_embeddings:
                j_emb = self.book_embeddings[j_raw]
                sim_val = float(util.cos_sim(target_emb, j_emb)[0][0])
                scores_sims.append((rating_j, sim_val))
        
        if not scores_sims:
            return self.trainset.global_mean
        
        # Sort by similarity descending
        scores_sims.sort(key=lambda x: x[1], reverse=True)
        top_k = scores_sims[: self.k]
        
        numerator = sum(r * s for (r, s) in top_k)
        denominator = sum(s for (_, s) in top_k)
        
        if denominator == 0:
            return self.trainset.global_mean
        return numerator / denominator

###########################################################
# 6) TRAIN & EVALUATE THE EMBEDDING-BASED MODEL
###########################################################

algo_llm = EmbeddingBased(book_embeddings, k=10)
algo_llm.fit(trainset)
predictions_llm = algo_llm.test(testset)
rmse_llm = accuracy.rmse(predictions_llm, verbose=True)

###########################################################
# 7) PRECISION/RECALL EVALUATION
###########################################################
def precision_recall(predictions, k=10, threshold=3.5):
    user_est_true = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        user_est_true[uid].append((iid, est, true_r))
    
    precisions = []
    recalls = []
    
    for uid, user_ratings in user_est_true.items():
        # Sort by estimated rating descending
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        
        # number of relevant items
        n_rel = sum((true_r >= threshold) for (_, _, true_r) in user_ratings)
        
        # number of recommended items in top k that are relevant
        n_rec_k = sum((true_r >= threshold) for (_, _, true_r) in user_ratings[:k])
        
        precision = n_rec_k / k if k > 0 else 1
        recall = n_rec_k / n_rel if n_rel != 0 else 1
        
        precisions.append(precision)
        recalls.append(recall)
    
    mean_precision = np.mean(precisions)
    mean_recall = np.mean(recalls)
    return mean_precision, mean_recall

p_llm, r_llm = precision_recall(predictions_llm, k=10, threshold=3.5)
print(f"\nPrecision@10: {p_llm:.4f}, Recall@10: {r_llm:.4f}")


Loaded 7928 books from CSV.
Created embeddings for each book.
Synthetic Ratings: 237736 total ratings from 50 users.
Train/Test Split Done.
