In [1]:
import pandas as pd
import numpy as np
import json
from pathlib import Path
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack, csr_matrix
import warnings
warnings.filterwarnings('ignore')

print("All libraries imported successfully!")

All libraries imported successfully!


In [None]:
# Root folder of the RokomariBG dataset (Kaggle input mount)
base_path = Path('/kaggle/input/RokomariBG_Dataset')


def _read_json(file_name):
    """Parse one UTF-8 encoded JSON file located directly under ``base_path``."""
    with open(base_path / file_name, 'r', encoding='utf-8') as handle:
        return json.load(handle)


print("Loading datasets...")

# Entity tables: one file per kind of record
books_data = _read_json('book.json')
authors_data = _read_json('author.json')
categories_data = _read_json('category.json')
publishers_data = _read_json('publisher.json')
reviews_data = _read_json('review.json')

# Link tables: each file pairs the IDs of two entity kinds
book_category = _read_json('book_to_category.json')
book_author = _read_json('book_to_author.json')
book_publisher = _read_json('book_to_publisher.json')
book_review = _read_json('book_to_review.json')
user_review = _read_json('user_to_review.json')

print("All datasets loaded successfully!")

# Report how many records each entity file contributed
for entity_label, entity_records in (
    ("Books", books_data),
    ("Authors", authors_data),
    ("Categories", categories_data),
    ("Publishers", publishers_data),
    ("Reviews", reviews_data),
):
    print(f"{entity_label}: {len(entity_records)}")

Loading datasets...
All datasets loaded successfully!
Books: 149515
Authors: 16601
Categories: 1516
Publishers: 2757
Reviews: 209602


In [4]:
# Wrap every raw JSON payload in a DataFrame (entity tables first, link tables second)
df_books = pd.DataFrame(books_data)
df_authors = pd.DataFrame(authors_data)
df_categories = pd.DataFrame(categories_data)
df_publishers = pd.DataFrame(publishers_data)
df_reviews = pd.DataFrame(reviews_data)
df_book_category = pd.DataFrame(book_category)
df_book_author = pd.DataFrame(book_author)
df_book_publisher = pd.DataFrame(book_publisher)
df_book_review = pd.DataFrame(book_review)
df_user_review = pd.DataFrame(user_review)

# Cast every ID column to str so that joins and dict lookups across tables
# compare like with like.
id_columns_by_frame = [
    (df_books, ['book_id']),
    (df_authors, ['author_id']),
    (df_categories, ['category_id']),
    (df_publishers, ['publisher_id']),
    (df_reviews, ['review_id']),
    (df_book_category, ['book_id', 'category_id']),
    (df_book_author, ['book_id', 'author_id']),
    (df_book_publisher, ['book_id', 'publisher_id']),
    (df_book_review, ['book_id', 'review_id']),
    (df_user_review, ['user_id', 'review_id']),
]
for frame, id_columns in id_columns_by_frame:
    for column in id_columns:
        frame[column] = frame[column].astype(str)

# Collapse repeated book_id rows, keeping whichever row comes first
print(f"Books before deduplication: {len(df_books)}")
df_books = df_books.drop_duplicates(subset=['book_id'], keep='first')
print(f"Books after deduplication: {len(df_books)}")

# Headline counts for each table
print("\nDataset Overview:")
print(f"Unique Books: {df_books['book_id'].nunique()}")
print(f"Unique Authors: {df_authors['author_id'].nunique()}")
print(f"Unique Categories: {df_categories['category_id'].nunique()}")
print(f"Unique Publishers: {df_publishers['publisher_id'].nunique()}")
print(f"Total Reviews: {len(df_reviews)}")
print(f"Unique Users: {df_user_review['user_id'].nunique()}")

# Null counts for the two book columns the rest of the notebook relies on
print("\nMissing values in books:")
key_book_columns = ['book_id', 'book_title']
print(df_books[key_book_columns].isnull().sum())

Books before deduplication: 149515
Books after deduplication: 127302

Dataset Overview:
Unique Books: 127302
Unique Authors: 16601
Unique Categories: 1516
Unique Publishers: 2757
Total Reviews: 209602
Unique Users: 63723

Missing values in books:
book_id       0
book_title    0
dtype: int64


In [5]:
def _column_pair_to_dict(frame, key_column, value_column):
    """Build a dict from two columns of ``frame`` (later rows win on repeated keys)."""
    return dict(zip(frame[key_column], frame[value_column]))


def _ids_per_book(link_frame, id_column):
    """Map each book_id in a link table to the list of ``id_column`` values attached to it."""
    return link_frame.groupby('book_id')[id_column].apply(list).to_dict()


# ID -> display name lookups
author_map = _column_pair_to_dict(df_authors, 'author_id', 'author')
category_map = _column_pair_to_dict(df_categories, 'category_id', 'category_name')
publisher_map = _column_pair_to_dict(df_publishers, 'publisher_id', 'publisher_name')

# book_id -> list of related IDs, one dict per link table
book_categories = _ids_per_book(df_book_category, 'category_id')
book_authors = _ids_per_book(df_book_author, 'author_id')
book_publishers = _ids_per_book(df_book_publisher, 'publisher_id')
book_reviews_map = _ids_per_book(df_book_review, 'review_id')

# review_id -> review body; the text lives in the 'review_detail' column
review_text_map = _column_pair_to_dict(df_reviews, 'review_id', 'review_detail')

print("Mappings created successfully!")
for coverage_label, per_book_mapping in (
    ("categories", book_categories),
    ("authors", book_authors),
    ("publishers", book_publishers),
    ("reviews", book_reviews_map),
):
    print(f"Books with {coverage_label}: {len(per_book_mapping)}")

Mappings created successfully!
Books with categories: 107680
Books with authors: 77444
Books with publishers: 94957
Books with reviews: 17670


In [7]:
# Derive (user, book) interaction pairs: each review links one user to one book
user_books = df_user_review.merge(df_book_review, on='review_id')
user_books = user_books[['user_id', 'book_id']].drop_duplicates()

print(f"Initial user-book interactions: {len(user_books)}")
print(f"Initial unique users: {user_books['user_id'].nunique()}")
print(f"Initial unique books: {user_books['book_id'].nunique()}")

# Convert book_id to string to ensure matching
user_books['book_id'] = user_books['book_id'].astype(str)

# Filter books that exist in our cleaned books dataset
valid_books = set(df_books['book_id'].unique())
print(f"\nTotal valid books in df_books: {len(valid_books)}")

user_books = user_books[user_books['book_id'].isin(valid_books)]

print(f"\nAfter filtering for valid books:")
print(f"Total user-book interactions: {len(user_books)}")
print(f"Unique users: {user_books['user_id'].nunique()}")
print(f"Unique books: {user_books['book_id'].nunique()}")

# NO USER FILTERING - Use all users
valid_users = user_books['user_id'].unique()

print(f"\nNo user filtering applied:")
print(f"Total users: {len(valid_users)}")
print(f"Total interactions: {len(user_books)}")

# Split each user's interactions into train/val/test with a 70/15/15 target.
# Sizes are truncated with int(), so the target is only approached for users
# with many interactions:
#   * 1 interaction   -> training only
#   * 2 interactions  -> one in training, one in test
#   * 3+ interactions -> int(0.70*n) train, int(0.15*n) validation, rest test
# Only the training set is therefore guaranteed to contain every user.
np.random.seed(42)

train_list = []
val_list = []
test_list = []

# Group once instead of re-filtering the whole frame for every user.
# sort=False yields users in order of first appearance (the same order as
# `valid_users`) and keeps each user's rows in their original order, so the
# seeded shuffle below produces the same permutation as a per-user mask would.
for user_id, user_interactions in user_books.groupby('user_id', sort=False):
    # Shuffle user's interactions (fixed seed: the permutation depends only on
    # how many interactions the user has)
    user_interactions = user_interactions.sample(frac=1, random_state=42).reset_index(drop=True)

    n_interactions = len(user_interactions)

    # For users with only 1 interaction, put in training
    if n_interactions == 1:
        train_list.append(user_interactions)
        continue

    # For users with 2 interactions, put 1 in train, 1 in test
    if n_interactions == 2:
        train_list.append(user_interactions[:1])
        test_list.append(user_interactions[1:])
        continue

    # For users with 3+ interactions, do proper split
    train_size = int(0.70 * n_interactions)
    val_size = int(0.15 * n_interactions)

    # Ensure at least 1 item in train
    if train_size == 0:
        train_size = 1

    # Ensure at least 1 item in test if possible
    remaining = n_interactions - train_size
    if remaining > 0 and val_size >= remaining:
        val_size = remaining - 1

    train_list.append(user_interactions[:train_size])

    if val_size > 0:
        val_list.append(user_interactions[train_size:train_size + val_size])

    if train_size + val_size < n_interactions:
        test_list.append(user_interactions[train_size + val_size:])

# Fall back to empty, correctly-shaped frames when a split received no rows
train_df = pd.concat(train_list, ignore_index=True) if train_list else pd.DataFrame(columns=['user_id', 'book_id'])
val_df = pd.concat(val_list, ignore_index=True) if val_list else pd.DataFrame(columns=['user_id', 'book_id'])
test_df = pd.concat(test_list, ignore_index=True) if test_list else pd.DataFrame(columns=['user_id', 'book_id'])

print(f"\n{'='*60}")
print("DATA SPLIT SUMMARY (70/15/15) - NO USER FILTERING")
print(f"{'='*60}")
print(f"Training set:")
print(f"  Users: {train_df['user_id'].nunique()}")
print(f"  Interactions: {len(train_df)} ({len(train_df)/len(user_books)*100:.1f}%)")
print(f"\nValidation set:")
print(f"  Users: {val_df['user_id'].nunique()}")
print(f"  Interactions: {len(val_df)} ({len(val_df)/len(user_books)*100:.1f}%)")
print(f"\nTest set:")
print(f"  Users: {test_df['user_id'].nunique()}")
print(f"  Interactions: {len(test_df)} ({len(test_df)/len(user_books)*100:.1f}%)")
print(f"\nTotal: {len(valid_users)} users, {len(user_books)} interactions")

Initial user-book interactions: 205924
Initial unique users: 63723
Initial unique books: 17670

Total valid books in df_books: 127302

After filtering for valid books:
Total user-book interactions: 205924
Unique users: 63723
Unique books: 17670

No user filtering applied:
Total users: 63723
Total interactions: 205924

DATA SPLIT SUMMARY (70/15/15) - NO USER FILTERING
Training set:
  Users: 63723
  Interactions: 142360 (69.1%)

Validation set:
  Users: 6227
  Interactions: 12799 (6.2%)

Test set:
  Users: 29368
  Interactions: 50765 (24.7%)

Total: 63723 users, 205924 interactions


In [8]:
# Books seen in training, and the full catalogue of books to featurise
train_books = train_df['book_id'].unique()
catalogue_ids = df_books['book_id']
all_books = df_books[catalogue_ids.isin(valid_books)]['book_id'].unique()

print(f"Building features for {len(all_books)} books...")

# Each feature family below is a multi-hot encoding: one row per book (in
# `all_books` order), one column per distinct ID. Books without an entry in
# the corresponding mapping get an all-zero row.

# 1. Author Features
book_author_lists = [book_authors.get(book_id, []) for book_id in all_books]
mlb_authors = MultiLabelBinarizer(sparse_output=True)
author_features = mlb_authors.fit_transform(book_author_lists)

print(f"Author features shape: {author_features.shape}")

# 2. Category Features
book_category_lists = [book_categories.get(book_id, []) for book_id in all_books]
mlb_categories = MultiLabelBinarizer(sparse_output=True)
category_features = mlb_categories.fit_transform(book_category_lists)

print(f"Category features shape: {category_features.shape}")

# 3. Publisher Features
book_publisher_lists = [book_publishers.get(book_id, []) for book_id in all_books]
mlb_publishers = MultiLabelBinarizer(sparse_output=True)
publisher_features = mlb_publishers.fit_transform(book_publisher_lists)

print(f"Publisher features shape: {publisher_features.shape}")

Building features for 127302 books...
Author features shape: (127302, 16572)
Category features shape: (127302, 1493)
Publisher features shape: (127302, 2752)


In [9]:
# 4. Review Text Features (TF-IDF)
print("Building TF-IDF features from reviews...")


def _concatenated_reviews(book_id):
    """Join all non-empty review texts of one book into a single space-separated string."""
    texts = (review_text_map.get(rid, '') for rid in book_reviews_map.get(book_id, []))
    return ' '.join([text for text in texts if text])


# One document per book, in `all_books` order; books without reviews yield ''
book_review_texts = [_concatenated_reviews(book_id) for book_id in all_books]

# Vectoriser settings, chosen to keep the vocabulary (and memory use) bounded.
# NOTE(review): strip_accents='unicode' removes Unicode combining marks; for
# Bengali text this may also drop signs such as the hasanta - verify the
# resulting tokens on a sample of reviews.
tfidf_settings = dict(
    max_features=5000,       # cap on vocabulary size
    min_df=2,                # drop terms found in fewer than 2 documents
    max_df=0.8,              # drop terms found in more than 80% of documents
    ngram_range=(1, 2),      # unigrams and bigrams
    strip_accents='unicode',
    lowercase=True,
    stop_words=None,         # no stop-word list: the reviews are Bengali
)
tfidf = TfidfVectorizer(**tfidf_settings)

review_features = tfidf.fit_transform(book_review_texts)

print(f"Review features shape: {review_features.shape}")
print(f"Total vocabulary size: {len(tfidf.vocabulary_)}")

Building TF-IDF features from reviews...
Review features shape: (127302, 5000)
Total vocabulary size: 5000


In [10]:
# Concatenate the four feature families column-wise into one sparse matrix
print("\nCombining all features...")

feature_blocks = [
    author_features,
    category_features,
    publisher_features,
    review_features,
]
# CSR layout so that rows (books) can be sliced later
item_features = hstack(feature_blocks).tocsr()

print(f"Combined item features shape: {item_features.shape}")
print(f"Feature dimensions: {item_features.shape[1]}")

# Row position of every book in `item_features`, plus the reverse lookup
book_to_idx = dict(zip(all_books, range(len(all_books))))
idx_to_book = dict(zip(book_to_idx.values(), book_to_idx.keys()))

print(f"Created mappings for {len(book_to_idx)} books")


Combining all features...
Combined item features shape: (127302, 25817)
Feature dimensions: 25817
Created mappings for 127302 books


In [11]:
# Build user profiles: each profile is the mean feature vector of the books
# the user interacted with in the training split.
print("Building user profiles...")

user_profiles = {}

# Group the training interactions once instead of masking the whole frame for
# every user. sort=False keeps users in order of first appearance, matching
# train_df['user_id'].unique().
for user_id, user_books_list in train_df.groupby('user_id', sort=False)['book_id']:
    # Rows of item_features belonging to this user's books (unknown IDs are skipped)
    book_indices = [book_to_idx[bid] for bid in user_books_list.values if bid in book_to_idx]

    if book_indices:
        # The sparse mean returns a 1 x n_features matrix; flatten it to a 1-D array
        user_feature_vectors = item_features[book_indices]
        user_profile = np.asarray(user_feature_vectors.mean(axis=0)).flatten()
        user_profiles[user_id] = user_profile

print(f"Created profiles for {len(user_profiles)} users")
if user_profiles:
    # Every profile has the same length, so the first one is representative
    first_profile = next(iter(user_profiles.values()))
    print(f"Profile dimension: {first_profile.shape[0]}")
else:
    # Previously this line raised IndexError when no profile could be built
    print("Profile dimension: n/a (no user profiles were created)")

Building user profiles...
Created profiles for 63723 users
Profile dimension: 25817


In [12]:
# Single-entry memo for the inverse of `book_to_idx`. It keeps a reference to
# the source dict so that identity (not a recyclable id()) decides reuse.
_INDEX_LOOKUP_MEMO = {'source': None, 'size': -1, 'lookup': None}


def _index_to_book_lookup(book_to_idx):
    """Return the inverse of ``book_to_idx`` (row index -> book_id), rebuilt only when the mapping changes."""
    memo = _INDEX_LOOKUP_MEMO
    if memo['source'] is not book_to_idx or memo['size'] != len(book_to_idx):
        memo['lookup'] = {idx: book_id for book_id, idx in book_to_idx.items()}
        memo['source'] = book_to_idx
        memo['size'] = len(book_to_idx)
    return memo['lookup']


def get_recommendations(user_id, user_profile, item_features, book_to_idx, 
                        train_books_set, top_k=50):
    """
    Generate top-k recommendations for a user.

    Items are ranked by cosine similarity between the user's profile vector
    and every row of ``item_features``; books the user already interacted
    with are skipped.

    Parameters
    ----------
    user_id : hashable
        Identifier of the user. Not used in the computation; kept so that
        existing call sites remain valid.
    user_profile : numpy.ndarray
        1-D feature vector of the user, same width as ``item_features``.
    item_features : array-like or sparse matrix
        One row of features per book.
    book_to_idx : dict
        Maps book_id -> row index in ``item_features``. The row -> book
        lookup is derived from this argument rather than from a notebook
        global, so the function only depends on what it is given.
    train_books_set : set
        Book IDs to exclude from the result.
    top_k : int, default 50
        Maximum number of recommendations. Values <= 0 yield an empty list.

    Returns
    -------
    list of (book_id, similarity)
        At most ``top_k`` pairs, in descending order of similarity.
    """
    recommendations = []
    if top_k <= 0:
        # The loop below appends before checking the limit, so without this
        # guard a non-positive top_k would still return one item.
        return recommendations

    index_lookup = _index_to_book_lookup(book_to_idx)

    # Calculate similarity between user profile and all items
    user_profile_reshaped = user_profile.reshape(1, -1)
    similarities = cosine_similarity(user_profile_reshaped, item_features)[0]

    # Row indices ordered from most to least similar
    top_indices = np.argsort(similarities)[::-1]

    # Walk the ranking, skipping books already interacted with
    for idx in top_indices:
        book_id = index_lookup[idx]
        if book_id not in train_books_set:
            recommendations.append((book_id, similarities[idx]))
            if len(recommendations) >= top_k:
                break

    return recommendations

print("Recommendation function defined!")

Recommendation function defined!


In [13]:
def hit_at_k(recommended, actual, k):
    """Hit@K: 1.0 when at least one of the first k recommended items is relevant, else 0.0.

    ``recommended`` is a ranked sequence whose entries carry the item ID in
    position 0; ``actual`` is an iterable of relevant item IDs.
    """
    relevant_ids = set(actual)
    leading_ids = {entry[0] for entry in recommended[:k]}
    if leading_ids.isdisjoint(relevant_ids):
        return 0.0
    return 1.0

def mrr(recommended, actual):
    """Reciprocal rank of the first relevant item in ``recommended`` (0.0 when none is relevant).

    ``recommended`` is a ranked sequence of (item, score) pairs; ``actual`` is
    an iterable of relevant item IDs. Ranks are 1-based.
    """
    relevant_ids = set(actual)
    reciprocal_ranks = (
        1.0 / rank
        for rank, (candidate, _score) in enumerate(recommended, start=1)
        if candidate in relevant_ids
    )
    return next(reciprocal_ranks, 0.0)

def ndcg_at_k(recommended, actual, k):
    """Normalized Discounted Cumulative Gain at K with binary relevance.

    Parameters
    ----------
    recommended : sequence
        Ranked entries whose position 0 holds the item ID.
    actual : iterable
        Relevant item IDs; repeated IDs count once.
    k : int
        Cut-off rank.

    Returns
    -------
    float
        DCG of the first k recommendations divided by the ideal DCG, or 0.0
        when there are no relevant items (or k is 0).
    """
    recommended_k = [item[0] for item in recommended[:k]]
    actual_set = set(actual)

    dcg = 0.0
    for i, item in enumerate(recommended_k):
        if item in actual_set:
            dcg += 1.0 / np.log2(i + 2)  # +2 because i is 0-indexed

    # Ideal DCG: all distinct relevant items ranked first. Counting distinct
    # IDs (not len(actual)) keeps duplicates in `actual` from inflating the
    # ideal and pushing a perfect ranking below 1.0.
    ideal_hits = min(len(actual_set), k)
    idcg = sum(1.0 / np.log2(i + 2) for i in range(ideal_hits))

    return dcg / idcg if idcg > 0 else 0.0

print("Evaluation metric functions defined!")

Evaluation metric functions defined!


In [14]:
print("Starting validation evaluation...")

# Per-user metric values for the validation split (one entry per evaluated user)
val_metrics = {
    'hit@5': [],
    'hit@10': [],
    'hit@50': [],
    'mrr': [],
    'ndcg@10': [],
    'ndcg@50': []
}

# Get validation users
val_users_list = val_df['user_id'].unique()
print(f"Total validation users: {len(val_users_list)}")

# Filter to only validation users who also appear in training (so they have profiles)
val_users_with_profiles = [u for u in val_users_list if u in user_profiles]
print(f"Validation users with training profiles: {len(val_users_with_profiles)}")

if len(val_users_with_profiles) > 0:
    print(f"Evaluating on {len(val_users_with_profiles)} validation users...")

    # Group both splits once up front; masking the full frames inside the loop
    # would rescan every interaction for every user.
    train_books_by_user = {
        uid: set(books.values)
        for uid, books in train_df.groupby('user_id')['book_id']
    }
    val_books_by_user = {
        uid: books.values
        for uid, books in val_df.groupby('user_id')['book_id']
    }

    # Evaluation on validation set
    for i, user_id in enumerate(val_users_with_profiles):
        if i % 500 == 0:
            print(f"Processing user {i+1}/{len(val_users_with_profiles)}...")

        # Get user's training books (to exclude from recommendations)
        train_books_set = train_books_by_user.get(user_id, set())

        # Get user's validation books (ground truth)
        val_books = val_books_by_user.get(user_id, ())

        if len(val_books) == 0:
            continue

        # Get recommendations (top 50 covers the largest cut-off used below)
        user_profile = user_profiles[user_id]
        recommendations = get_recommendations(
            user_id, user_profile, item_features, book_to_idx, 
            train_books_set, top_k=50
        )

        # Calculate metrics
        val_metrics['hit@5'].append(hit_at_k(recommendations, val_books, 5))
        val_metrics['hit@10'].append(hit_at_k(recommendations, val_books, 10))
        val_metrics['hit@50'].append(hit_at_k(recommendations, val_books, 50))
        val_metrics['mrr'].append(mrr(recommendations, val_books))
        val_metrics['ndcg@10'].append(ndcg_at_k(recommendations, val_books, 10))
        val_metrics['ndcg@50'].append(ndcg_at_k(recommendations, val_books, 50))

    print(f"\nValidation evaluation complete! Evaluated {len(val_metrics['hit@5'])} users.")
else:
    print("\n‚ö†Ô∏è No validation users have training profiles. Skipping validation evaluation.")

Starting validation evaluation...
Total validation users: 6227
Validation users with training profiles: 6227
Evaluating on 6227 validation users...
Processing user 1/6227...
Processing user 501/6227...
Processing user 1001/6227...
Processing user 1501/6227...
Processing user 2001/6227...
Processing user 2501/6227...
Processing user 3001/6227...
Processing user 3501/6227...
Processing user 4001/6227...
Processing user 4501/6227...
Processing user 5001/6227...
Processing user 5501/6227...
Processing user 6001/6227...

Validation evaluation complete! Evaluated 6227 users.


In [15]:
# Report the validation metrics averaged over all evaluated users
print("\n" + "="*60)
print("VALIDATION SET - EVALUATION RESULTS")
print("="*60)
print(f"\nDataset Statistics:")
for stat_label, stat_value in (
    ("Total Books", len(all_books)),
    ("Training Users", len(user_profiles)),
    ("Validation Users", len(val_users_list)),
    ("Training Interactions", len(train_df)),
    ("Validation Interactions", len(val_df)),
):
    print(f"  {stat_label}: {stat_value}")

if len(val_metrics['hit@5']) > 0:
    print(f"  Evaluated Users: {len(val_metrics['hit@5'])}")

    # Mean of each per-user metric list
    print(f"\n{'Metric':<15} {'Score':<10}")
    print("-" * 25)
    for metric_label, metric_key in (
        ('Hit@5', 'hit@5'),
        ('Hit@10', 'hit@10'),
        ('Hit@50', 'hit@50'),
        ('MRR', 'mrr'),
        ('NDCG@10', 'ndcg@10'),
        ('NDCG@50', 'ndcg@50'),
    ):
        print(f"{metric_label:<15} {np.mean(val_metrics[metric_key]):.4f}")

    print("\n" + "="*60)

    # Number (and share) of users with at least one hit at each cut-off
    print(f"\nüìä Validation Hit Coverage:")
    for metric_label, metric_key in (('Hit@5', 'hit@5'), ('Hit@10', 'hit@10'), ('Hit@50', 'hit@50')):
        per_user = val_metrics[metric_key]
        users_hit = sum(1 for x in per_user if x > 0)
        print(f"  ‚Ä¢ Users with {metric_label}: {users_hit} ({users_hit/len(per_user)*100:.1f}%)")
else:
    print("\n‚ö†Ô∏è No validation metrics available.")
    print("Skipping to test set evaluation...")

print("\n" + "="*60)


VALIDATION SET - EVALUATION RESULTS

Dataset Statistics:
  Total Books: 127302
  Training Users: 63723
  Validation Users: 6227
  Training Interactions: 142360
  Validation Interactions: 12799
  Evaluated Users: 6227

Metric          Score     
-------------------------
Hit@5           0.1734
Hit@10          0.2322
Hit@50          0.3808
MRR             0.1118
NDCG@10         0.1085
NDCG@50         0.1374


üìä Validation Hit Coverage:
  ‚Ä¢ Users with Hit@5: 1080 (17.3%)
  ‚Ä¢ Users with Hit@10: 1446 (23.2%)
  ‚Ä¢ Users with Hit@50: 2371 (38.1%)



In [16]:
print("\n" + "="*60)
print("Starting test evaluation...")
print("="*60)

# Per-user metric values for the test split (one entry per evaluated user)
test_metrics = {
    'hit@5': [],
    'hit@10': [],
    'hit@50': [],
    'mrr': [],
    'ndcg@10': [],
    'ndcg@50': []
}

# Get test users
test_users_list = test_df['user_id'].unique()
print(f"Total test users: {len(test_users_list)}")

# Filter to only test users who also appear in training (so they have profiles)
test_users_with_profiles = [u for u in test_users_list if u in user_profiles]
print(f"Test users with training profiles: {len(test_users_with_profiles)}")

if len(test_users_with_profiles) > 0:
    print(f"Evaluating on {len(test_users_with_profiles)} test users...")

    # Group both splits once up front; masking the full frames inside the loop
    # would rescan every interaction for every user.
    train_books_by_user = {
        uid: set(books.values)
        for uid, books in train_df.groupby('user_id')['book_id']
    }
    test_books_by_user = {
        uid: books.values
        for uid, books in test_df.groupby('user_id')['book_id']
    }

    # Evaluation on test set
    for i, user_id in enumerate(test_users_with_profiles):
        if i % 500 == 0:
            print(f"Processing user {i+1}/{len(test_users_with_profiles)}...")

        # Get user's training books (to exclude from recommendations)
        train_books_set = train_books_by_user.get(user_id, set())

        # Get user's test books (ground truth)
        test_books = test_books_by_user.get(user_id, ())

        if len(test_books) == 0:
            continue

        # Get recommendations (top 50 covers the largest cut-off used below)
        user_profile = user_profiles[user_id]
        recommendations = get_recommendations(
            user_id, user_profile, item_features, book_to_idx, 
            train_books_set, top_k=50
        )

        # Calculate metrics
        test_metrics['hit@5'].append(hit_at_k(recommendations, test_books, 5))
        test_metrics['hit@10'].append(hit_at_k(recommendations, test_books, 10))
        test_metrics['hit@50'].append(hit_at_k(recommendations, test_books, 50))
        test_metrics['mrr'].append(mrr(recommendations, test_books))
        test_metrics['ndcg@10'].append(ndcg_at_k(recommendations, test_books, 10))
        test_metrics['ndcg@50'].append(ndcg_at_k(recommendations, test_books, 50))

    print(f"\nTest evaluation complete! Evaluated {len(test_metrics['hit@5'])} users.")
else:
    print("\n‚ö†Ô∏è No test users have training profiles.")


Starting test evaluation...
Total test users: 29368
Test users with training profiles: 29368
Evaluating on 29368 test users...
Processing user 1/29368...
Processing user 501/29368...
Processing user 1001/29368...
Processing user 1501/29368...
Processing user 2001/29368...
Processing user 2501/29368...
Processing user 3001/29368...
Processing user 3501/29368...
Processing user 4001/29368...
Processing user 4501/29368...
Processing user 5001/29368...
Processing user 5501/29368...
Processing user 6001/29368...
Processing user 6501/29368...
Processing user 7001/29368...
Processing user 7501/29368...
Processing user 8001/29368...
Processing user 8501/29368...
Processing user 9001/29368...
Processing user 9501/29368...
Processing user 10001/29368...
Processing user 10501/29368...
Processing user 11001/29368...
Processing user 11501/29368...
Processing user 12001/29368...
Processing user 12501/29368...
Processing user 13001/29368...
Processing user 13501/29368...
Processing user 14001/29368.

In [22]:
# Report the test metrics averaged over all evaluated users
print("\n" + "="*60)
print("TEST SET - EVALUATION RESULTS")
print("="*60)
print(f"\nDataset Statistics:")
for stat_label, stat_value in (
    ("Total Books", len(all_books)),
    ("Training Users", len(user_profiles)),
    ("Test Users", len(test_users_list)),
    ("Training Interactions", len(train_df)),
    ("Test Interactions", len(test_df)),
):
    print(f"  {stat_label}: {stat_value}")

if len(test_metrics['hit@5']) > 0:
    print(f"  Evaluated Users: {len(test_metrics['hit@5'])}")

    # Mean of each per-user metric list
    print(f"\n{'Metric':<15} {'Score':<10}")
    print("-" * 25)
    for metric_label, metric_key in (
        ('Hit@5', 'hit@5'),
        ('Hit@10', 'hit@10'),
        ('Hit@50', 'hit@50'),
        ('MRR', 'mrr'),
        ('NDCG@10', 'ndcg@10'),
        ('NDCG@50', 'ndcg@50'),
    ):
        print(f"{metric_label:<15} {np.mean(test_metrics[metric_key]):.4f}")

    print("\n" + "="*60)

    # Number (and share) of users with at least one hit at each cut-off
    print(f"\nüìä Test Hit Coverage:")
    for metric_label, metric_key in (('Hit@5', 'hit@5'), ('Hit@10', 'hit@10'), ('Hit@50', 'hit@50')):
        per_user = test_metrics[metric_key]
        users_hit = sum(1 for x in per_user if x > 0)
        print(f"  ‚Ä¢ Users with {metric_label}: {users_hit} ({users_hit/len(per_user)*100:.1f}%)")
else:
    print("\n‚ö†Ô∏è No test metrics available.")

print("\n" + "="*60)


TEST SET - EVALUATION RESULTS

Dataset Statistics:
  Total Books: 127302
  Training Users: 63723
  Test Users: 29368
  Training Interactions: 142360
  Test Interactions: 50765
  Evaluated Users: 29368

Metric          Score     
-------------------------
Hit@5           0.2554
Hit@10          0.3147
Hit@50          0.4373
MRR             0.1803
NDCG@10         0.1706
NDCG@50         0.1968


üìä Test Hit Coverage:
  ‚Ä¢ Users with Hit@5: 7501 (25.5%)
  ‚Ä¢ Users with Hit@10: 9242 (31.5%)
  ‚Ä¢ Users with Hit@50: 12844 (43.7%)



In [23]:
print("="*80)
print("DEMONSTRATION: SHOWING ACTUAL RECOMMENDATIONS VS GROUND TRUTH (TEST SET)")
print("="*80)


def _book_record(book_id):
    """Return the first df_books row whose book_id equals ``book_id``."""
    return df_books[df_books['book_id'] == book_id].iloc[0]


# Walk through a handful of randomly chosen test users
if len(test_users_with_profiles) == 0:
    print("\n‚ö†Ô∏è No test users available for demonstration.")
else:
    np.random.seed(42)
    n_demo_users = min(5, len(test_users_with_profiles))
    demo_users = np.random.choice(test_users_with_profiles, n_demo_users, replace=False)

    for user_idx, user_id in enumerate(demo_users):
        print(f"\n{'='*80}")
        print(f"USER {user_idx + 1}: {user_id}")
        print(f"{'='*80}")

        # Books the profile was built from (only the first three are listed)
        train_books_list = train_df[train_df['user_id'] == user_id]['book_id'].values
        print(f"\nüìö TRAINING BOOKS (what user read): {len(train_books_list)} books")
        for position, book_id in enumerate(train_books_list[:3], start=1):
            book_info = _book_record(book_id)
            print(f"  {position}. {book_info['book_title']}")
        if len(train_books_list) > 3:
            print(f"  ... and {len(train_books_list) - 3} more")

        # Held-out books for this user: the ground truth
        test_books_list = test_df[test_df['user_id'] == user_id]['book_id'].values
        print(f"\nüéØ GROUND TRUTH (actual test books): {len(test_books_list)} books")
        for position, book_id in enumerate(test_books_list, start=1):
            book_info = _book_record(book_id)
            print(f"  {position}. {book_info['book_title']}")

        # Ten recommendations, excluding the training books
        user_profile = user_profiles[user_id]
        train_books_set = set(train_books_list)
        recommendations = get_recommendations(
            user_id, user_profile, item_features, book_to_idx,
            train_books_set, top_k=10
        )

        # List them and remember the 1-based ranks that hit the ground truth
        print(f"\nüí° TOP-10 RECOMMENDATIONS:")
        hits = []
        for position, (book_id, score) in enumerate(recommendations[:10], start=1):
            book_info = _book_record(book_id)
            matched = book_id in test_books_list
            is_hit = "‚úì HIT!" if matched else ""
            print(f"  {position}. {book_info['book_title'][:60]:60s} (score: {score:.4f}) {is_hit}")
            if matched:
                hits.append(position)

        # Per-user metrics derived from the hit ranks
        hit10 = 1.0 if hits else 0.0
        mrr_val = (1.0 / hits[0]) if hits else 0.0

        print(f"\nüìä USER METRICS:")
        print(f"  ‚Ä¢ Hit@10: {hit10}")
        print(f"  ‚Ä¢ MRR: {mrr_val:.4f}")
        if hits:
            print(f"  ‚Ä¢ Hits at positions: {hits}")

DEMONSTRATION: SHOWING ACTUAL RECOMMENDATIONS VS GROUND TRUTH (TEST SET)

USER 1: USER41435

üìö TRAINING BOOKS (what user read): 3 books
  1. ‡¶á‡¶Ç‡¶≤‡¶ø‡¶∂‡ßá ‡¶¶‡ßÅ‡¶∞‡ßç‡¶¨‡¶≤‡¶¶‡ßá‡¶∞ ‡¶ú‡¶®‡ßç‡¶Ø
  2. ‡¶á‡¶Ç‡¶≤‡¶ø‡¶∂‡ßá ‡¶¶‡ßÅ‡¶∞‡ßç‡¶¨‡¶≤‡¶¶‡ßá‡¶∞ ‡¶ú‡¶®‡ßç‡¶Ø English Therapy
  3. ‡¶∏‡¶π‡¶ú ‡¶≠‡¶æ‡¶∑‡¶æ‡ßü ‡¶á‡¶Ç‡¶≤‡¶ø‡¶∂ ‡¶ó‡ßç‡¶∞‡¶æ‡¶Æ‡¶æ‡¶∞

üéØ GROUND TRUTH (actual test books): 2 books
  1. ‡¶Æ‡ßç‡¶Ø‡¶æ‡¶∏‡ßá‡¶ú
  2. ‡¶á‡¶Ç‡¶≤‡¶ø‡¶∂‡ßá ‡¶¶‡ßÅ‡¶∞‡ßç‡¶¨‡¶≤‡¶¶‡ßá‡¶∞ ‡¶ú‡¶®‡ßç‡¶Ø VOCAB THERAPY

üí° TOP-10 RECOMMENDATIONS:
  1. ‡¶á‡¶Ç‡¶≤‡¶ø‡¶∂ ‡¶•‡ßá‡¶∞‡¶æ‡¶™‡¶ø ‡¶™‡ßç‡¶∞‡¶æ‡¶ï‡¶ü‡¶ø‡¶∏ ‡¶¨‡ßÅ‡¶ï                                    (score: 0.9169) 
  2. ‡¶á‡¶Ç‡¶≤‡¶ø‡¶∂‡ßá ‡¶¶‡ßÅ‡¶∞‡ßç‡¶¨‡¶≤‡¶¶‡ßá‡¶∞ ‡¶ú‡¶®‡ßç‡¶Ø VOCAB THERAPY                          (score: 0.8255) ‚úì HIT!
  3. ‡¶á‡¶Ç‡¶≤‡¶ø‡¶∂‡ßá ‡¶¶‡ßÅ‡¶∞‡ßç‡¶¨‡¶≤‡¶¶‡ßá‡¶∞ ‡¶ú‡¶®‡ßç‡¶Ø English Therapy ‡¶™‡ßç‡¶Ø‡¶æ‡¶ï‡ßá‡¶ú                (score: 0.7619) 
  4. ‡¶á‡¶Ç‡¶≤‡¶ø‡¶∂‡ßá ‡¶¶‡ßÅ‡¶∞‡ßç‡¶¨‡¶≤‡¶¶‡ßá‡¶∞ ‡¶ú‡¶®‡ßç‡¶Ø ‡¶™‡¶æ

In [24]:
def _print_split_summary(heading, noun, metrics):
    """Print hit coverage and mean metrics for one split.

    ``heading`` is the upper-case split name used in section titles, ``noun``
    the capitalised name used in the user count line, and ``metrics`` the
    dict of per-user metric lists. Returns (hit@5 users, hit@10 users,
    hit@50 users, total users).
    """
    print("\n" + "="*80)
    print(f"{heading} SET SUMMARY")
    print("="*80)

    # Users with at least one hit at each cut-off
    hit_user_counts = [
        sum(1 for x in metrics[key] if x > 0)
        for key in ('hit@5', 'hit@10', 'hit@50')
    ]
    total_users = len(metrics['hit@5'])

    print(f"\nüìä {heading} HIT COVERAGE STATISTICS:")
    print("-" * 80)
    print(f"  ‚Ä¢ Total {noun} Users: {total_users}")
    for cutoff, user_count in zip((5, 10, 50), hit_user_counts):
        print(f"  ‚Ä¢ Users with Hit@{cutoff}: {user_count} ({user_count/total_users*100:.1f}%)")

    print(f"\nüìà {heading} AVERAGE METRICS:")
    print("-" * 80)
    # Prefixes are spelled out in full so the column alignment is explicit
    for line_prefix, key in (
        ("  ‚Ä¢ Hit@5:    ", 'hit@5'),
        ("  ‚Ä¢ Hit@10:   ", 'hit@10'),
        ("  ‚Ä¢ Hit@50:   ", 'hit@50'),
        ("  ‚Ä¢ MRR:      ", 'mrr'),
        ("  ‚Ä¢ NDCG@10:  ", 'ndcg@10'),
        ("  ‚Ä¢ NDCG@50:  ", 'ndcg@50'),
    ):
        print(f"{line_prefix}{np.mean(metrics[key]):.4f}")

    return (*hit_user_counts, total_users)


print("\n" + "="*80)
print("OVERALL PERFORMANCE SUMMARY")
print("="*80)

# Each section is skipped when its split produced no per-user metrics
if len(val_metrics['hit@5']) > 0:
    (users_with_hit5_val, users_with_hit10_val,
     users_with_hit50_val, total_val_users) = _print_split_summary(
        "VALIDATION", "Validation", val_metrics)

if len(test_metrics['hit@5']) > 0:
    (users_with_hit5_test, users_with_hit10_test,
     users_with_hit50_test, total_test_users) = _print_split_summary(
        "TEST", "Test", test_metrics)

print("\n" + "="*80)
print("‚úÖ EVALUATION COMPLETE!")
print("="*80)


OVERALL PERFORMANCE SUMMARY

VALIDATION SET SUMMARY

📊 VALIDATION HIT COVERAGE STATISTICS:
--------------------------------------------------------------------------------
  • Total Validation Users: 6227
  • Users with Hit@5: 1080 (17.3%)
  • Users with Hit@10: 1446 (23.2%)
  • Users with Hit@50: 2371 (38.1%)

📈 VALIDATION AVERAGE METRICS:
--------------------------------------------------------------------------------
  • Hit@5:    0.1734
  • Hit@10:   0.2322
  • Hit@50:   0.3808
  • MRR:      0.1118
  • NDCG@10:  0.1085
  • NDCG@50:  0.1374

TEST SET SUMMARY

📊 TEST HIT COVERAGE STATISTICS:
--------------------------------------------------------------------------------
  • Total Test Users: 29368
  • Users with Hit@5: 7501 (25.5%)
  • Users with Hit@10: 9242 (31.5%)
  • Users with Hit@50: 12844 (43.7%)

📈 TEST AVERAGE METRICS:
--------------------------------------------------------------------------------
  • Hit@5:    0.2554
  • Hit@10

In [25]:
print("="*80)
print("CONTENT FEATURE ANALYSIS")
print("="*80)

# Number of columns contributed by each feature block.
# NOTE: despite the "*_importance" names (kept so any other cell reading them
# keeps working), these are plain column counts, not learned importances.
author_importance = author_features.shape[1]
category_importance = category_features.shape[1]
publisher_importance = publisher_features.shape[1]
review_importance = review_features.shape[1]

total_features = author_importance + category_importance + publisher_importance + review_importance
# NOTE(review): presumably total_features equals item_features.shape[1];
# confirm against the cell that builds item_features.

feature_group_sizes = (
    ("Author", author_importance),
    ("Category", category_importance),
    ("Publisher", publisher_importance),
    ("Review", review_importance),
)

print("\n📊 FEATURE DISTRIBUTION IN MODEL:")
print("-" * 80)
for group_name, n_columns in feature_group_sizes:
    # Labels are left-padded to 21 characters so the counts line up.
    group_label = f"{group_name} features:"
    # Guard the share against an empty feature space.
    share = n_columns / total_features * 100 if total_features else 0.0
    print(f"  • {group_label:<21}{n_columns:6d} ({share:5.2f}%)")
total_label = "TOTAL:"
print(f"  • {total_label:<21}{total_features:6d} (100.00%)")

# Sparsity is the fraction of matrix cells that are not stored as non-zero
# (.nnz); rows are reported as books.
n_rows, n_cols = item_features.shape
n_cells = n_rows * n_cols
density = item_features.nnz / n_cells if n_cells else 0.0
avg_nonzero_per_book = item_features.nnz / n_rows if n_rows else 0.0

print("\n🔍 FEATURE SPACE ANALYSIS:")
print("-" * 80)
print(f"  • Feature sparsity: {1 - density:.4f}")
print(f"  • Non-zero elements: {item_features.nnz:,}")
print(f"  • Average non-zero per book: {avg_nonzero_per_book:.2f}")

print("\n" + "="*80)
print("🎉 CONTENT-BASED RECOMMENDATION SYSTEM COMPLETE!")
print("="*80)

CONTENT FEATURE ANALYSIS

📊 FEATURE DISTRIBUTION IN MODEL:
--------------------------------------------------------------------------------
  • Author features:      16572 (64.19%)
  • Category features:     1493 ( 5.78%)
  • Publisher features:    2752 (10.66%)
  • Review features:       5000 (19.37%)
  • TOTAL:                25817 (100.00%)

🔍 FEATURE SPACE ANALYSIS:
--------------------------------------------------------------------------------
  • Feature sparsity: 0.9987
  • Non-zero elements: 4,212,492
  • Average non-zero per book: 33.09

🎉 CONTENT-BASED RECOMMENDATION SYSTEM COMPLETE!
