# Matrix Factorization with NumPy/SciPy

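Matrix factorization for book recommendations using NumPy, SciPy (truncated SVD) and scikit-learn (NMF). We decompose the sparse user-book rating matrix (users × books) into low-rank factors and derive book-to-book similarities from the resulting book factor vectors, i.e. from how users rate the books.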

In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF
import matplotlib.pyplot as plt
import wandb
import warnings
warnings.filterwarnings('ignore')

DATA_DIR = "../data"

In [None]:
# Initialize wandb
# Start a Weights & Biases run for this notebook. The returned handle is kept
# in `run` so that the last cell can close it with run.finish().
run = wandb.init(
    project="book-recommendation-kaggle",
    group="matrix-factorization-numpy",
    job_type="train",
    save_code=True,
)

## Data Loading and Preprocessing

In [None]:
# Read the three zipped CSV tables (books, ratings, users) from DATA_DIR.
print("Loading data...")
books, ratings, users = (
    pd.read_csv(f"{DATA_DIR}/{stem}.csv.zip", compression="zip")
    for stem in ("Books", "Ratings", "Users")
)

# Row counts of each table
print(f"Books: {books.shape[0]:,}")
print(f"Ratings: {ratings.shape[0]:,}")
print(f"Users: {users.shape[0]:,}")

# How many ratings fall on each rating value, ordered by rating value
print("\nRating distribution:")
print(ratings['Book-Rating'].value_counts(sort=False).sort_index())

In [None]:
# Restrict the ratings to users and books with at least `min_interactions`
# ratings, then cap the number of users/books to keep the matrix tractable.
print("\n=== Filtering for Active Users and Popular Books ===")

min_interactions = 5

# Number of ratings per user and per book, computed on the full ratings table
user_counts = ratings.groupby('User-ID').size()
book_counts = ratings.groupby('ISBN').size()

# IDs that reach the interaction threshold
active_users = user_counts.loc[user_counts >= min_interactions].index
popular_books = book_counts.loc[book_counts >= min_interactions].index

print(f"Active users (≥{min_interactions} ratings): {len(active_users):,} / {len(user_counts):,}")
print(f"Popular books (≥{min_interactions} ratings): {len(popular_books):,} / {len(book_counts):,}")

# A rating survives only if both its user and its book passed the threshold
is_active_user = ratings['User-ID'].isin(active_users)
is_popular_book = ratings['ISBN'].isin(popular_books)
filtered_ratings = ratings.loc[is_active_user & is_popular_book].copy()

print(f"Filtered ratings: {len(filtered_ratings):,} / {len(ratings):,} ({len(filtered_ratings)/len(ratings)*100:.1f}%)")

# Upper bounds on matrix size (computational efficiency)
max_users = 10000
max_books = 5000

# Too many users: keep only the `max_users` with the most ratings overall
if len(active_users) > max_users:
    top_users = user_counts.nlargest(max_users).index
    filtered_ratings = filtered_ratings.loc[filtered_ratings['User-ID'].isin(top_users)]
    print(f"Sampled to top {max_users:,} most active users")

# Too many books: keep only the `max_books` with the most ratings overall
if len(popular_books) > max_books:
    top_books = book_counts.nlargest(max_books).index
    filtered_ratings = filtered_ratings.loc[filtered_ratings['ISBN'].isin(top_books)]
    print(f"Sampled to top {max_books:,} most popular books")

print(f"Final filtered ratings: {len(filtered_ratings):,}")

In [None]:
# Assign every remaining user and book a dense 0-based matrix index.
print("\n=== Creating User-Book Rating Matrix ===")

# Sorted distinct IDs; the position in these lists is the matrix index
unique_users = sorted(filtered_ratings['User-ID'].unique())
unique_books = sorted(filtered_ratings['ISBN'].unique())

# index -> ID lookups, and their inverses (ID -> index)
idx_to_user = dict(enumerate(unique_users))
idx_to_book = dict(enumerate(unique_books))
user_to_idx = {user_id: idx for idx, user_id in idx_to_user.items()}
book_to_idx = {book_id: idx for idx, book_id in idx_to_book.items()}

# Attach the matrix coordinates to every rating row
filtered_ratings['user_idx'] = filtered_ratings['User-ID'].map(user_to_idx)
filtered_ratings['book_idx'] = filtered_ratings['ISBN'].map(book_to_idx)

n_users, n_books = len(unique_users), len(unique_books)
n_possible = n_users * n_books

print(f"Matrix dimensions: {n_users:,} users × {n_books:,} books")
print(f"Total possible ratings: {n_possible:,}")
print(f"Actual ratings: {len(filtered_ratings):,}")
print(f"Sparsity: {(1 - len(filtered_ratings) / n_possible) * 100:.2f}%")

In [None]:
# Build title/author lookup tables for the books present in the rating matrix.
print("\n=== Creating Book Metadata ===")

# Keep only metadata rows whose ISBN survived the filtering, and tag each row
# with the book's matrix index.
book_metadata = books[books['ISBN'].isin(unique_books)].copy()
book_metadata['book_idx'] = book_metadata['ISBN'].map(book_to_idx)

# Pull the needed columns out once; iterating plain lists is far cheaper than
# DataFrame.iterrows(), which builds a Series object for every row.
_meta_indices = book_metadata['book_idx'].tolist()
_meta_titles = book_metadata['Book-Title'].tolist()
_meta_authors = book_metadata['Book-Author'].tolist()

# NOTE: when several rows share a title (or an index), the last row wins,
# because later dictionary entries overwrite earlier ones.
title_to_idx = dict(zip(_meta_titles, _meta_indices))
idx_to_title = dict(zip(_meta_indices, _meta_titles))
idx_to_author = dict(zip(_meta_indices, _meta_authors))

print(f"Book metadata for {len(book_metadata):,} books")

# Show the first few books as a sanity check
print("\nSample books:")
for _title, _author in zip(_meta_titles[:5], _meta_authors[:5]):
    print(f"  {_title} by {_author}")

## Create Rating Matrix

In [None]:
# Assemble the sparse users × books rating matrix and, when small enough,
# a dense copy of it.
print("\n=== Building Rating Matrix ===")

# Coordinates and values of every rating
user_indices = filtered_ratings['user_idx'].to_numpy()
book_indices = filtered_ratings['book_idx'].to_numpy()
rating_values = filtered_ratings['Book-Rating'].to_numpy().astype(np.float32)

# CSR matrix with one row per user and one column per book
rating_matrix = csr_matrix(
    (rating_values, (user_indices, book_indices)),
    shape=(n_users, n_books),
    dtype=np.float32,
)

_sparse_bytes = sum(
    part.nbytes
    for part in (rating_matrix.data, rating_matrix.indices, rating_matrix.indptr)
)

print(f"Rating matrix shape: {rating_matrix.shape}")
print(f"Non-zero entries: {rating_matrix.nnz:,}")
print(f"Density: {rating_matrix.nnz / (n_users * n_books) * 100:.3f}%")
print(f"Memory usage: {_sparse_bytes} bytes")

# Size a dense float32 copy would need (4 bytes per cell), in MiB
matrix_size_mb = (n_users * n_books * 4) / (1024**2)
print(f"Dense matrix would be: {matrix_size_mb:.1f} MB")

# Densify only below the 500 MB budget
use_dense = matrix_size_mb < 500
if use_dense:
    print("Converting to dense matrix for easier manipulation...")
    rating_matrix_dense = rating_matrix.toarray()
else:
    print("Keeping sparse matrix due to size")
    rating_matrix_dense = None

## Matrix Factorization Methods

In [None]:
def perform_svd_factorization(matrix, k=50):
    """
    Compute a truncated SVD of the rating matrix.

    Args:
        matrix: 2-D (sparse) matrix accepted by ``scipy.sparse.linalg.svds``.
        k: Requested number of singular triplets. ``svds`` requires
            ``k < min(matrix.shape)``, so a larger ``k`` is reduced to
            ``min(matrix.shape) - 1``.

    Returns:
        Tuple ``(U, sigma, Vt)`` with singular values sorted in descending
        order, such that ``matrix ≈ U @ diag(sigma) @ Vt``.

    Raises:
        ValueError: If the matrix is too small for any truncated SVD
            (``min(matrix.shape) < 2``).
    """
    print(f"\nPerforming SVD with k={k} factors...")

    # The limit applies to the smaller of the two dimensions, not only to the
    # number of columns.
    max_k = min(matrix.shape) - 1
    if max_k < 1:
        raise ValueError(
            f"Matrix of shape {matrix.shape} is too small for a truncated SVD"
        )
    if k > max_k:
        k = max_k
        print(f"Reduced k to {k} due to matrix dimensions")

    # Use sparse SVD
    U, sigma, Vt = svds(matrix, k=k)

    # svds does not return the factors in descending order; reorder all three
    # consistently so that factor 0 is the strongest one.
    order = np.argsort(sigma)[::-1]
    U = U[:, order]
    sigma = sigma[order]
    Vt = Vt[order, :]

    print(f"SVD completed:")
    print(f"  U shape: {U.shape} (users × factors)")
    print(f"  Sigma shape: {sigma.shape} (singular values)")
    print(f"  Vt shape: {Vt.shape} (factors × books)")
    print(f"  Top 5 singular values: {sigma[:5]}")

    return U, sigma, Vt

def perform_nmf_factorization(matrix, k=50, max_iter=200):
    """
    Perform Non-negative Matrix Factorization on the given rating matrix.

    The factorization is always computed from the ``matrix`` argument (never
    from notebook-level globals), so the function can be applied to any
    matrix. Sparse input stays sparse: scikit-learn's NMF accepts sparse
    matrices, which avoids allocating a dense users × books array.

    Args:
        matrix: Rating matrix (users × books), either a SciPy sparse matrix
            or something convertible to a 2-D NumPy array.
        k: Number of latent factors (``n_components``).
        max_iter: Maximum number of solver iterations.

    Returns:
        Tuple ``(W, H, nmf)`` where ``W`` is users × factors, ``H`` is
        factors × books, ``matrix ≈ W @ H``, and ``nmf`` is the fitted
        scikit-learn estimator.
    """
    from scipy.sparse import issparse

    print(f"\nPerforming NMF with k={k} factors...")

    # NMF requires non-negative input; clip any negative entries to zero
    # (ratings should already be non-negative).
    if issparse(matrix):
        # maximum(0) keeps the matrix sparse and only touches stored values.
        matrix_input = matrix.maximum(0)
    else:
        matrix_input = np.maximum(np.asarray(matrix), 0)

    nmf = NMF(n_components=k, max_iter=max_iter, random_state=42, alpha_W=0.1, alpha_H=0.1)
    W = nmf.fit_transform(matrix_input)  # Users × factors
    H = nmf.components_  # Factors × books

    print(f"NMF completed:")
    print(f"  W shape: {W.shape} (users × factors)")
    print(f"  H shape: {H.shape} (factors × books)")
    print(f"  Reconstruction error: {nmf.reconstruction_err_:.4f}")
    print(f"  Iterations: {nmf.n_iter_}")

    return W, H, nmf

def reconstruct_matrix(U=None, sigma=None, Vt=None, W=None, H=None, method='svd'):
    """
    Rebuild the (approximate) rating matrix from its factors.

    With ``method='svd'`` the result is ``U @ diag(sigma) @ Vt``; with
    ``method='nmf'`` it is ``W @ H``. Any other method raises ValueError.
    """
    if method not in ('svd', 'nmf'):
        raise ValueError("Method must be 'svd' or 'nmf'")

    if method == 'nmf':
        return W @ H

    # SVD: scale the user factors by the singular values, then project back
    # onto the book factors.
    scaled_users = U @ np.diag(sigma)
    return scaled_users @ Vt

In [None]:
# Run both factorizations with the same number of latent factors and report
# how well each reproduces the observed ratings.
k_factors = 50  # Number of latent factors

# SVD Factorization
U_svd, sigma_svd, Vt_svd = perform_svd_factorization(rating_matrix, k=k_factors)

# NMF Factorization
W_nmf, H_nmf, nmf_model = perform_nmf_factorization(rating_matrix, k=k_factors)

# Compute reconstruction errors
print("\n=== Reconstruction Analysis ===")

if use_dense:
    # Evaluate only on entries with a positive rating; zero cells are not
    # treated as observed ratings.
    mask = rating_matrix_dense > 0

    # For SVD
    reconstructed_svd = reconstruct_matrix(U_svd, sigma_svd, Vt_svd, method='svd')
    svd_error = np.mean((rating_matrix_dense[mask] - reconstructed_svd[mask]) ** 2)
    print(f"SVD RMSE on observed ratings: {np.sqrt(svd_error):.4f}")

    # For NMF
    reconstructed_nmf = reconstruct_matrix(W=W_nmf, H=H_nmf, method='nmf')
    nmf_error = np.mean((rating_matrix_dense[mask] - reconstructed_nmf[mask]) ** 2)
    print(f"NMF RMSE on observed ratings: {np.sqrt(nmf_error):.4f}")
else:
    # A full users × books reconstruction is exactly the dense array that was
    # judged too large to allocate, so do not build it here either.
    print("Rating matrix kept sparse; skipping dense reconstruction RMSE")

# Analyze the factors
print(f"\nFactor Analysis:")
# The ratio is normalised by the energy of the retained factors only, so it
# always reaches 1.0 at the last factor; it is not a share of total variance.
print(f"SVD - Cumulative energy share within the top {len(sigma_svd)} factors: {(sigma_svd**2).cumsum() / (sigma_svd**2).sum()}")
print(f"NMF - Factor magnitudes: {np.linalg.norm(H_nmf, axis=1)[:10]}")

## Book Similarity Analysis

In [None]:
def compute_book_similarities(method='svd'):
    """
    Compute pairwise cosine similarities between books in factor space.

    Book vectors are taken from the notebook-level factorization results:
    ``Vt_svd.T`` for ``method='svd'`` and ``H_nmf.T`` for ``method='nmf'``
    (both books × factors).

    Args:
        method: ``'svd'`` or ``'nmf'``.

    Returns:
        Tuple ``(book_similarity_matrix, book_factors)``. The diagonal of the
        similarity matrix is set to 0 so that a book is never reported as
        similar to itself.

    Raises:
        ValueError: If ``method`` is neither ``'svd'`` nor ``'nmf'``.
    """
    print(f"\n=== Computing Book Similarities using {method.upper()} ===")

    if method == 'svd':
        # Books are columns, so we transpose Vt to get (books × factors)
        book_factors = Vt_svd.T
    elif method == 'nmf':
        # H is (factors × books), so transpose to get (books × factors)
        book_factors = H_nmf.T
    else:
        raise ValueError("Method must be 'svd' or 'nmf'")

    print(f"Book factors shape: {book_factors.shape}")

    # Compute cosine similarity between all book pairs
    book_similarity_matrix = cosine_similarity(book_factors)

    print(f"Book similarity matrix shape: {book_similarity_matrix.shape}")
    print(f"Similarity range: [{book_similarity_matrix.min():.3f}, {book_similarity_matrix.max():.3f}]")

    # Remove self-similarities for analysis
    np.fill_diagonal(book_similarity_matrix, 0)

    # Select entries by position rather than by value: filtering on "!= 0"
    # would also discard genuine zero similarities between different books
    # and bias the statistics below.
    off_diagonal = ~np.eye(book_similarity_matrix.shape[0], dtype=bool)
    non_diag_similarities = book_similarity_matrix[off_diagonal]

    print(f"Non-diagonal similarity stats:")
    if non_diag_similarities.size:
        print(f"  Mean: {non_diag_similarities.mean():.4f}")
        print(f"  Std: {non_diag_similarities.std():.4f}")
        print(f"  95th percentile: {np.percentile(non_diag_similarities, 95):.4f}")
    else:
        # Fewer than two books: there are no pairs to summarise.
        print("  (no book pairs available)")

    return book_similarity_matrix, book_factors

# Compute similarities for both methods
# Each call returns the book-by-book similarity matrix together with the
# (books × factors) matrix it was computed from.
book_sim_svd, book_factors_svd = compute_book_similarities('svd')
book_sim_nmf, book_factors_nmf = compute_book_similarities('nmf')

## Recommendation Functions

In [None]:
class BookRecommender:
    """
    Item-to-item book recommender backed by a precomputed similarity matrix.

    Args:
        similarity_matrix: Square (books × books) array; row ``i`` holds the
            similarity of book ``i`` to every other book.
        book_factors: (books × factors) array of latent book vectors.
        method_name: Label of the factorization method, used in printed output.
        title_index: Optional mapping title -> book index. Defaults to the
            notebook-level ``title_to_idx``.
        titles_by_idx: Optional mapping book index -> title. Defaults to the
            notebook-level ``idx_to_title``.
        authors_by_idx: Optional mapping book index -> author. Defaults to the
            notebook-level ``idx_to_author``.

    The lookup tables are bound when the recommender is created; re-create
    the recommender after rebuilding the notebook-level tables.
    """

    def __init__(self, similarity_matrix, book_factors, method_name,
                 title_index=None, titles_by_idx=None, authors_by_idx=None):
        self.similarity_matrix = similarity_matrix
        self.book_factors = book_factors
        self.method_name = method_name
        # The notebook globals are only touched when no table is injected, so
        # the class also works outside the notebook.
        self.title_to_idx = title_to_idx if title_index is None else title_index
        self.idx_to_title = idx_to_title if titles_by_idx is None else titles_by_idx
        self.idx_to_author = idx_to_author if authors_by_idx is None else authors_by_idx

    def find_similar_books(self, book_title, k=10):
        """
        Return up to ``k`` books most similar to ``book_title``.

        The query book itself and books without title metadata are skipped,
        and the search continues down the ranking until ``k`` results are
        collected (or the catalogue is exhausted).

        Returns:
            List of ``(title, author, similarity)`` tuples, best match first.
            If the title is unknown, close title matches are printed and an
            empty list is returned.
        """
        # Check if book exists
        if book_title not in self.title_to_idx:
            return self._suggest_similar_titles(book_title)

        recommendations = []
        if k <= 0:
            return recommendations

        book_idx = self.title_to_idx[book_title]

        # Get similarity scores for this book
        similarities = self.similarity_matrix[book_idx]

        # Walk the full ranking instead of slicing the first k entries, so
        # that skipped entries do not shrink the result.
        for idx in np.argsort(similarities)[::-1]:
            idx = int(idx)
            if idx == book_idx or idx not in self.idx_to_title:
                continue
            recommendations.append((
                self.idx_to_title[idx],
                self.idx_to_author.get(idx, "Unknown Author"),
                similarities[idx],
            ))
            if len(recommendations) == k:
                break

        return recommendations

    def _suggest_similar_titles(self, book_title):
        """
        Print up to five titles containing ``book_title``; always return [].
        """
        matches = self.search_books(book_title, max_results=5)

        if matches:
            print(f"Book '{book_title}' not found. Did you mean one of these?")
            for i, (title, author) in enumerate(matches, 1):
                print(f"  {i}. {title} by {author}")
        else:
            print(f"Book '{book_title}' not found in the dataset.")

        return []

    def search_books(self, query, max_results=10):
        """
        Search for books whose title contains ``query`` (case-insensitive).

        Returns:
            List of ``(title, author)`` tuples, at most ``max_results`` long.
        """
        query_lower = query.lower()
        matches = [
            (title, self.idx_to_author.get(book_idx, "Unknown Author"))
            for title, book_idx in self.title_to_idx.items()
            # Missing titles can show up as non-string keys (e.g. NaN);
            # they cannot match a text query.
            if isinstance(title, str) and query_lower in title.lower()
        ]
        return matches[:max_results]

    def display_recommendations(self, book_title, recommendations, k=10):
        """
        Print up to ``k`` recommendations in a readable layout.

        Nothing is printed when ``recommendations`` is empty.
        """
        if not recommendations:
            return

        # Get info about the query book
        if book_title in self.title_to_idx:
            query_idx = self.title_to_idx[book_title]
            query_author = self.idx_to_author.get(query_idx, "Unknown Author")
            print(f"\n📚 Books similar to: '{book_title}' by {query_author}")
            print(f"Using {self.method_name} factorization")
        else:
            print(f"\n📚 Books similar to: '{book_title}'")

        print("=" * 80)

        for i, (title, author, similarity) in enumerate(recommendations[:k], 1):
            print(f"{i:2d}. {title}")
            print(f"    by {author}")
            print(f"    Similarity: {similarity:.3f}")
            print()

    def get_book_vector(self, book_title):
        """
        Return the latent factor vector of ``book_title``.

        Prints a message and returns ``None`` when the title is unknown.
        """
        if book_title not in self.title_to_idx:
            print(f"Book '{book_title}' not found")
            return None

        book_idx = self.title_to_idx[book_title]
        return self.book_factors[book_idx]

# Create recommender instances
# One recommender per factorization method, each wrapping that method's
# similarity matrix and book factor matrix.
recommender_svd = BookRecommender(book_sim_svd, book_factors_svd, "SVD")
recommender_nmf = BookRecommender(book_sim_nmf, book_factors_nmf, "NMF")

print("\n✅ Recommendation systems ready!")
print("Available recommenders: recommender_svd, recommender_nmf")

## Test Recommendations

In [None]:
# Smoke-test both recommenders with a handful of well-known titles.
test_queries = [
    "Harry Potter",
    "Lord of the Rings", 
    "To Kill a Mockingbird",
    "1984",
    "Pride and Prejudice"
]

for query in test_queries:
    print(f"\n{'='*60}")
    print(f"Searching for: {query}")

    # Partial-title search; the title tables are shared by both recommenders
    matches = recommender_svd.search_books(query, max_results=3)

    if not matches:
        print(f"No books found matching '{query}'")
        continue

    print(f"\nFound {len(matches)} matching books:")
    for i, (title, author) in enumerate(matches, 1):
        print(f"  {i}. {title} by {author}")

    # The first hit serves as the query book
    sample_book = matches[0][0]
    print(f"\nGetting recommendations for: {sample_book}")

    # Query both methods first, then print whichever returned results
    recs_svd = recommender_svd.find_similar_books(sample_book, k=5)
    recs_nmf = recommender_nmf.find_similar_books(sample_book, k=5)

    for heading, recs in (("SVD", recs_svd), ("NMF", recs_nmf)):
        if not recs:
            continue
        print(f"\n--- {heading} Recommendations ---")
        for i, (title, author, sim) in enumerate(recs, 1):
            print(f"{i}. {title} by {author} (sim: {sim:.3f})")

## Detailed Example: Harry Potter Recommendations

In [None]:
# Detailed walk-through for one Harry Potter title: recommendations from both
# methods plus a look at the book's factor vectors.
harry_potter_books = recommender_svd.search_books("harry potter", max_results=10)

if not harry_potter_books:
    print("No Harry Potter books found in the dataset")
else:
    print("Harry Potter books in dataset:")
    for i, (title, author) in enumerate(harry_potter_books, 1):
        print(f"  {i}. {title} by {author}")

    # The first match is the book analysed below
    hp_book = harry_potter_books[0][0]

    rule = '=' * 80
    print(f"\n{rule}")
    print(f"DETAILED ANALYSIS: {hp_book}")
    print(rule)

    # Query both recommenders before printing anything
    svd_recs = recommender_svd.find_similar_books(hp_book, k=10)
    nmf_recs = recommender_nmf.find_similar_books(hp_book, k=10)

    print("\n🔍 SVD-based Recommendations:")
    recommender_svd.display_recommendations(hp_book, svd_recs)

    print("\n🔍 NMF-based Recommendations:")
    recommender_nmf.display_recommendations(hp_book, nmf_recs)

    # Latent representation of the same book under each method
    svd_vector = recommender_svd.get_book_vector(hp_book)
    nmf_vector = recommender_nmf.get_book_vector(hp_book)

    print(f"\n📊 Factor Analysis for '{hp_book}':")
    print(f"SVD vector norm: {np.linalg.norm(svd_vector):.3f}")
    print(f"NMF vector norm: {np.linalg.norm(nmf_vector):.3f}")
    # These are the first five components of each vector (no sorting applied)
    print(f"Top 5 SVD factors: {svd_vector[:5]}")
    print(f"Top 5 NMF factors: {nmf_vector[:5]}")

## Visualization and Analysis

In [None]:
# Visualize similarity distributions and factor strengths for both methods
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Off-diagonal entries only. Selecting by position (instead of "!= 0") keeps
# genuine zero similarities between different books in the distribution.
svd_similarities = book_sim_svd[~np.eye(book_sim_svd.shape[0], dtype=bool)]
nmf_similarities = book_sim_nmf[~np.eye(book_sim_nmf.shape[0], dtype=bool)]

# SVD similarity distribution
axes[0,0].hist(svd_similarities, bins=50, alpha=0.7, edgecolor='black')
axes[0,0].set_title('SVD: Book Similarity Distribution')
axes[0,0].set_xlabel('Cosine Similarity')
axes[0,0].set_ylabel('Frequency')
axes[0,0].grid(True, alpha=0.3)

# NMF similarity distribution
axes[0,1].hist(nmf_similarities, bins=50, alpha=0.7, edgecolor='black', color='orange')
axes[0,1].set_title('NMF: Book Similarity Distribution')
axes[0,1].set_xlabel('Cosine Similarity')
axes[0,1].set_ylabel('Frequency')
axes[0,1].grid(True, alpha=0.3)

# Singular values for SVD
axes[1,0].plot(sigma_svd, 'o-', alpha=0.7)
axes[1,0].set_title('SVD: Singular Values')
axes[1,0].set_xlabel('Factor Index')
axes[1,0].set_ylabel('Singular Value')
axes[1,0].set_yscale('log')
axes[1,0].grid(True, alpha=0.3)

# Factor magnitudes for NMF (L2 norm of each row of H)
nmf_factor_norms = np.linalg.norm(H_nmf, axis=1)
axes[1,1].plot(nmf_factor_norms, 'o-', alpha=0.7, color='orange')
axes[1,1].set_title('NMF: Factor Magnitudes')
axes[1,1].set_xlabel('Factor Index')
axes[1,1].set_ylabel('L2 Norm')
axes[1,1].grid(True, alpha=0.3)

plt.tight_layout()

# Log to wandb BEFORE showing: plt.show() may release the current figure, in
# which case wandb.Image(plt) would capture an empty one. Passing the figure
# object makes the upload independent of pyplot's "current figure" state.
wandb.log({"analysis_plots": wandb.Image(fig)})

plt.show()

# Print comparison statistics
print("\n📊 Method Comparison:")
print(f"SVD similarities - Mean: {svd_similarities.mean():.4f}, Std: {svd_similarities.std():.4f}")
print(f"NMF similarities - Mean: {nmf_similarities.mean():.4f}, Std: {nmf_similarities.std():.4f}")
# Share of the retained factors' squared singular values that falls on the
# first ten factors (reduced to a scalar so it can be formatted).
top10_energy_share = (sigma_svd[:10] ** 2).sum() / (sigma_svd ** 2).sum() * 100
print(f"\nSVD energy share of top 10 factors (among retained factors): {top10_energy_share:.1f}%")
print(f"NMF reconstruction error: {nmf_model.reconstruction_err_:.4f}")

## Find Most Similar Book Pairs

In [None]:
def find_most_similar_pairs(similarity_matrix, method_name, top_k=10,
                            titles=None, authors=None):
    """
    Print the ``top_k`` most similar pairs of distinct books.

    Args:
        similarity_matrix: Square (books × books) similarity array.
        method_name: Label of the factorization method, used in the heading.
        top_k: Number of pairs to print.
        titles: Optional mapping book index -> title. Defaults to the
            notebook-level ``idx_to_title``.
        authors: Optional mapping book index -> author. Defaults to the
            notebook-level ``idx_to_author``.

    Pairs in which either book has no title metadata are skipped, and the
    ranking continues until ``top_k`` pairs were printed. Ties keep the
    row-major order of the upper triangle.
    """
    title_lookup = idx_to_title if titles is None else titles
    author_lookup = idx_to_author if authors is None else authors

    print(f"\n🔍 Most Similar Book Pairs ({method_name}):")
    print("=" * 60)

    # Upper triangle without the diagonal: every unordered pair exactly once.
    # Gathered with array indexing instead of a Python double loop, which
    # would create one tuple per pair.
    scores = np.asarray(similarity_matrix)
    first, second = np.triu_indices(scores.shape[0], k=1)
    pair_scores = scores[first, second]

    # Stable sort on the negated scores = descending order with ties left in
    # their original order.
    ranking = np.argsort(-pair_scores, kind="stable")

    shown = 0
    for pos in ranking:
        if shown >= top_k:
            break

        book1_idx, book2_idx = int(first[pos]), int(second[pos])
        if book1_idx not in title_lookup or book2_idx not in title_lookup:
            continue

        shown += 1
        print(f"{shown:2d}. Similarity: {pair_scores[pos]:.3f}")
        print(f"    📖 {title_lookup[book1_idx]}")
        print(f"       by {author_lookup.get(book1_idx, 'Unknown')}")
        print(f"    📖 {title_lookup[book2_idx]}")
        print(f"       by {author_lookup.get(book2_idx, 'Unknown')}")
        print()

# Find most similar pairs for both methods
# Prints the ten highest-scoring book pairs under each similarity matrix.
find_most_similar_pairs(book_sim_svd, "SVD", top_k=10)
find_most_similar_pairs(book_sim_nmf, "NMF", top_k=10)

## Interactive Recommendation Function

In [None]:
def interactive_recommendations():
    """
    Run a small read-eval-print loop for book recommendations.

    Supported commands:
      - ``<book title>``: print recommendations for that title
      - ``search <query>``: list titles containing the query
      - ``method svd`` / ``method nmf``: switch the active recommender
      - ``quit``: leave the loop

    Blank input is ignored, and end-of-input or Ctrl-C ends the session
    cleanly instead of raising.
    """
    print("\n🎯 Interactive Book Recommendation System")
    print("Commands:")
    print("  - Type a book title to get recommendations")
    print("  - 'search <query>' to search for books")
    print("  - 'method svd' or 'method nmf' to switch methods")
    print("  - 'quit' to exit")
    print("=" * 60)

    recommenders = {'svd': recommender_svd, 'nmf': recommender_nmf}
    current_recommender = recommender_svd  # Default to SVD

    while True:
        try:
            user_input = input(f"\n[{current_recommender.method_name}] Enter command: ").strip()
        except (EOFError, KeyboardInterrupt):
            # No more input (closed stdin) or the user interrupted the prompt.
            print("\nGoodbye! 📚")
            break

        # An empty title is a substring of every title, so a blank line would
        # otherwise produce a "did you mean" list of arbitrary books.
        if not user_input:
            continue

        command = user_input.lower()

        if command == 'quit':
            print("Goodbye! 📚")
            break

        if command.startswith('search '):
            query = user_input[len('search '):]
            matches = current_recommender.search_books(query, max_results=10)
            if matches:
                print(f"\n🔍 Found {len(matches)} books matching '{query}':")
                for i, (title, author) in enumerate(matches, 1):
                    print(f"  {i:2d}. {title} by {author}")
            else:
                print(f"❌ No books found matching '{query}'")
            continue

        if command.startswith('method '):
            method = command[len('method '):]
            if method in recommenders:
                current_recommender = recommenders[method]
                print(f"✅ Switched to {method.upper()} method")
            else:
                print("❌ Unknown method. Use 'svd' or 'nmf'")
            continue

        # Anything else is treated as a book title
        recommendations = current_recommender.find_similar_books(user_input, k=8)
        current_recommender.display_recommendations(user_input, recommendations, k=8)

# Uncomment to run interactive session
# interactive_recommendations()

## Save Results

In [None]:
# Write the factorization outputs, lookup tables and rating matrix to the
# current working directory, then print a run summary.
print("\n=== Saving Results ===")

import pickle
from scipy.sparse import save_npz

# SVD factors plus the derived book similarity data
svd_arrays = {
    'U': U_svd,
    'sigma': sigma_svd,
    'Vt': Vt_svd,
    'book_similarity': book_sim_svd,
    'book_factors': book_factors_svd,
}
np.savez_compressed('matrix_factorization_svd.npz', **svd_arrays)

# NMF factors plus the derived book similarity data
nmf_arrays = {
    'W': W_nmf,
    'H': H_nmf,
    'book_similarity': book_sim_nmf,
    'book_factors': book_factors_nmf,
}
np.savez_compressed('matrix_factorization_nmf.npz', **nmf_arrays)

# Lookup tables and scalar settings needed to interpret the saved arrays
mappings = dict(
    title_to_idx=title_to_idx,
    idx_to_title=idx_to_title,
    idx_to_author=idx_to_author,
    book_to_idx=book_to_idx,
    idx_to_book=idx_to_book,
    user_to_idx=user_to_idx,
    idx_to_user=idx_to_user,
    n_users=n_users,
    n_books=n_books,
    k_factors=k_factors,
)

with open('matrix_factorization_mappings.pkl', 'wb') as mappings_file:
    pickle.dump(mappings, mappings_file)

# Sparse rating matrix in SciPy's own .npz format
save_npz('rating_matrix.npz', rating_matrix)

print("✅ Saved:")
for saved_name in (
    'matrix_factorization_svd.npz',
    'matrix_factorization_nmf.npz',
    'matrix_factorization_mappings.pkl',
    'rating_matrix.npz',
):
    print(f"  - {saved_name}")

_sparsity_pct = (1 - len(filtered_ratings) / (n_users * n_books)) * 100

print(f"\n📊 Final Summary:")
print(f"  Users: {n_users:,}")
print(f"  Books: {n_books:,}")
print(f"  Ratings: {len(filtered_ratings):,}")
print(f"  Sparsity: {_sparsity_pct:.2f}%")
print(f"  Factors: {k_factors}")
# svd_error only exists when the dense matrix was built; otherwise an empty
# line is printed.
print(f"  SVD reconstruction quality: RMSE = {np.sqrt(svd_error):.4f}" if use_dense else "")
print(f"  NMF reconstruction error: {nmf_model.reconstruction_err_:.4f}")

In [None]:
# Finish wandb run
# Log the dataset dimensions and factorization summary as one nested
# 'final_metrics' entry, then close the run opened at the top of the notebook.
wandb.log({
    'final_metrics': {
        'n_users': n_users,
        'n_books': n_books,
        'n_ratings': len(filtered_ratings),
        # Percentage of user-book cells that have no rating
        'sparsity': (1 - len(filtered_ratings) / (n_users * n_books)) * 100,
        'k_factors': k_factors,
        # NOTE(review): normalised by the retained factors' squared singular
        # values only, not by the full matrix — confirm this is the intended metric.
        'svd_explained_variance_top10': (sigma_svd[:10]**2).sum() / (sigma_svd**2).sum() * 100,
        'nmf_reconstruction_error': nmf_model.reconstruction_err_
    }
})

run.finish()