In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the five raw CSV exports into DataFrames, in the order listed.
books, ratings, book_tags, tags, to_read = [
    pd.read_csv(csv_path)
    for csv_path in (
        "books.csv",
        "ratings.csv",
        "book_tags.csv",
        "tags.csv",
        "to_read.csv",
    )
]

In [None]:
# Print a structural summary of every loaded table.
# DataFrame.info() writes its report directly to stdout and returns None, so it
# must not be wrapped in print(): doing so appends a stray "None" after each
# report. Call it on its own and emit the blank separator line separately.
for label, frame in [
    ("Books Data:", books),
    ("Ratings Data:", ratings),
    ("Book Tags Data:", book_tags),
    ("Tags Data:", tags),
    ("To Read Data:", to_read),
]:
    print(label)
    frame.info()
    print()


In [None]:
# Preview the first rows of the books table.
books.head()

In [None]:
# Preview the first rows of the ratings table.
ratings.head()

In [None]:
# Preview the first rows of the book-to-tag link table.
book_tags.head()

In [None]:
# Preview the first rows of the tag lookup table.
tags.head()

In [None]:
# Preview the first rows of the to-read table.
to_read.head()

In [None]:
# Report the number of null cells per column for each table.
print("Missing values:\n")
for heading, table in (
    ("Books:\n", books),
    ("Ratings:\n", ratings),
    ("Book Tags:\n", book_tags),
    ("Tags:\n", tags),
    ("To Read:\n", to_read),
):
    print(heading, table.isnull().sum(), "\n")

In [None]:
# Normalize ratings using Min-Max Scaling (0 to 1)
from sklearn.preprocessing import MinMaxScaler

# The scaler is fitted on the ratings present in this file, so the rescaling is
# relative to the smallest and largest rating actually observed.
scaler = MinMaxScaler()
# Double brackets select a one-column DataFrame (2-D input) instead of a Series.
# The new column is added to `ratings` in place and is carried along by every
# later cell that uses or saves `ratings`.
ratings['normalized_rating'] = scaler.fit_transform(ratings[['rating']])

# Optional: show a few samples
ratings[['user_id', 'book_id', 'rating', 'normalized_rating']].head()

In [None]:
# Null count per column, one table after another.
for table in (books, ratings, book_tags, tags, to_read):
    print(table.isnull().sum())

In [None]:
# Fill the gaps in the book metadata so later cells never see NaN in these columns.
books['isbn'] = books['isbn'].fillna('0000000000')

# A column holding NaN alongside numbers is parsed as float, and a plain
# astype(str) would then render every ISBN-13 with a trailing ".0"
# (e.g. '9780439023480.0'). Convert numeric values through int so the stored
# text is digits only; missing values get the all-zero placeholder.
if pd.api.types.is_numeric_dtype(books['isbn13']):
    books['isbn13'] = books['isbn13'].map(
        lambda value: '0000000000000' if pd.isna(value) else str(int(round(value)))
    )
else:
    # Already textual (for example when this cell is re-run): keep values as they are.
    books['isbn13'] = books['isbn13'].fillna('0000000000000').astype(str)

# Fall back to the display title when no original title is recorded.
books['original_title'] = books['original_title'].fillna(books['title'])
# Impute unknown publication years with the median of the known ones.
books['original_publication_year'] = books['original_publication_year'].fillna(
    books['original_publication_year'].median()
)
books['language_code'] = books['language_code'].fillna('unknown')

In [None]:
# Re-run the null counts to confirm the fills above took effect.
print("After Preprocessing:")
for table in (books, ratings, book_tags, tags, to_read):
    print(table.isnull().sum())

In [None]:
# Remove fully duplicated rows from every table, modifying each frame in place.
for table in (books, ratings, book_tags, tags, to_read):
    table.drop_duplicates(inplace=True)

In [None]:
# Keep only rows that refer to a book present in the catalogue.
ratings = ratings[ratings['book_id'].isin(books['book_id'])]
to_read = to_read[to_read['book_id'].isin(books['book_id'])]
# book_tags is keyed by goodreads_book_id, which the later merge in this notebook
# pairs with books['best_book_id'] rather than books['book_id']. Filtering against
# book_id would discard every tag row whose Goodreads id is not also a catalogue
# book_id, so filter on the same key the merge uses.
# NOTE(review): confirm the key choice against the dataset schema.
book_tags = book_tags[book_tags['goodreads_book_id'].isin(books['best_book_id'])]

In [None]:
# List the available book columns before choosing merge keys below.
print(books.columns)


In [None]:
# Attach the tag lookup columns to every (book, tag) row; rows with an unknown tag_id are kept.
tagged_books = book_tags.merge(tags, how="left", on="tag_id")

In [None]:
# Display the merged book/tag table (the notebook shows a truncated view of large frames).
tagged_books

In [None]:
# Tag rows with the tag lookup columns attached (left join keeps unmatched tag ids).
book_tags_merged = book_tags.merge(tags, how='left', on='tag_id')

# Enriched variant: the same tag join, followed by an inner join to the book
# metadata, pairing goodreads_book_id with the catalogue's best_book_id.
book_tags_full = pd.merge(
    pd.merge(book_tags, tags, on='tag_id', how='left'),
    books,
    left_on='goodreads_book_id',
    right_on='best_book_id',
    how='inner',
)

book_tags_full[['title', 'tag_name', 'count']].head()


In [None]:
# Collapse each book's tag names into one space-separated string.
# Missing tag names contribute an empty string to the join.
tags_per_book = book_tags_full.groupby('book_id')['tag_name'].apply(
    lambda names: ' '.join(names.fillna('').astype(str))
).reset_index()
tags_per_book.head()

In [None]:
# How often each rating value occurs
import matplotlib.pyplot as plt
import seaborn as sns

rating_ax = sns.countplot(data=ratings, x='rating')
rating_ax.set_title('Rating Distribution')
rating_ax.set_xlabel('Rating')
rating_ax.set_ylabel('Count')
plt.show()

In [None]:
# Step 1: count ratings per book and keep the ten most-rated ids
top_book_ids = ratings['book_id'].value_counts().head(10)
top_books_df = top_book_ids.reset_index()
top_books_df.columns = ['book_id', 'num_ratings']  # explicit names so the merge key is unambiguous

# Step 2: bring in titles and authors from the catalogue
top_books_merged = pd.merge(top_books_df, books, on='book_id')

# Step 3: horizontal bar chart, one bar per title
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
most_rated_ax = sns.barplot(data=top_books_merged, x='num_ratings', y='title', palette='plasma')
most_rated_ax.set_title('Top 10 Most Rated Books')
most_rated_ax.set_xlabel('Number of Ratings')
most_rated_ax.set_ylabel('Book Title')
plt.tight_layout()
plt.show()


In [None]:
# Step 1: per-book mean rating and per-book number of ratings
mean_ratings = ratings.groupby('book_id')['rating'].mean()
ratings_count = ratings['book_id'].value_counts()

# Step 2: among books with at least 100 ratings, keep the ten best means
popular_books = mean_ratings[ratings_count >= 100].sort_values(ascending=False).head(10)

# Steps 3-5: pull id/title/authors for those books, attach each book's mean
# rating, and order the rows from best to worst for plotting
is_popular = books['book_id'].isin(popular_books.index)
top_rated = books[is_popular][['book_id', 'title', 'authors']]
top_rated = top_rated.assign(
    average_rating=top_rated['book_id'].map(mean_ratings)
).sort_values('average_rating', ascending=False)

import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
best_rated_ax = sns.barplot(data=top_rated, x='average_rating', y='title', palette='viridis')

best_rated_ax.set_title('Top 10 Highest Rated Books (Min 100 Ratings)')
best_rated_ax.set_xlabel('Average Rating')
best_rated_ax.set_ylabel('Book Title')
plt.tight_layout()
plt.show()

In [None]:
# Ten author strings that appear on the most catalogue rows.
popular_authors = books['authors'].value_counts().head(10)
authors_ax = sns.barplot(x=popular_authors.values, y=popular_authors.index)
authors_ax.set_title('Most Frequent Authors in Dataset')
authors_ax.set_xlabel('Number of Books')
authors_ax.set_ylabel('Author')
# Write each count just to the right of its bar, vertically centred on the bar.
for i, value in enumerate(popular_authors.values):
    authors_ax.text(value + 1, i, str(value), va='center')
plt.show()

In [None]:
# Ten most common language codes in the catalogue.
lang_counts = books['language_code'].value_counts().head(10)
languages_ax = sns.barplot(y=lang_counts.values, x=lang_counts.index)
languages_ax.set_title('Top 10 Languages')
languages_ax.set_xlabel('Language Code')
languages_ax.set_ylabel('Number of Books')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlation between every float64/int64 column of the ratings table.
# The selection is by dtype only, so every column of those dtypes is included.
numeric_cols = ratings.select_dtypes(include=['float64', 'int64'])
corr = numeric_cols.corr()

plt.figure(figsize=(6,4))
heatmap_ax = sns.heatmap(corr, fmt=".2f", cmap='coolwarm', annot=True)
heatmap_ax.set_title("Correlation Heatmap (Numeric Features)")
plt.show()

In [None]:
# Popularity is the number of ratings a book received; quality is its mean rating.
ratings_by_book = ratings.groupby('book_id')
book_popularity = ratings_by_book.size().reset_index(name='rating_count')
book_avg_rating = ratings_by_book['rating'].mean().reset_index(name='avg_rating')

popularity_df = pd.merge(book_popularity, book_avg_rating, on='book_id')

plt.figure(figsize=(8,5))
popularity_ax = sns.scatterplot(x='rating_count', y='avg_rating', data=popularity_df, alpha=0.6)
popularity_ax.set_title("Book Popularity vs. Average Rating")
popularity_ax.set_xlabel("Number of Ratings")
popularity_ax.set_ylabel("Average Rating")
plt.show()

In [None]:
# Descriptive statistics over all ratings
print("Overall Rating Stats:")
print(ratings['rating'].describe())

# Count / mean / median / standard deviation of the ratings each book received
book_stats = (
    ratings.groupby('book_id')['rating']
    .agg(['count', 'mean', 'median', 'std'])
    .reset_index()
    .rename(columns={'count': 'num_ratings', 'mean': 'avg_rating', 'std': 'std_dev'})
)

print("\nSample Book Rating Stats:")
print(book_stats.head())

# The same four statistics over the ratings each user gave
user_stats = (
    ratings.groupby('user_id')['rating']
    .agg(['count', 'mean', 'median', 'std'])
    .reset_index()
    .rename(columns={'count': 'num_ratings', 'mean': 'avg_rating', 'std': 'std_dev'})
)

print("\nSample User Rating Stats:")
print(user_stats.head())

In [None]:
# Re-inspect the books table after the cleaning cells above.
books.head()

In [None]:
# Re-inspect the ratings table; it now carries the normalized_rating column.
ratings.head()

In [None]:
# Re-inspect the filtered book-to-tag link table.
book_tags.head()

In [None]:
# Re-inspect the tag lookup table.
tags.head()

In [None]:
# Re-inspect the filtered to-read table.
to_read.head()

In [None]:
# Show the first 50 rows of the tag table enriched with book metadata.
book_tags_full.head(50)

In [None]:
# Persist the processed frames as CSV (row index omitted) for the modelling cells below.
for frame, csv_path in (
    (books, 'cleaned_books.csv'),
    (ratings, 'cleaned_ratings.csv'),
    (book_tags_full, 'cleaned_book_tags.csv'),
    (to_read, 'cleaned_to_read.csv'),
    (tagged_books, 'tagged_books.csv'),
):
    frame.to_csv(csv_path, index=False)


In [None]:
import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split

# Load cleaned ratings
ratings = pd.read_csv("cleaned_ratings.csv")

# Reader (rating scale from 1–5)
reader = Reader(rating_scale=(1, 5))

# Use full dataset (but you can sample for user-based CF to avoid memory issues)
data = Dataset.load_from_df(ratings[['user_id', 'book_id', 'rating']], reader)

# Pin the shuffle so the 80/20 split, and every metric computed from it, is
# the same on each Restart & Run All. The seed matches the one the notebook
# already uses when sampling ratings.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
import pandas as pd
from surprise import Dataset, Reader, KNNBasic, accuracy
from surprise.model_selection import train_test_split

# Load cleaned ratings
ratings = pd.read_csv("cleaned_ratings.csv")

# Reader with rating scale
reader = Reader(rating_scale=(1, 5))

# ---------------------------
# ITEM-BASED COLLABORATIVE FILTERING
# ---------------------------
print("\n🔹 Item-Based CF (Full Dataset)")
data_item = Dataset.load_from_df(ratings[['user_id', 'book_id', 'rating']], reader)
# Seeded split so the reported RMSE/MAE are reproducible between runs.
trainset_item, testset_item = train_test_split(data_item, test_size=0.2, random_state=42)

sim_options_item = {'name': 'cosine', 'user_based': False}   # Item-based
item_cf = KNNBasic(sim_options=sim_options_item, verbose=True)
item_cf.fit(trainset_item)
predictions_item = item_cf.test(testset_item)

print("Item-Based CF Results:")
accuracy.rmse(predictions_item)
accuracy.mae(predictions_item)


# ---------------------------
# USER-BASED COLLABORATIVE FILTERING (on sample to avoid memory error)
# ---------------------------
print("\n🔹 User-Based CF (Sampled Dataset)")

# Take 10% of ratings for user-based (to reduce matrix size)
sampled_ratings = ratings.sample(frac=0.1, random_state=42)
data_user = Dataset.load_from_df(sampled_ratings[['user_id', 'book_id', 'rating']], reader)
# The sample above is already seeded; seed the split as well, otherwise the
# evaluation still changes from run to run.
trainset_user, testset_user = train_test_split(data_user, test_size=0.2, random_state=42)

sim_options_user = {'name': 'cosine', 'user_based': True}    # User-based
user_cf = KNNBasic(sim_options=sim_options_user, verbose=True)
user_cf.fit(trainset_user)
predictions_user = user_cf.test(testset_user)

print("User-Based CF Results (on sample):")
accuracy.rmse(predictions_user)
accuracy.mae(predictions_user)


In [None]:
from surprise import SVD, NMF

# Both factorisation models start from randomly initialised factors. Fixing
# random_state makes the fitted models, and therefore the printed RMSE/MAE,
# repeatable across kernel restarts.

# SVD
svd = SVD(random_state=42)
svd.fit(trainset)
predictions_svd = svd.test(testset)
print("SVD")
accuracy.rmse(predictions_svd)
accuracy.mae(predictions_svd)

# NMF
nmf = NMF(random_state=42)
nmf.fit(trainset)
predictions_nmf = nmf.test(testset)
print("NMF")
accuracy.rmse(predictions_nmf)
accuracy.mae(predictions_nmf)

In [None]:
from collections import defaultdict

# Function to get Top-N predictions
def get_top_n(predictions, n=10):
    """Group predictions by user and keep each user's n best-estimated items.

    Args:
        predictions: iterable of 5-field records
            (user id, item id, true rating, estimated rating, details).
        n: maximum number of items to keep per user.

    Returns:
        A defaultdict mapping each user id to a list of (item id, estimate)
        pairs, ordered by estimate from highest to lowest. Items with equal
        estimates keep the order in which they appeared in `predictions`.
    """
    per_user = defaultdict(list)
    for user, item, _actual, estimate, _details in predictions:
        per_user[user].append((item, estimate))

    # Replace every user's full list with its ranked, truncated version.
    for user in per_user:
        ranked = sorted(per_user[user], key=lambda pair: pair[1], reverse=True)
        per_user[user] = ranked[:n]
    return per_user

# Precision, Recall, F1 at K
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Compute mean Precision@k, Recall@k and F1@k over all users.

    For every user the predictions are ranked by estimated rating and the
    first k are treated as the recommendation list. An item is relevant when
    its true rating is at least `threshold`.

    Precision is the share of relevant items among the items actually
    recommended. A user with fewer than k predictions therefore is not
    penalised for the missing slots, which matches how the content-based
    evaluator in this notebook divides by the number of recommended books.

    Args:
        predictions: iterable of 5-field records
            (user id, item id, true rating, estimated rating, details).
        k: length of the recommendation list; values <= 0 yield an empty list.
        threshold: minimum true rating for an item to count as relevant.

    Returns:
        Tuple (precision, recall, f1), each averaged over users. Returns
        (0.0, 0.0, 0.0) when `predictions` is empty instead of dividing by zero.
    """
    user_est_true = defaultdict(list)
    for uid, _iid, true_r, est, _details in predictions:
        user_est_true[uid].append((est, true_r))

    # Nothing to evaluate: avoid ZeroDivisionError in the averages below.
    if not user_est_true:
        return 0.0, 0.0, 0.0

    precisions, recalls, f1s = [], [], []

    for user_ratings in user_est_true.values():
        # Highest estimate first; ties keep their input order.
        ranked = sorted(user_ratings, key=lambda pair: pair[0], reverse=True)
        top_k = ranked[:k] if k > 0 else []

        n_rel = sum(true_r >= threshold for _, true_r in ranked)         # all relevant
        n_rel_and_rec = sum(true_r >= threshold for _, true_r in top_k)  # relevant in top-k

        # Divide by the number of items really recommended, not by k.
        prec = n_rel_and_rec / len(top_k) if top_k else 0
        rec = n_rel_and_rec / n_rel if n_rel else 0
        f1 = (2 * prec * rec / (prec + rec)) if (prec + rec) > 0 else 0

        precisions.append(prec)
        recalls.append(rec)
        f1s.append(f1)

    n_users = len(precisions)
    return (sum(precisions) / n_users,
            sum(recalls) / n_users,
            sum(f1s) / n_users)

# Evaluate all models
# Each entry pairs a display label with the prediction list produced by the
# corresponding model cell above, so those cells must have been run first.
for name, preds in [
    ("User CF", predictions_user),
    ("Item CF", predictions_item),
    ("SVD", predictions_svd),
    ("NMF", predictions_nmf)
]:
    # Same cut-off (k=10) and relevance threshold (3.5) for every model.
    p, r, f1 = precision_recall_at_k(preds, k=10, threshold=3.5)
    print(f"{name} -> Precision@10: {p:.4f}, Recall@10: {r:.4f}, F1@10: {f1:.4f}")


In [None]:
# Week 3: Content-Based Recommendation System
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import re

# Load your cleaned data
# These are the files written by the save cell earlier in the notebook; reading
# them back replaces the in-memory frames of the same names.
books = pd.read_csv("cleaned_books.csv")
ratings = pd.read_csv("cleaned_ratings.csv")
book_tags_full = pd.read_csv("cleaned_book_tags.csv")
tagged_books = pd.read_csv("tagged_books.csv")

# ========================
# 1. CONTENT-BASED RECOMMENDATION SYSTEM
# ========================

def preprocess_text(text):
    """Normalise a value into lowercase text made of ASCII letters, digits and whitespace.

    Missing values (None / NaN) become an empty string. Anything else is
    converted with str(), lowercased, and every character that is not an ASCII
    letter, a digit or whitespace is deleted (not replaced by a space).
    """
    if pd.isna(text):
        return ""
    lowered = str(text).lower()
    return re.sub(r'[^a-zA-Z0-9\s]', '', lowered)

# Create content features for books
def create_content_features(books_df, book_tags_df):
    """Build one text document per book from its title, authors and tags.

    Args:
        books_df: book table with 'book_id', 'title' and 'authors' columns.
            It is copied, not modified.
        book_tags_df: tag table with 'book_id' and 'tag_name' columns, one row
            per (book, tag).

    Returns:
        A copy of `books_df` left-joined with the per-book tag string, with the
        extra columns 'clean_title', 'clean_authors', 'tag_name', 'clean_tags'
        and 'combined_features' (cleaned title, authors and tags separated by
        single spaces). Books without tags get an empty tag string.
    """
    # One space-separated string of tag names per book_id.
    tags_by_book = (
        book_tags_df
        .groupby('book_id')['tag_name']
        .apply(lambda names: ' '.join(names.fillna('').astype(str)))
        .reset_index()
    )

    featured = books_df.copy()
    for source_col, clean_col in (('title', 'clean_title'), ('authors', 'clean_authors')):
        featured[clean_col] = featured[source_col].apply(preprocess_text)

    # Left join keeps every book, including those that have no tag rows.
    featured = featured.merge(tags_by_book, how='left', on='book_id')
    featured['tag_name'] = featured['tag_name'].fillna('')
    featured['clean_tags'] = featured['tag_name'].apply(preprocess_text)

    title_and_authors = featured['clean_title'] + ' ' + featured['clean_authors']
    featured['combined_features'] = title_and_authors + ' ' + featured['clean_tags']

    return featured

# Create content features
# book_tags_full is used as the tag source because it carries both the
# catalogue book_id and the tag_name that create_content_features groups on.
content_books = create_content_features(books, book_tags_full)
print("Content features created!")
print(f"Sample combined features:\n{content_books[['title', 'combined_features']].head()}")

# ========================
# 2. TF-IDF VECTORIZATION
# ========================

# Create TF-IDF matrix
tfidf = TfidfVectorizer(
    max_features=5000,      # Limit features to avoid memory issues
    stop_words='english',   # Remove common English words
    ngram_range=(1, 2),     # Use unigrams and bigrams
    min_df=2,               # Ignore terms appearing in less than 2 documents
    max_df=0.8              # Ignore terms appearing in more than 80% of documents
)

# Fit TF-IDF on combined features
# Row i of the resulting matrix corresponds to row i of content_books; the
# recommendation functions below depend on that positional alignment.
tfidf_matrix = tfidf.fit_transform(content_books['combined_features'])
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

# ========================
# 3. CONTENT-BASED RECOMMENDATION FUNCTIONS
# ========================

def get_content_recommendations(book_id, books_df, tfidf_matrix, top_n=10):
    """Return the books whose text features are most similar to a given book.

    Args:
        book_id: value looked up in `books_df['book_id']`.
        books_df: book table with 'book_id', 'title' and 'authors' columns.
            Its rows must be in the same order as the rows of `tfidf_matrix`.
        tfidf_matrix: feature matrix with one row per row of `books_df`.
        top_n: maximum number of recommendations to return.

    Returns:
        DataFrame with columns 'book_id', 'title', 'authors' and
        'similarity_score', ordered from most to least similar. The query book
        is never included. If `book_id` is not present, a message is printed and
        an empty DataFrame is returned.
    """
    # Locate the book by row POSITION, not by index label: the matrix is
    # positional, so a non-default DataFrame index must not shift the lookup.
    matches = np.flatnonzero((books_df['book_id'] == book_id).to_numpy())
    if matches.size == 0:
        print(f"Book ID {book_id} not found in dataset")
        return pd.DataFrame()
    book_pos = int(matches[0])

    # Cosine similarity between this book and every book (itself included).
    cosine_sim = cosine_similarity(tfidf_matrix[book_pos:book_pos + 1], tfidf_matrix).flatten()

    # Rank from most to least similar; a stable sort keeps ties in row order.
    ranked = np.argsort(-cosine_sim, kind='stable')

    # Drop the query book explicitly. Skipping "the first result" is wrong when
    # another book has identical features and ties with it at the top.
    ranked = ranked[ranked != book_pos][:max(top_n, 0)]

    recommendations = books_df.iloc[ranked][['book_id', 'title', 'authors']].copy()
    recommendations['similarity_score'] = cosine_sim[ranked]

    return recommendations

def get_user_content_recommendations(user_id, ratings_df, books_df, tfidf_matrix, top_n=10):
    """Recommend unseen books that resemble the books a user rated 4 or higher.

    Up to five of the user's liked books (the first five in the order they
    appear in `ratings_df`) are used as seeds. Each seed contributes its 20
    most similar books; books the user already rated are removed, and a book
    suggested by several seeds gets the mean of its similarity scores.

    Args:
        user_id: value looked up in `ratings_df['user_id']`.
        ratings_df: ratings table with 'user_id', 'book_id' and 'rating' columns.
        books_df: book table forwarded to get_content_recommendations.
        tfidf_matrix: feature matrix forwarded to get_content_recommendations.
        top_n: maximum number of recommendations to return.

    Returns:
        DataFrame with 'book_id', 'title', 'authors' and 'similarity_score',
        sorted by score descending. Empty when the user has no rating >= 4
        (a message is printed) or when no seed produced recommendations.
    """
    user_rows = ratings_df[ratings_df['user_id'] == user_id]
    liked_ids = user_rows.loc[user_rows['rating'] >= 4, 'book_id'].tolist()

    if len(liked_ids) == 0:
        print(f"No high-rated books found for user {user_id}")
        return pd.DataFrame()

    # Neighbours of each seed book; seeds that yield nothing are skipped.
    per_seed = (
        get_content_recommendations(seed_id, books_df, tfidf_matrix, top_n=20)
        for seed_id in liked_ids[:5]
    )
    neighbour_frames = [frame for frame in per_seed if not frame.empty]

    if len(neighbour_frames) == 0:
        return pd.DataFrame()

    candidates = pd.concat(neighbour_frames, ignore_index=True)

    # Never recommend something the user has rated, whatever the rating was.
    already_rated = user_rows['book_id'].tolist()
    unseen = candidates[~candidates['book_id'].isin(already_rated)]

    # One row per book, scored by its average similarity across seeds.
    averaged = (
        unseen
        .groupby(['book_id', 'title', 'authors'])['similarity_score']
        .mean()
        .reset_index()
    )

    return averaged.sort_values('similarity_score', ascending=False).head(top_n)

# ========================
# 4. TEST CONTENT-BASED RECOMMENDATIONS
# ========================

# Test book-to-book recommendations
# Uses the first book in content_books as the query.
sample_book_id = content_books['book_id'].iloc[0]
print(f"\n📚 Content-based recommendations for book ID {sample_book_id}:")
print(f"Book: {content_books[content_books['book_id'] == sample_book_id]['title'].iloc[0]}")

# content_books (not books) is passed so row positions line up with tfidf_matrix.
book_recs = get_content_recommendations(sample_book_id, content_books, tfidf_matrix, top_n=5)
print(book_recs)

# Test user-based content recommendations
# Uses the user of the first ratings row as the sample user.
sample_user = ratings['user_id'].iloc[0]
print(f"\n👤 Content-based recommendations for user {sample_user}:")

user_recs = get_user_content_recommendations(sample_user, ratings, content_books, tfidf_matrix, top_n=5)
print(user_recs)

# ========================
# 5. EVALUATION METRICS FOR CONTENT-BASED SYSTEM
# ========================

def evaluate_content_based_system(ratings_df, books_df, tfidf_matrix, sample_users=100,
                                  holdout_frac=0.5, random_state=None):
    """Estimate Precision@10 of the content-based recommender with a per-user hold-out.

    The recommender never returns a book that appears in the ratings it is
    given, so scoring its output against those same ratings can only yield 0.
    Instead, for each sampled user a share of the books they rated >= 4 is
    hidden, recommendations are generated from the remaining ratings, and a
    recommendation counts as a hit when it is one of the hidden liked books.

    Args:
        ratings_df: ratings table with 'user_id', 'book_id' and 'rating' columns.
        books_df: book table aligned row-for-row with `tfidf_matrix`.
        tfidf_matrix: feature matrix passed through to the recommender.
        sample_users: maximum number of distinct users to evaluate.
        holdout_frac: share of each user's liked books to hide. At least one
            book is hidden and at least one is always left as a seed.
        random_state: seed for user sampling and hold-out selection; None
            draws fresh entropy on every call.

    Returns:
        Mean precision over the users that could be evaluated (0 if none).
        Users with fewer than two liked books, or for whom no recommendation
        could be produced, are skipped.
    """
    rng = np.random.default_rng(random_state)

    # Sample random users for evaluation
    unique_users = ratings_df['user_id'].unique()
    sample_user_ids = rng.choice(unique_users, min(sample_users, len(unique_users)), replace=False)

    total_precision = 0.0
    successful_recs = 0

    for user_id in sample_user_ids:
        user_rows = ratings_df[ratings_df['user_id'] == user_id]
        liked = user_rows.loc[user_rows['rating'] >= 4, 'book_id'].unique()

        if len(liked) < 2:  # Need one liked book to seed and one to test against
            continue

        # Hide part of the liked books; clamp so both sides stay non-empty.
        n_holdout = int(round(len(liked) * holdout_frac))
        n_holdout = max(1, min(n_holdout, len(liked) - 1))
        held_out = set(rng.choice(liked, n_holdout, replace=False).tolist())

        # The recommender only sees this user's remaining ratings.
        train_rows = user_rows[~user_rows['book_id'].isin(list(held_out))]
        recs = get_user_content_recommendations(user_id, train_rows, books_df, tfidf_matrix, top_n=10)

        if recs.empty:
            continue

        # Precision: share of recommended books that are hidden liked books.
        recommended_books = recs['book_id'].tolist()
        relevant_recs = len(set(recommended_books) & held_out)

        total_precision += relevant_recs / len(recommended_books)
        successful_recs += 1

    avg_precision = total_precision / successful_recs if successful_recs > 0 else 0

    print(f"Content-Based System Evaluation:")
    print(f"Average Precision@10: {avg_precision:.4f}")
    print(f"Successfully evaluated users: {successful_recs}/{len(sample_user_ids)}")

    return avg_precision

# Evaluate the content-based system
# Evaluates on at most 50 sampled users; content_books is passed so that row
# positions match tfidf_matrix.
content_precision = evaluate_content_based_system(ratings, content_books, tfidf_matrix, sample_users=50)