**Book Recommendation System
This notebook builds a book recommendation system using collaborative filtering and similarity-based techniques**

Step 1: Importing Required Libraries

In [1]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
import numpy as np
import ipywidgets as widgets
from IPython.display import display

Step 2: Loading the Dataset  

In [2]:
ratings_df = pd.read_csv(r"E:\ratings.csv")
books_df = pd.read_csv(r"E:\books.csv")

In [3]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['user_id', 'book_id', 'rating']], reader)

In [4]:
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)
model = SVD()
model.fit(trainset)
predictions = model.test(testset)

In [5]:
print("RMSE:", accuracy.rmse(predictions))
print("MAE:", accuracy.mae(predictions))

RMSE: 0.8429
RMSE: 0.8428999744254491
MAE:  0.6592
MAE: 0.6591772253076086


Step 3: Data Cleaning and Preprocessing  

In [6]:
user_id = 42

rated_books = ratings_df[ratings_df['user_id'] == user_id]['book_id'].tolist()

all_book_ids = books_df['book_id'].unique()

unrated_books = [bid for bid in all_book_ids if bid not in rated_books]

predictions = [model.predict(user_id, bid) for bid in unrated_books]

top_predictions = sorted(predictions, key=lambda x: x.est, reverse=True)[:10]


In [7]:
top_book_ids = [int(pred.iid) for pred in top_predictions]
recommended_books = books_df[books_df['book_id'].isin(top_book_ids)][['title', 'authors', 'average_rating']]
recommended_books

Unnamed: 0,title,authors,average_rating
144,Deception Point,Dan Brown,3.67
147,Girl with a Pearl Earring,Tracy Chevalier,3.85
232,Love in the Time of Cholera,"Gabriel García Márquez, Edith Grossman",3.89
662,A People's History of the United States,Howard Zinn,4.07
1298,Still Life with Woodpecker,Tom Robbins,4.04
1387,The Last Juror,John Grisham,3.85
3254,The Beautiful and Damned,F. Scott Fitzgerald,3.75
3909,Peter and the Shadow Thieves (Peter and the St...,"Dave Barry, Ridley Pearson, Greg Call",4.15
4871,Villa Incognito,Tom Robbins,3.65
6662,The Taste of Home Cookbook,"Janet Briggs, Beth Wittlinger",4.25


In [8]:
def recommend_books(user_id, model, books_df, ratings_df, top_n=10):
    rated_books = ratings_df[ratings_df['user_id'] == user_id]['book_id'].tolist()

    all_book_ids = books_df['book_id'].unique()

    unrated_books = [bid for bid in all_book_ids if bid not in rated_books]

    predictions = [model.predict(user_id, bid) for bid in unrated_books]

    top_predictions = sorted(predictions, key=lambda x: x.est, reverse=True)[:top_n]

    top_book_ids = [int(pred.iid) for pred in top_predictions]

    recommended_books = books_df[books_df['book_id'].isin(top_book_ids)][['title', 'authors', 'average_rating']]
    
    return recommended_books


In [9]:
recommended_books = recommend_books(42, model, books_df, ratings_df, top_n=10)
print(recommended_books)


                                                  title  \
144                                     Deception Point   
147                           Girl with a Pearl Earring   
232                         Love in the Time of Cholera   
662             A People's History of the United States   
1298                         Still Life with Woodpecker   
1387                                     The Last Juror   
3254                           The Beautiful and Damned   
3909  Peter and the Shadow Thieves (Peter and the St...   
4871                                    Villa Incognito   
6662                         The Taste of Home Cookbook   

                                     authors  average_rating  
144                                Dan Brown            3.67  
147                          Tracy Chevalier            3.85  
232   Gabriel García Márquez, Edith Grossman            3.89  
662                              Howard Zinn            4.07  
1298                             To

Step 4: Filtering Popular Books  

In [11]:
book_tags_df = pd.read_csv(r"E:\book_tags.csv")
tags_df = pd.read_csv(r"E:\tags.csv")

book_tags_merged = pd.merge(book_tags_df, tags_df, on='tag_id', how='left')

book_tags_grouped = book_tags_merged.groupby('goodreads_book_id')['tag_name'].apply(lambda x: ' '.join(x)).reset_index()

books_df = pd.merge(books_df, book_tags_grouped, left_on='book_id', right_on='goodreads_book_id', how='left')

books_df['metadata'] = books_df['title'] + ' ' + books_df['authors'] + ' ' + books_df['tag_name'].fillna('')


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(books_df['metadata'])

cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

indices = pd.Series(books_df.index, index=books_df['title'].str.lower())


In [13]:
# Print a list of book titles
print(books_df['title'].head(20))  # Print top 20 books to check for matching titles


0               The Hunger Games (The Hunger Games, #1)
1     Harry Potter and the Sorcerer's Stone (Harry P...
2                               Twilight (Twilight, #1)
3                                 To Kill a Mockingbird
4                                      The Great Gatsby
5                                The Fault in Our Stars
6                                            The Hobbit
7                                The Catcher in the Rye
8                 Angels & Demons  (Robert Langdon, #1)
9                                   Pride and Prejudice
10                                      The Kite Runner
11                            Divergent (Divergent, #1)
12                                                 1984
13                                          Animal Farm
14                            The Diary of a Young Girl
15     The Girl with the Dragon Tattoo (Millennium, #1)
16                 Catching Fire (The Hunger Games, #2)
17    Harry Potter and the Prisoner of Azkaban (

Step 5: Creating the Book-User Matrix  

In [14]:
def recommend_similar_books(book_title, cosine_sim=cosine_sim, top_n=10):
    # Normalize book titles to lowercase for case-insensitive matching
    book_title = book_title.lower()

    # Try to find exact match first
    if book_title in indices.index:
        idx = indices[book_title]
    else:
        # If exact match is not found, try partial matching
        matching_books = books_df[books_df['title'].str.contains(book_title, case=False, na=False)]
        
        if matching_books.empty:
            print(f"❌ '{book_title}' not found in the dataset. Try another title.")
            return None
        
        print(f"Found the following matching books:")
        print(matching_books[['title']].head(10))  # Show top 10 matches
        idx = indices[matching_books.iloc[0]['title'].lower()]
    
    # Get similarity scores for the book
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the books based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get top N book indices
    sim_scores = sim_scores[1:top_n+1]
    book_indices = [i[0] for i in sim_scores]
    
    return books_df['title'].iloc[book_indices]


In [15]:
recommended_books_content = recommend_similar_books("Harry Potter and the Sorcerer's Stone")
if recommended_books_content is not None:
    print(recommended_books_content)


Found the following matching books:
                                               title
1  Harry Potter and the Sorcerer's Stone (Harry P...
22      Harry Potter and the Chamber of Secrets (Harry...
26      Harry Potter and the Half-Blood Prince (Harry ...
23      Harry Potter and the Goblet of Fire (Harry Pot...
24      Harry Potter and the Deathly Hallows (Harry Po...
17      Harry Potter and the Prisoner of Azkaban (Harr...
20      Harry Potter and the Order of the Phoenix (Har...
6140    Harry Potter and the Order of the Phoenix (Har...
3752         Harry Potter Collection (Harry Potter, #1-6)
421              Harry Potter Boxset (Harry Potter, #1-7)
2100    The Harry Potter Collection 1-4 (Harry Potter,...
Name: title, dtype: object


In [16]:
from surprise import Dataset, Reader

# Define the format for the ratings DataFrame (userId, bookId, rating)
reader = Reader(rating_scale=(1, 10))  # Assuming ratings are in the range 1-10
data = Dataset.load_from_df(ratings_df[['user_id', 'book_id', 'rating']], reader)


In [17]:
from surprise.model_selection import train_test_split

# Split into train/test set (80% train, 20% test)
trainset, testset = train_test_split(data, test_size=0.2)


In [18]:
from surprise import SVD

# Initialize and train the model
model = SVD()
model.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1e21d3cf8f0>

Step 6: Building the Recommendation Model 

In [19]:
from surprise import accuracy

# Make predictions on the test set
predictions = model.test(testset)

# Calculate RMSE
rmse = accuracy.rmse(predictions)
print(f'RMSE: {rmse}')


RMSE: 0.8439
RMSE: 0.8438521411976474


In [21]:
def get_collab_recommendations(user_id, model, books_df, ratings_df, top_n=10):
    # Get all book IDs
    all_book_ids = books_df['book_id'].unique()
    
    # Get books the user has already rated
    rated_books = ratings_df[ratings_df['user_id'] == user_id]['book_id'].tolist()
    
    # Filter out rated books
    books_to_predict = [bid for bid in all_book_ids if bid not in rated_books]
    
    # Predict rating for each book
    predictions = [model.predict(user_id, book_id) for book_id in books_to_predict]
    
    # Sort by predicted rating
    top_predictions = sorted(predictions, key=lambda x: x.est, reverse=True)[:top_n]
    
    # Get book titles for top predictions
    top_books = [books_df[books_df['book_id'] == int(pred.iid)].title.values[0] for pred in top_predictions]
    
    return top_books


# Function to get content-based recommendations
def get_content_recommendations(book_title, books_df, indices, cosine_sim, top_n=10):
    idx = indices[book_title.lower()]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
    book_indices = [i[0] for i in sim_scores]
    return books_df['title'].iloc[book_indices].tolist()


# Combine both lists (Hybrid)
def hybrid_recommendations(user_id, favorite_book_title, model, books_df, ratings_df, indices, cosine_sim, top_n=10):
    collab_recs = get_collab_recommendations(user_id, model, books_df, ratings_df, top_n=top_n)
    content_recs = get_content_recommendations(favorite_book_title, books_df, indices, cosine_sim, top_n=top_n)
    
    # Combine and remove duplicates, prioritize overlap
    combined = list(dict.fromkeys(collab_recs + content_recs))  # Maintains order and removes duplicates
    return combined[:top_n]


Step 7: Making Book Recommendations  

In [22]:
user_id = 123  # Use a valid user ID from your ratings.csv
favorite_book = "The Hunger Games (The Hunger Games, #1)"

recommendations = hybrid_recommendations(user_id, favorite_book, model, books_df, ratings_df, indices, cosine_sim)
print("📚 Recommended Books:")
for book in recommendations:
    print("•", book)


📚 Recommended Books:
• Still Life with Woodpecker
• The Beautiful and Damned
• Amsterdam
• The Curious Incident of the Dog in the Night-Time
• Tears of the Giraffe (No. 1 Ladies' Detective Agency, #2)
• The Taste of Home Cookbook
• The Millionaire Next Door: The Surprising Secrets of Americas Wealthy
• Villa Incognito
• Jewel
• Peter and the Shadow Thieves (Peter and the Starcatchers, #2)


Summary

In [27]:
def get_popular_books(books_df, ratings_df, top_n=10):
    # Merge books with ratings
    merged = ratings_df.merge(books_df, on='book_id')
    popular_books = (
        merged.groupby('title')['rating']
        .agg(['count', 'mean'])
        .sort_values(by=['count', 'mean'], ascending=False)
        .head(top_n)
        .index.tolist()
    )
    return popular_books

def run_enhanced_interface():
    print("📚 Welcome to the Book Recommender!")
    print("You can skip User ID if you want content-based suggestions only.")
    print("Type 'exit' anytime to quit.\n")

    while True:
        user_input = input("Enter User ID (or press Enter to skip / type 'exit'): ").strip()
        if user_input.lower() == 'exit':
            print("👋 Exiting the recommender. Have a great day!")
            break

        user_id = None
        if user_input != '':
            try:
                user_id = int(user_input)
            except ValueError:
                print("❌ Invalid User ID. Please enter a numeric value or leave blank.")
                continue

        book_title = input("Enter a book title (or type 'exit'): ").strip()
        if book_title.lower() == 'exit':
            print("👋 Exiting the recommender. Have a great day!")
            break

        try:
            # Check if title exists in dataset
            if book_title.lower() not in indices:
                print("\n⚠️ Book title not found. Showing popular books instead:\n")
                popular = get_popular_books(books_df, ratings_df)
                for i, title in enumerate(popular, 1):
                    print(f"{i}. {title}")
                print("\n--------------------------------------------\n")
                continue

            if user_id is not None:
                recommendations = hybrid_recommendations(
                    user_id,
                    book_title,
                    model,
                    books_df,
                    ratings_df,
                    indices,
                    cosine_sim
                )
            else:
                recommendations = recommend_similar_books(book_title)

            print("\n✅ Top Recommendations:")
            for idx, book in enumerate(recommendations, 1):
                print(f"{idx}. {book}")
            print("\n--------------------------------------------\n")

        except Exception as e:
            print("❌ Error:", str(e))
            print("⚠️ Please try again with a valid book title.")
            print("\n--------------------------------------------\n")

# Run the interface
run_enhanced_interface()


📚 Welcome to the Book Recommender!
You can skip User ID if you want content-based suggestions only.
Type 'exit' anytime to quit.



Enter User ID (or press Enter to skip / type 'exit'):  
Enter a book title (or type 'exit'):  1984



✅ Top Recommendations:
1. Animal Farm
2. Brave New World
3. Animal Farm / 1984
4. Lord of the Flies
5. Keep the Aspidistra Flying
6. Fahrenheit 451
7. A Clockwork Orange
8. The Catcher in the Rye
9. One Flew Over the Cuckoo's Nest
10. The Invisible Man

--------------------------------------------



Enter User ID (or press Enter to skip / type 'exit'):  exit


👋 Exiting the recommender. Have a great day!


**Method 2**: Genre based Classification

In [28]:
# Merge tag data
tags = pd.read_csv(r"E:\tags.csv")
book_tags = pd.read_csv(r"E:\book_tags.csv")
book_tags = book_tags.merge(tags, on='tag_id')

# Clean tag counts
book_tags = book_tags[book_tags['count'] > 50]  # Optional threshold

# Aggregate tags per book
book_genres = book_tags.groupby('goodreads_book_id')['tag_name'].apply(list).reset_index()
book_genres.columns = ['goodreads_book_id', 'genres']

# Merge with books_df
books_with_genres = books_df.merge(book_genres, left_on='book_id', right_on='goodreads_book_id', how='left')


In [33]:
def filter_by_genre(recommendations, books_df, genre):
    filtered_books = []
    for title in recommendations:
        genres = books_df.loc[books_df['title'] == title, 'genres'].values
        if len(genres) > 0 and isinstance(genres[0], list):
            if any(genre.lower() in g.lower() for g in genres[0]):
                filtered_books.append(title)
    return filtered_books


In [34]:
genre_input = input("Enter a genre to filter by (or press Enter to skip): ").strip()

# After getting `recommendations`, apply genre filter
if genre_input:
    recommendations = filter_by_genre(recommendations, books_with_genres, genre_input)
    if not recommendations:
        print(f"⚠️ No recommendations found for genre: {genre_input}.")
    else:
        print(f"\n✅ Top Recommendations in '{genre_input}':")
        for i, book in enumerate(recommendations, 1):
            print(f"{i}. {book}")
else:
    print("\n✅ Top Recommendations:")
    for i, book in enumerate(recommendations, 1):
        print(f"{i}. {book}")


Enter a genre to filter by (or press Enter to skip):  fiction



✅ Top Recommendations in 'fiction':
1. Tears of the Giraffe (No. 1 Ladies' Detective Agency, #2)
