In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
books = pd.read_csv("Books.csv", dtype={"Column_Name": str}, low_memory=False)
ratings = pd.read_csv("Ratings.csv")
books_data = books.merge(ratings, on="ISBN")

In [5]:
# Preprocessing
df = books_data.copy()
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)
df.drop(columns=["ISBN", "Year-Of-Publication", "Image-URL-S", "Image-URL-M"], axis=1, inplace=True)
df.drop(index=df[df["Book-Rating"] == 0].index, inplace=True)
df["Book-Title"] = df["Book-Title"].apply(lambda x: re.sub(r"[^\w\s]+", " ", x).strip())


In [6]:
# Function to get popular books
def popular_books(df, n=100):
    rating_count = df.groupby("Book-Title").count()["Book-Rating"].reset_index()
    rating_count.rename(columns={"Book-Rating": "NumberOfVotes"}, inplace=True)

    rating_average = df.groupby("Book-Title")["Book-Rating"].mean().reset_index()
    rating_average.rename(columns={"Book-Rating": "AverageRatings"}, inplace=True)

    popularBooks = rating_count.merge(rating_average, on="Book-Title")

    def weighted_rate(x):
        v = x["NumberOfVotes"]
        R = x["AverageRatings"]
        return ((v * R) + (m * C)) / (v + m)

    C = popularBooks["AverageRatings"].mean()
    m = popularBooks["NumberOfVotes"].quantile(0.90)

    popularBooks = popularBooks[popularBooks["NumberOfVotes"] >= 250]
    popularBooks["Popularity"] = popularBooks.apply(weighted_rate, axis=1)
    popularBooks = popularBooks.sort_values(by="Popularity", ascending=False)
    return popularBooks[["Book-Title", "NumberOfVotes", "AverageRatings", "Popularity"]].reset_index(drop=True).head(n)


In [7]:
# Display top popular books
top_ten = popular_books(df, 5)
print("📘 MOST POPULAR 5 BOOKS")
print(top_ten)

📘 MOST POPULAR 5 BOOKS
                                          Book-Title  NumberOfVotes  \
0   Harry Potter and the Prisoner of Azkaban  Book 3            277   
1                              To Kill a Mockingbird            267   
2  Harry Potter and the Sorcerer s Stone  Harry P...            315   
3    Harry Potter and the Chamber of Secrets  Book 2            326   
4  Tuesdays with Morrie  An Old Man  a Young Man ...            250   

   AverageRatings  Popularity  
0        9.043321    9.016359  
1        8.977528    8.950784  
2        8.936508    8.914416  
3        8.840491    8.820584  
4        8.588000    8.567111  


In [8]:
# Select a book for recommendations
book_names = df["Book-Title"].value_counts().index[:200]
print("\nChoose the book you're reading for advice:")
for i, name in enumerate(book_names):
    print(f"{i + 1}: {name}")


Choose the book you're reading for advice:
1: The Lovely Bones  A Novel
2: Wild Animus
3: The Da Vinci Code
4: The Secret Life of Bees
5: The Nanny Diaries  A Novel
6: The Red Tent  Bestselling Backlist
7: Bridget Jones s Diary
8: A Painted House
9: Life of Pi
10: Harry Potter and the Chamber of Secrets  Book 2
11: Divine Secrets of the Ya Ya Sisterhood  A Novel
12: Harry Potter and the Sorcerer s Stone  Harry Potter  Paperback
13: Angels  amp  Demons
14: The Summons
15: Where the Heart Is  Oprah s Book Club  Paperback
16: The Notebook
17: Girl with a Pearl Earring
18: Harry Potter and the Prisoner of Azkaban  Book 3
19: Snow Falling on Cedars
20: The Pilot s Wife   A Novel
21: To Kill a Mockingbird
22: The Catcher in the Rye
23: Timeline
24: The Testament
25: The Girls  Guide to Hunting and Fishing
26: Summer Sisters
27: Tuesdays with Morrie  An Old Man  a Young Man  and Life s Greatest Lesson
28: Harry Potter and the Goblet of Fire  Book 4
29: Good in Bed
30: The Five People You Mee

In [9]:
def content_based(bookTitle):
    bookTitle = str(bookTitle)
    img_list_content = []

    if bookTitle in df["Book-Title"].values:
        rating_count = pd.DataFrame(df["Book-Title"].value_counts())
        rare_books = rating_count[rating_count["Book-Title"] <= 200].index
        common_books = df[~df["Book-Title"].isin(rare_books)]

        if bookTitle in rare_books:
            print("No Recommendations for this Book ☹️")
        else:
            common_books = common_books.drop_duplicates(subset=["Book-Title"])
            common_books.reset_index(inplace=True)
            common_books["index"] = [i for i in range(common_books.shape[0])]
            targets = ["Book-Title", "Book-Author", "Publisher"]
            common_books["all_features"] = [" ".join(common_books[targets].iloc[i,].values) for i in range(common_books[targets].shape[0])]
            vectorizer = CountVectorizer()
            common_booksVector = vectorizer.fit_transform(common_books["all_features"])
            similarity = cosine_similarity(common_booksVector)
            index = common_books[common_books["Book-Title"] == bookTitle]["index"].values[0]
            similar_books = list(enumerate(similarity[index]))
            similar_booksSorted = sorted(similar_books, key=lambda x: x[1], reverse=True)[1:6]
            books = [common_books[common_books["index"] == similar_booksSorted[i][0]]["Book-Title"].item() for i in range(len(similar_booksSorted))]

            print("\n📕 OTHER USERS' SELECTIONS")
            for book in books:
                img_url = common_books.loc[common_books["Book-Title"] == book, "Image-URL-L"][:1].values[0]
                img_list_content.append(img_url)
                print(book, img_url)

content_based(books)