In [19]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [20]:
# Load the book metadata and the user ratings.
# ISBN is pinned to str in both frames: it is the join key used later, and
# letting pandas infer the type chunk-by-chunk can turn all-digit ISBNs into
# integers (dropping leading zeros) while ISBNs ending in 'X' stay strings.
df_books = pd.read_csv(
    "BX-Books.csv",
    encoding="ISO-8859-1",
    sep=";",
    usecols=['ISBN', 'Book-Title', 'Book-Author'],
    dtype={'ISBN': str},
)
df_ratings = pd.read_csv(
    "BX-Book-Ratings.csv",
    encoding="ISO-8859-1",
    sep=";",
    usecols=['User-ID', 'ISBN', 'Book-Rating'],
    dtype={'ISBN': str},
)

In [21]:
# Keep only ratings from users with at least 200 ratings, on books with at least 100 ratings.
# Both counts are taken on the unfiltered frame, so the two conditions are independent
# and can be applied as a single combined mask.
user_ratings_count = df_ratings['User-ID'].value_counts()
book_ratings_count = df_ratings['ISBN'].value_counts()
df_ratings = df_ratings[
    df_ratings['User-ID'].isin(user_ratings_count[user_ratings_count.ge(200)].index)
    & df_ratings['ISBN'].isin(book_ratings_count[book_ratings_count.ge(100)].index)
]

In [22]:
# Preview the first 100 rows of the book metadata (ISBN, title, author).
df_books.head(100)

Unnamed: 0,ISBN,Book-Title,Book-Author
0,0195153448,Classical Mythology,Mark P. O. Morford
1,0002005018,Clara Callan,Richard Bruce Wright
2,0060973129,Decision in Normandy,Carlo D'Este
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,0393045218,The Mummies of Urumchi,E. J. W. Barber
...,...,...,...
95,0671867156,Pretend You Don't See Her,Mary Higgins Clark
96,0312252617,Fast Women,Jennifer Crusie
97,0312261594,Female Intelligence,Jane Heller
98,0316748641,Pasquale's Nose: Idle Days in an Italian Town,Michael Rips


In [24]:
# Sanity check: look up one title in df_books by exact (case-sensitive) match.
book_title = "PLEADING GUILTY"

# Select every row whose title equals the requested one.
found_book = df_books[df_books['Book-Title'].eq(book_title)]

# Show the matching row(s).
print(found_book)

         ISBN       Book-Title  Book-Author
7  0671870432  PLEADING GUILTY  Scott Turow


In [25]:
# Preview the first 10 rows of the ratings frame.
df_ratings.head(10)

Unnamed: 0,User-ID,ISBN,Book-Rating
1456,277427,002542730X,10
1469,277427,0060930535,0
1471,277427,0060934417,0
1474,277427,0061009059,9
1484,277427,0140067477,0
1489,277427,014029628X,0
1493,277427,014100018X,0
1495,277427,0142001740,0
1513,277427,0312966091,0
1514,277427,031298328X,0


In [26]:
# Item-based recommender: rank books by how similarly users rated them.
def get_recommendations_for_book(book_title, n_recommendations=5):
    """Return the books whose user ratings are closest to ``book_title``.

    Joins the module-level ``df_ratings`` and ``df_books`` frames on ISBN,
    pivots the result into a title-by-user rating matrix (missing cells are
    filled with 0), and ranks the other titles by cosine distance to the
    requested title using a brute-force nearest-neighbour search.

    Parameters
    ----------
    book_title : str
        Title to look up. It must match an entry of the pivot index exactly
        (the comparison is case-sensitive).
    n_recommendations : int, default 5
        Maximum number of neighbours to return. Fewer are returned when the
        rating matrix holds fewer other titles.

    Returns
    -------
    list or None
        ``[book_title, [(title, distance), ...]]`` with neighbours in the
        order returned by the nearest-neighbour search (closest first), or
        ``None`` after printing a message when the title is not present in
        the merged data.

    Raises
    ------
    ValueError
        If ``n_recommendations`` is less than 1.
    """
    if n_recommendations < 1:
        raise ValueError("n_recommendations must be at least 1")

    # Attach title/author to every rating row.
    df_merged = df_ratings.merge(df_books, on='ISBN')

    # One row per title, one column per user; unrated cells become 0.
    pivot_table = df_merged.pivot_table(index='Book-Title', columns='User-ID', values='Book-Rating', fill_value=0)

    # Bail out early when the title has no row in the rating matrix.
    if book_title not in pivot_table.index:
        print("The book '{}' does not exist in the dataset or has not been rated by any user.".format(book_title))
        return None

    # Sparse storage: most user/title cells are 0.
    book_matrix = csr_matrix(pivot_table.values)

    model = NearestNeighbors(metric='cosine', algorithm='brute')
    model.fit(book_matrix)

    book_index = pivot_table.index.get_loc(book_title)

    # Request one extra neighbour to leave room for the query itself, but never
    # more than the matrix has rows: kneighbors raises if n_neighbors exceeds
    # the number of fitted samples.
    n_neighbors = min(n_recommendations + 1, book_matrix.shape[0])
    distances, indices = model.kneighbors(book_matrix[book_index], n_neighbors=n_neighbors)

    # Drop the query by index instead of assuming it is the first hit: with
    # distance ties (identical rating vectors, all-zero rating rows) the query
    # is not guaranteed to come back in position 0.
    recommended_books = [
        (pivot_table.index[neighbor_idx], neighbor_dist)
        for neighbor_idx, neighbor_dist in zip(indices[0], distances[0])
        if neighbor_idx != book_index
    ][:n_recommendations]

    return [book_title, recommended_books]


In [28]:
# Run the recommender once for "Where the Heart Is (Oprah's Book Club (Paperback))"
# and print the raw result: [title, [(recommended title, distance), ...]].
books = get_recommendations_for_book("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
    """Check the recommender against a known-good result and print the verdict.

    Calls ``get_recommendations_for_book`` for "Where the Heart Is (Oprah's
    Book Club (Paperback))". The check passes when the echoed title matches,
    each of the first five recommended titles belongs to the expected set,
    and each distance is within 0.1 of the expected distance at the same
    position. A ``None`` result or fewer than five recommendations is
    reported as a failure rather than raising. Nothing is returned; the
    outcome is printed.
    """
    query_title = "Where the Heart Is (Oprah's Book Club (Paperback))"
    recommended_books = ["The Lovely Bones: A Novel", "I Know This Much Is True", "The Surgeon", "The Weight of Water", "I'll Be Seeing You"]
    recommended_books_dist = [0.7234864549790632, 0.7677075092617776, 0.7699410973804288, 0.7708583572697412, 0.8016210581447822]
    tolerance = 0.1  # maximum allowed absolute deviation per distance

    recommends = get_recommendations_for_book(query_title)

    # The recommender returns None for unknown titles; treat that, a wrong
    # echoed title, or a short neighbour list as a failed check.
    test_pass = (
        recommends is not None
        and recommends[0] == query_title
        and len(recommends[1]) >= len(recommended_books_dist)
    )

    if test_pass:
        # Titles are checked by membership, distances by position.
        for (title, distance), expected_distance in zip(recommends[1], recommended_books_dist):
            if title not in recommended_books:
                test_pass = False
            if abs(distance - expected_distance) >= tolerance:
                test_pass = False

    if test_pass:
        print("You passed the challenge! 🎉🎉🎉🎉🎉")
    else:
        print("You haven't passed yet. Keep trying!")

# Run the check; the outcome is reported via print.
test_book_recommendation()

["Where the Heart Is (Oprah's Book Club (Paperback))", [('The Lovely Bones: A Novel', 0.7234864549790632), ('I Know This Much Is True', 0.7677075092617776), ('The Surgeon', 0.7699410973804288), ('The Weight of Water', 0.7708583572697412), ("I'll Be Seeing You", 0.8016210581447822)]]
You passed the challenge! 🎉🎉🎉🎉🎉


In [29]:
# Get recommendations for another title, "The Da Vinci Code", and print the raw result.
another_book_recommendations = get_recommendations_for_book("The Da Vinci Code")
print(another_book_recommendations)


['The Da Vinci Code', [('Angels &amp; Demons', 0.7437565749596586), ('The Blue Nowhere : A Novel', 0.7782404621168986), ('Middlesex: A Novel', 0.7822372206681159), ('Blow Fly: A Scarpetta Novel', 0.8006157755597978), ('Timeline', 0.8066306454180383)]]


In [32]:
# Query "The Seven Husbands of Evelyn Hugo", a title that is not in the rating matrix:
# the function prints a not-found message and returns None, so None is printed.
another_book_recommendations = get_recommendations_for_book("The Seven Husbands of Evelyn Hugo")
print(another_book_recommendations)

The book 'The Seven Husbands of Evelyn Hugo' does not exist in the dataset or has not been rated by any user.
None


In [33]:
# Get recommendations for "The Catcher in the Rye" and print the raw result.
another_book_recommendations = get_recommendations_for_book("The Catcher in the Rye")
print(another_book_recommendations)

['The Catcher in the Rye', [('To Kill a Mockingbird', 0.7657838212105741), ('1984', 0.7659322659253938), ('Their Eyes Were Watching God: A Novel', 0.7733084130576724), ("ANGELA'S ASHES", 0.7756064613301898), ('Tis: A Memoir', 0.7877186528105327)]]
