In [1]:
import pandas as pd
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the books dataset
books_df = pd.read_csv('/content/drive/MyDrive/zumit/RECOMMENDER SYSTEM/Books.csv')
books_df = books_df.drop_duplicates()

In [3]:
# Load the ratings dataset
ratings_df = pd.read_csv('/content/drive/MyDrive/zumit/RECOMMENDER SYSTEM/Ratings.csv')


In [4]:
# User input of book ratings
user_input = [
    {'Book-Title': "Where You'll Find Me: And Other Stories", 'Book-Rating': 5},
    {'Book-Title': "The Kitchen God's Wife", 'Book-Rating': 0},
    {'Book-Title': "Nights Below Station Street", 'Book-Rating': 0},
    {'Book-Title': "The Witchfinder (Amos Walker Mystery Series)", 'Book-Rating': 6},
    {'Book-Title': "Jane Doe", 'Book-Rating': 5},
]

In [5]:
# Convert user input to DataFrame
inputBooks = pd.DataFrame(user_input)


In [6]:
# Filter out books from the input that are in the books dataset to get their ISBN
input_id = books_df[books_df['Book-Title'].isin(inputBooks['Book-Title'].tolist())]
inputBooks = pd.merge(input_id, inputBooks)

In [7]:
# Drop unnecessary columns from inputBooks
inputBooks = inputBooks.drop(['Year-Of-Publication', 'Publisher', 'Image-URL-S', 'Image-URL-M', 'Image-URL-L', 'Book-Author'], axis=1)


In [8]:
# Get users who have read the same books as the input
user_subset = ratings_df[ratings_df['ISBN'].isin(inputBooks['ISBN'].to_list())]


In [9]:
# Group the rows by 'ISBN'
user_subset_group = user_subset.groupby(['ISBN'])


In [10]:
# Sort the user subset group by the number of ratings each book has received
user_subset_group = sorted(user_subset_group, key=lambda x: len(x[1]), reverse=True)


In [11]:
# Select a smaller subset of users for Pearson Correlation calculation
user_subset_group = user_subset_group[0:70]

pearsonCorrelationDict = {} # Dictionary to store Pearson coefficients


In [12]:
# Calculate Pearson correlation for every user group in our subset
for name, group in user_subset_group:
    group = group.sort_values(by='ISBN')
    inputBooks = inputBooks.sort_values(by='ISBN')

    nRatings = len(group)
    temp_df = inputBooks[inputBooks['ISBN'].isin(group['ISBN'].tolist())]

    tempRatingList = temp_df['Book-Rating'].tolist()
    tempGroupList = group['Book-Rating'].tolist()

    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList), 2) / float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList), 2) / float(nRatings)
    Sxy = sum(i * j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList) * sum(tempGroupList) / float(nRatings)

    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy / sqrt(Sxx * Syy)
    else:
        pearsonCorrelationDict[name] = 0


In [13]:
# Convert the Pearson correlation dictionary to a DataFrame
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index', columns=['similarityIndex'])
pearsonDF['ISBN'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))


In [14]:
# Select top 30 similar users
topUsers = pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:30]


In [15]:
# Merge with ratings_df to get ratings of these top users
topUsersRating = topUsers.merge(ratings_df, left_on='ISBN', right_on='ISBN', how='inner')


In [16]:
# Multiply the similarity by the user's ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex'] * topUsersRating['Book-Rating']


In [17]:
# Group by ISBN and calculate the sum of similarity index and weighted ratings
tempTopUsersRating = topUsersRating.groupby('ISBN').sum()[['similarityIndex', 'weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex', 'sum_weightedRating']


In [18]:
# Create a recommendation DataFrame
recommendation_df = pd.DataFrame()
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating'] / tempTopUsersRating['sum_similarityIndex']
recommendation_df['ISBN'] = tempTopUsersRating.index


In [19]:
# Sort the recommendations by the weighted average recommendation score
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)


In [20]:
# Display the top 10 book recommendations
recommended_books = books_df.loc[books_df['ISBN'].isin(recommendation_df.head(10)['ISBN'].tolist())]
print(recommended_books)

Empty DataFrame
Columns: [ISBN, Book-Title, Book-Author, Year-Of-Publication, Publisher, Image-URL-S, Image-URL-M, Image-URL-L]
Index: []
