In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from scipy.stats import pearsonr    
import warnings
# NOTE(review): this silences every warning category for the whole notebook,
# including deprecation and syntax warnings; narrow the filter if those matter.
warnings.filterwarnings("ignore")

# Show every column of wide frames, but cap the number of printed rows so that
# displaying a large DataFrame cannot flood the notebook output.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

**Recommender system (RecSys) diagram**

![Classic recommender system diagram](/Users/avikumart/Documents/GitHub/Book-recommendation-system/data/classicRec.png)

In [None]:
# Read Books.csv into the `books` DataFrame, decoding the file as latin-1.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# resolves where that directory exists.
books = pd.read_csv(
    '/Users/avikumart/Documents/GitHub/Book-recommendation-system/data/Books.csv',
    encoding='latin-1',
    index_col=False,
)

In [None]:
# preview the first five rows of the books frame
books.head(n=5)

In [None]:
# Print the books frame's metadata summary (column names, non-null counts,
# dtypes and memory usage) to check what was loaded.
books.info()

In [None]:
# Read Ratings.csv into the `ratings` DataFrame, decoding the file as latin-1.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# resolves where that directory exists.
ratings = pd.read_csv(
    '/Users/avikumart/Documents/GitHub/Book-recommendation-system/data/Ratings.csv',
    encoding='latin-1',
    index_col=False,
)

In [None]:
# preview the first five rows of the ratings frame
ratings.head(n=5)

In [None]:
# Print the ratings frame's metadata summary (column names, non-null counts,
# dtypes and memory usage).
ratings.info()

In [None]:
# Drop every row that has a missing value in any column. `dropna` returns a
# new frame by default, so no `inplace` argument is needed.
books = books.dropna()
# Preview only the first rows instead of rendering the whole frame into the
# cell output.
books.head()

In [None]:
# Drop the image-URL and publication-year columns.
# `errors='ignore'` keeps the cell re-runnable: once the columns are gone, a
# second execution is a no-op instead of raising KeyError.
books = books.drop(
    columns=['Image-URL-S', 'Image-URL-M', 'Image-URL-L', 'Year-Of-Publication'],
    errors='ignore',
)
books.head()

In [None]:
# Normalise titles: collapse each run of non-word characters and underscores
# into a single space, then trim leading/trailing whitespace.
# The pattern is a raw string so that `\W` reaches the regex engine intact; in
# a plain string literal it is an invalid escape sequence.
books['Book-Title'] = books['Book-Title'].apply(
    lambda title: re.sub(r"[\W_]+", " ", title).strip()
)

# Inner-join the books with their ratings on the shared ISBN column.
book_ratings = pd.merge(books, ratings, on='ISBN', how='inner')
book_ratings.head()

In [None]:
# Print the merged frame's metadata summary (column names, non-null counts,
# dtypes and memory usage) to verify the result of the merge.
book_ratings.info()

In [None]:
# Switch the merged frame to snake_case column names, then report its
# (rows, columns) shape.
book_ratings = book_ratings.rename(
    columns={
        'ISBN': 'isbn',
        'Book-Title': 'book_title',
        'Book-Author': 'book_author',
        'Publisher': 'publisher',
        'User-ID': 'user_id',
        'Book-Rating': 'book_rating',
    }
)
book_ratings.shape

In [None]:
# number of rating rows recorded for each ISBN, most frequent first
data = book_ratings.isbn.value_counts()

In [None]:
# Keep only books that have more than `ratings_threshold` rating rows
# (i.e. at least two with the current value of 1).
ratings_threshold = 1
# Count rows per ISBN inside this cell so the filter does not depend on a
# variable computed in another cell.
isbn_counts = book_ratings['isbn'].value_counts()
frequent_isbns = isbn_counts[isbn_counts > ratings_threshold].index
book_ratings = book_ratings[book_ratings['isbn'].isin(frequent_isbns)]
# NOTE(review): only books are filtered here; users with few ratings are kept.
# Add a user-level filter if one is intended.
book_ratings.shape

In [None]:
# preview the first five rows of the filtered ratings frame
book_ratings.head(n=5)

In [None]:
# Count the distinct users and distinct books (by ISBN) in the ratings frame.
n_users = book_ratings['user_id'].nunique()
n_books = book_ratings['isbn'].nunique()
print(f'Total unique users: {n_users}')
print(f'Total unique books: {n_books}')

# Ten titles with the largest number of rating rows. This ranks by how often a
# title was rated, not by how high its ratings are, so the chart is labelled
# accordingly.
top_books = (
    book_ratings.groupby('book_title')['book_rating']
    .count()
    .sort_values(ascending=False)
    .head(10)
)
fig, ax = plt.subplots(figsize=(10, 6))
top_books.plot(kind='barh', color='skyblue', ax=ax)
# barh draws the first row at the bottom; flip the axis so the most-rated
# title appears at the top.
ax.invert_yaxis()
ax.set_xlabel('Number of Ratings')
ax.set_ylabel('Book Title')
ax.set_title('Top 10 Most-Rated Books')
plt.show()


In [None]:
# Sparsity: one minus the ratio of rating rows to the number of cells in a
# full user x book grid.
sparsity = 1.0 - len(book_ratings.index) / (n_users * n_books)
print(f'Data Sparsity: {sparsity:.4f}')

In [None]:
# count the distinct authors and publishers in the ratings frame
n_authors = book_ratings['book_author'].nunique()
print(f'Total unique authors: {n_authors}')
n_publishers = book_ratings['publisher'].nunique()
print(f'Total unique publishers: {n_publishers}')

In [None]:
# Write the ratings frame (author and publisher columns still present) to CSV
# without the index column.
# NOTE(review): the destination is an absolute, machine-specific path.
book_ratings.to_csv(
    '/Users/avikumart/Documents/GitHub/Book-recommendation-system/data/book_ratings.csv',
    index=False,
)

In [None]:
# Drop the author and publisher columns, which the collaborative-filtering
# step does not use.
# `errors='ignore'` keeps the cell re-runnable: once the columns are gone, a
# second execution is a no-op instead of raising KeyError.
book_ratings = book_ratings.drop(
    columns=['book_author', 'publisher'],
    errors='ignore',
)
book_ratings.head()

In [None]:
# Write the current `book_ratings` frame to CSV without the index column.
# NOTE(review): the destination is an absolute, machine-specific path.
book_ratings.to_csv(
    '/Users/avikumart/Documents/GitHub/Book-recommendation-system/data/cleaned_book_ratings.csv',
    index=False,
)