In [23]:
import pickle
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import numpy
from sklearn import preprocessing

In [24]:
# Load Books data.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0;
# `on_bad_lines='skip'` is its replacement and drops malformed rows instead of raising.
books_df = pd.read_csv('data/Books.csv', on_bad_lines='skip', encoding="latin-1")
# Keep only the columns selected below and give them shorter names.
books_df = books_df[['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher', 'Image-URL-L']]
books_df = books_df.rename(columns={"Book-Title": "Title", "Book-Author": "Author", "Year-Of-Publication": "Year", "Image-URL-L": "img_url"})


In [25]:
# Load Users data.
# `on_bad_lines='skip'` replaces the `error_bad_lines=False` keyword, which was
# removed in pandas 2.0; malformed rows are dropped instead of raising.
users_df = pd.read_csv('data/Users.csv', on_bad_lines='skip', encoding="latin-1")

# Load Ratings data and normalise the column names used by the later cells.
ratings_df = pd.read_csv('data/Ratings.csv', on_bad_lines='skip', encoding="latin-1")
ratings_df = ratings_df.rename(columns={"User-ID": "user_id", "Book-Rating": "rating"})

In [26]:
# Keep only the ratings made by users who rated strictly more than 200 times.
# `user_counts` is a boolean Series indexed by user_id (True = above the threshold).
user_counts = ratings_df['user_id'].value_counts().gt(200)
selected_users = user_counts.loc[user_counts].index
is_selected_user = ratings_df['user_id'].isin(selected_users)
filtered_ratings_df = ratings_df.loc[is_selected_user]


In [27]:

# Attach book metadata to each remaining rating (inner join on ISBN, so ratings
# whose ISBN is absent from books_df are dropped).
merged_ratings_books_df = filtered_ratings_df.merge(
    right=books_df,
    how="inner",
    on="ISBN",
)



In [28]:
# Count how many ratings each title received; result has columns
# 'Title' and 'total_rating'.
total_ratings_df = (
    merged_ratings_books_df
    .groupby('Title')['rating']
    .count()
    .reset_index(name="total_rating")
)



In [29]:
# Add the per-title rating count to every rating row, then keep only
# titles that have at least 50 ratings in total.
final_df = (
    merged_ratings_books_df
    .merge(total_ratings_df, on='Title')
    .loc[lambda frame: frame['total_rating'] >= 50]
)

In [30]:
# A user may appear more than once for the same title; keep the first
# occurrence of each (user_id, Title) pair.
final_df = final_df.drop_duplicates(subset=['user_id', 'Title'], keep='first')



In [31]:
# Build the Title x user_id rating matrix; missing (title, user) cells become 0.
book_pivot = (
    final_df
    .pivot_table(index='Title', columns='user_id', values='rating')
    .fillna(0)
)
# Sparse (CSR) copy of the same matrix, used to fit the neighbour model.
book_sparse = csr_matrix(book_pivot)



In [32]:
# Fit a brute-force nearest-neighbour index where each sample is one title's
# row of user ratings. The bare `fit` expression is kept last so the cell
# still displays the fitted estimator.
model = NearestNeighbors(algorithm="brute")
model.fit(X=book_sparse)



NearestNeighbors(algorithm='brute', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [33]:
# Persist the artifacts produced above. Each file is opened in a `with` block
# so the handle is flushed and closed even if pickling raises; the original
# bare `open(...)` calls never closed their handles.
with open('pickle/book_names.pkl', 'wb') as book_names_file:
    pickle.dump(book_pivot.index, book_names_file)
with open('pickle/final_rating.pkl', 'wb') as final_rating_file:
    pickle.dump(final_df, final_rating_file)
with open('pickle/book_pivot.pkl', 'wb') as book_pivot_file:
    pickle.dump(book_pivot, book_pivot_file)
with open('pickle/model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)



In [34]:
def recommend_book1(book, n_neighbors=6):
    """Return the titles whose rating rows are nearest to that of `book`.

    Looks `book` up in the index of the module-level `book_pivot`, queries the
    module-level fitted `model` with that title's row of ratings, and maps the
    neighbour positions returned by `kneighbors` back to titles.

    Parameters
    ----------
    book : label
        A title exactly as it appears in `book_pivot.index`.
    n_neighbors : int, default 6
        Number of neighbours to request from `model.kneighbors`. The queried
        row is itself part of the data the model was fitted on, so the queried
        title normally appears in the result as well.

    Returns
    -------
    pandas.Index
        The titles at the neighbour positions, in the order `kneighbors`
        returned them.

    Raises
    ------
    IndexError
        If `book` is not present in `book_pivot.index`.
    """
    matches = np.where(book_pivot.index == book)[0]
    if matches.size == 0:
        # Same exception type the unguarded `[0][0]` lookup produced, but with
        # a message that names the missing title.
        raise IndexError(f"Book {book!r} not found in book_pivot index")
    book_index = matches[0]

    # kneighbors expects a 2-D array: one query row, one column per user.
    query_row = book_pivot.iloc[book_index, :].values.reshape(1, -1)
    _, recommendation = model.kneighbors(query_row, n_neighbors=n_neighbors)

    recommended_books = book_pivot.index[recommendation.flatten()]
    return recommended_books

In [35]:
# Example query: show the nearest neighbours of one title.
book_name = "Harry Potter and the Chamber of Secrets (Book 2)"
recommend_book1(book=book_name)

Index(['Harry Potter and the Chamber of Secrets (Book 2)',
       'Harry Potter and the Prisoner of Azkaban (Book 3)',
       'Harry Potter and the Goblet of Fire (Book 4)',
       'Harry Potter and the Sorcerer's Stone (Book 1)', 'Exclusive',
       'The Cradle Will Fall'],
      dtype='object', name='Title')

Index(['Harry Potter and the Chamber of Secrets (Book 2)',
       'Harry Potter and the Prisoner of Azkaban (Book 3)',
       'Harry Potter and the Goblet of Fire (Book 4)',
       'Harry Potter and the Sorcerer's Stone (Book 1)', 'Exclusive',
       'The Cradle Will Fall'],
      dtype='object', name='Title')

  books_df = pd.read_csv('data/Books.csv', on_bad_lines='skip', encoding="latin-1")
