# What Can You Expect In this Notebook?

## Goals
1. Create a collaborative filtering feature
1. Make book clustering

In [1]:
# import libraries (you may add additional imports but you may not have to)
import logging
logging.captureWarnings(True)

import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)
from scipy.sparse import coo_matrix
from scipy.sparse import csr_matrix

#viz lib
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import seaborn as sns

#
from sklearn.metrics.pairwise import cosine_similarity

#to scale the data using z-score
from sklearn.preprocessing import StandardScaler

#Importing PCA
from sklearn.decomposition import PCA

#
from surprise import Reader

In [2]:
df = pd.read_csv("data/clean_data.csv")

In [3]:
df["isbn_index"] = df["isbn"].astype("category").cat.codes

df.head()

Unnamed: 0.1,Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,mod_title,user_id,book_rating,location,age,isbn_index
0,0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,classical mythology,2.0,0.0,"stockton, california, usa",18.0,24927
1,1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,clara callan,8.0,5.0,"timmins, ontario, canada",24.0,73
2,2,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,clara callan,11400.0,0.0,"ottawa, ontario, canada",49.0,73
3,3,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,clara callan,11676.0,8.0,"n/a, n/a, n/a",24.0,73
4,4,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,clara callan,41385.0,0.0,"sudbury, ontario, canada",24.0,73


## Advice from freeCodeCamp
https://www.freecodecamp.org/learn/machine-learning-with-python/machine-learning-with-python-projects/book-recommendation-engine-using-knn<br>
If you graph the dataset (optional), you will notice that most books are not rated frequently. To ensure statistical significance, remove from the dataset users with less than 200 ratings and books with less than 100 ratings.

In [4]:
# Find how many books have each users read
user_stat = pd.pivot_table(
    data = df,
    values = 'book_title',
    index = 'user_id',
    aggfunc= 'count'
).reset_index()

user_stat.head()

Unnamed: 0,user_id,book_title
0,2.0,1
1,8.0,17
2,9.0,3
3,10.0,1
4,12.0,1


In [5]:
# fig, ax = plt.subplots(figsize = (8,3))

# sns.kdeplot(
#     data = user_stat['book_title'],
#     # orient='h'
# )

In [6]:
# Find how many books have each users read
book_stat = pd.pivot_table(
    data = df,
    values = 'book_rating',
    index = 'isbn',
    aggfunc= 'count'
).reset_index().rename(columns = {"book_rating":"total_like"})

temp = pd.pivot_table(
    data = df[df['book_rating']!=0],
    values = 'book_rating',
    index = 'isbn',
    aggfunc= np.mean
).reset_index().rename(columns = {"book_rating":"average_rating"})

book_stat = book_stat.merge(temp, how = 'left', on = 'isbn').fillna(0)

book_stat.head()

Unnamed: 0,isbn,total_like,average_rating
0,0000913154,1,8.0
1,0001010565,2,0.0
2,0001046438,1,9.0
3,0001046713,1,0.0
4,000104687X,1,6.0


In [7]:
# fig, ax = plt.subplots(figsize = (8,3))

# sns.kdeplot(
#     data = book_stat,
#     # orient='h'
# )

In [9]:
# filter the records according to the advice from freecodecamp (or not)
book_like = 50
user_rate = 200

# books that will be used in the collaborative filtering
used_book = book_stat[book_stat['total_like']>=book_like]['isbn']
# books that will be used in the collaborative filtering
used_user = user_stat[user_stat['book_title']>=user_rate]['user_id'].astype('int')


In [11]:
#User-Book matrix
user_book_df = pd.pivot_table(
    data = df[(df['isbn'].isin(used_book)) & (df['user_id'].isin(used_user))],
    index="isbn_index", 
    columns="user_id", 
    values="book_rating"
    )

# Some of the books have 0 rating, indicating that the books are liked by certain users but never rated. 
# We need to differentiate liked books and rated books.
# We add 1 to the rating, and fill null value with 0 (liked but unrated)
user_book_df += 1

user_book_df.fillna(0, inplace = True)

user_book_df.head()

user_id,254.0,2276.0,2766.0,2977.0,3363.0,4017.0,4385.0,6251.0,6323.0,6543.0,...,271705.0,273979.0,274004.0,274061.0,274301.0,274308.0,275970.0,277427.0,277639.0,278418.0
isbn_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
803,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1759,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,0.0


In [13]:
# convert the dataframe to sparse matrix
user_book_sparse = csr_matrix(user_book_df.values)

In [14]:
# calculate the cosine similarity between books according to ratings from users
similarity_scores = cosine_similarity(user_book_sparse)
similarity_scores.shape

(2161, 2161)

In [18]:
# make a function to recommend the books
def recommend(isbn):
    # index fetch
    index = np.where(user_book_df.index==isbn)[0][0]
    similar_items = sorted(list(enumerate(similarity_scores[index])),key=lambda x:x[1],reverse=True)[0:6]
    
    data = [user_book_df.index[i[0]] for i in similar_items]
    
    return data

In [21]:
df[df['isbn_index'].isin(recommend(803))][["isbn","book_title","book_author","year_of_publication"]].drop_duplicates()

Unnamed: 0,isbn,book_title,book_author,year_of_publication
61133,000649840X,Angelas Ashes,Frank Mccourt,0
122211,1844262553,Free,Paul Vincent,2003
210053,0860074382,84 Charing Cross Road,Helene Hanff,0
229560,0684859734,Grave Secrets (Temperance Brennan Novel (Hardcover)),Kathy Reichs,2002
250425,3423202327,MÃ?Â¶rder ohne Gesicht.,Henning Mankell,1999
272796,0375758232,Paris to the Moon,Adam Gopnik,2001
