# What Can You Expect In this Notebook?

## Goals
1. Build a collaborative-filtering recommendation feature
1. Cluster the books

In [1]:
# import libraries (you may add additional imports but you may not have to)
import logging
logging.captureWarnings(True)

import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)
from scipy.sparse import coo_matrix
from scipy.sparse import csr_matrix

#viz lib
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import seaborn as sns

#
from sklearn.metrics.pairwise import cosine_similarity

# #to scale the data using z-score
# from sklearn.preprocessing import StandardScaler

# #Importing PCA
# from sklearn.decomposition import PCA

#
# from surprise import Reader

#
import dill as pickle

In [2]:
# Load the cleaned ratings data.
# ISBNs are identifiers, not numbers: they can start with zeros or end in "X"
# (e.g. "000649840X"). Reading the column as str stops pandas from inferring a
# numeric type for part of the column and dropping leading zeros.
df = pd.read_csv("data/clean_data.csv", dtype={"isbn": str})

In [3]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,mod_title,isbn_index,user_id,book_rating,location,age
0,440234743,The Testament,John Grisham,1999,Dell,the testament,87548,277478.0,0.0,"schiedam, zuid-holland, netherlands",31.0
1,440234743,The Testament,John Grisham,1999,Dell,the testament,87548,278144.0,0.0,"storm lake, iowa, usa",48.0
2,440234743,The Testament,John Grisham,1999,Dell,the testament,87548,243.0,0.0,"arden hills, minnesota, usa",24.0
3,440234743,The Testament,John Grisham,1999,Dell,the testament,87548,2977.0,0.0,"richland, washington, usa",25.0
4,440234743,The Testament,John Grisham,1999,Dell,the testament,87548,3363.0,0.0,"knoxville, tennessee, usa",29.0


In [4]:
# Book-by-user rating matrix: one row per isbn_index, one column per user_id.
# pivot_table averages the ratings when a book/user pair occurs more than once.
#
# The source data uses a rating of 0 for books a user liked but never rated,
# so 0 cannot also stand for "this user has no record for this book".
# Shifting every recorded rating up by one frees 0 for that purpose:
#   0   -> no record for this book/user pair (was NaN after the pivot)
#   1   -> liked but unrated (original rating 0)
#   >=2 -> explicit rating, stored as rating + 1
user_book_df = (
    pd.pivot_table(
        df,
        values="book_rating",
        index="isbn_index",
        columns="user_id",
    )
    .sort_index()
    .add(1)
    .fillna(0)
)

user_book_df.head()

user_id,243.0,254.0,507.0,638.0,643.0,741.0,882.0,929.0,1025.0,1211.0,...,277928.0,277965.0,278026.0,278137.0,278144.0,278188.0,278418.0,278582.0,278633.0,278843.0
isbn_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
806,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1336,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1472,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1769,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Re-pack the dense book-by-user table as a SciPy CSR matrix; row order is the
# same as user_book_df's index order.
user_book_sparse = csr_matrix(user_book_df.to_numpy())

In [6]:
# calculate the cosine similarity between books according to ratings from users
# Each row of user_book_sparse is one book, so the result is a square
# book-by-book matrix; the trailing expression displays its shape as a check.
similarity_scores = cosine_similarity(user_book_sparse)
similarity_scores.shape

(2161, 2161)

In [7]:
# Lookup table from an isbn_index label to its row position in the
# sparse/similarity matrices (positions follow user_book_df's index order).
sparse_to_df_map = {
    isbn_label: row_position
    for row_position, isbn_label in enumerate(user_book_df.index.unique())
}

In [8]:
# make a function to recommend the books
def recommend(isbn_index):
    """Return the ``isbn_index`` labels of the six most similar books.

    Finds the row of ``user_book_df`` labelled ``isbn_index``, ranks every
    book by its score in that row of ``similarity_scores`` (highest first,
    ties kept in row order) and returns the labels of the first six. The
    queried book is ranked together with all the others, so it can appear in
    the result.

    Raises:
        IndexError: if ``isbn_index`` is not a label of ``user_book_df``.
    """
    row = np.where(user_book_df.index == isbn_index)[0][0]
    scores = similarity_scores[row]

    # Rank row positions by descending score; sorted() is stable, so equal
    # scores keep their original row order.
    ranked = sorted(range(len(scores)), key=lambda pos: scores[pos], reverse=True)

    return [user_book_df.index[pos] for pos in ranked[:6]]

In [9]:
# Show the catalogue details of the books recommended for isbn_index 806.
df.loc[
    df["isbn_index"].isin(recommend(806)),
    ["isbn", "book_title", "book_author", "year_of_publication"],
].drop_duplicates()

Unnamed: 0,isbn,book_title,book_author,year_of_publication
27237,000649840X,Angelas Ashes,Frank Mccourt,0
53548,1844262553,Free,Paul Vincent,2003
83517,0330375253,Bridget Jones's Diary,Helen Fielding,2001
85894,0860074382,84 Charing Cross Road,Helene Hanff,0
92367,0684859734,Grave Secrets (Temperance Brennan Novel (Hardcover)),Kathy Reichs,2002
105076,0375758232,Paris to the Moon,Adam Gopnik,2001


## Pickle the recommender artifacts (PICKLE RIICKKKK!!!)

In [10]:
# make a function to recommend the books
def new_recommend(isbn_index):
    """Return the ``isbn_index`` labels of the six most similar books.

    Unlike ``recommend``, this version needs only ``sparse_to_df_map`` and
    ``similarity_scores``, not the full ``user_book_df`` table.

    Args:
        isbn_index: a key of ``sparse_to_df_map``.

    Returns:
        A list of up to six ``isbn_index`` labels, ordered from the highest
        similarity score to the lowest (ties keep row order). The queried
        book is ranked with all the others, so it can appear in the result.

    Raises:
        KeyError: if ``isbn_index`` is not a key of ``sparse_to_df_map``.
    """
    row = sparse_to_df_map[isbn_index]

    # Invert the label -> position mapping once, instead of rebuilding the
    # key/value lists and scanning them linearly for every result.
    # setdefault keeps the first label seen for a position, which matches the
    # first-match behaviour of list.index().
    position_to_isbn = {}
    for label, position in sparse_to_df_map.items():
        position_to_isbn.setdefault(position, label)

    # sorted() is stable, so equal scores keep their original row order.
    ranked = sorted(
        enumerate(similarity_scores[row]),
        key=lambda pair: pair[1],
        reverse=True,
    )[:6]

    return [position_to_isbn[position] for position, _score in ranked]

In [11]:
# Persist the similarity matrix, the label -> row-position map and the
# recommender function. Each file is opened in a `with` block so its handle is
# flushed and closed before anything reads the file back.
with open('pickles/similarity_scores.pkl', 'wb') as fh:
    pickle.dump(similarity_scores, fh)
with open('pickles/sparse_to_df_map.pkl', 'wb') as fh:
    pickle.dump(sparse_to_df_map, fh)
with open('pickles/recommender.pkl', 'wb') as fh:
    pickle.dump(new_recommend, fh)

In [12]:
# Read the artefacts back to check that they round-trip.
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# come from a trusted source (here: the files written by the cell above).
with open('pickles/similarity_scores.pkl', 'rb') as fh:
    sim_score = pickle.load(fh)
with open('pickles/sparse_to_df_map.pkl', 'rb') as fh:
    sparse_to_df_map = pickle.load(fh)
with open('pickles/recommender.pkl', 'rb') as fh:
    recommender = pickle.load(fh)

In [13]:
# Same lookup as before, this time through the unpickled recommender, to
# confirm it returns the same books for isbn_index 806.
df.loc[
    df["isbn_index"].isin(recommender(806)),
    ["isbn", "book_title", "book_author", "year_of_publication"],
].drop_duplicates()

Unnamed: 0,isbn,book_title,book_author,year_of_publication
27237,000649840X,Angelas Ashes,Frank Mccourt,0
53548,1844262553,Free,Paul Vincent,2003
83517,0330375253,Bridget Jones's Diary,Helen Fielding,2001
85894,0860074382,84 Charing Cross Road,Helene Hanff,0
92367,0684859734,Grave Secrets (Temperance Brennan Novel (Hardcover)),Kathy Reichs,2002
105076,0375758232,Paris to the Moon,Adam Gopnik,2001


In [14]:
# NOTE(review): this import sits outside the notebook's imports cell; consider
# moving it there.
import sys
# sys.getsizeof is shallow: it reports the size in bytes of the dict object
# itself and does not include the keys and values the dict refers to.
sys.getsizeof(sparse_to_df_map)

73816