# What Can You Expect In this Notebook?

## Goals
1. Create a collaborative filtering feature
1. Make book clustering

In [1]:
# import libraries (you may add additional imports but you may not have to)
import logging
logging.captureWarnings(True)

import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)
from scipy.sparse import coo_matrix
from scipy.sparse import csr_matrix

#viz lib
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import seaborn as sns

#
from sklearn.metrics.pairwise import cosine_similarity

# #to scale the data using z-score
# from sklearn.preprocessing import StandardScaler

# #Importing PCA
# from sklearn.decomposition import PCA

#
# from surprise import Reader

#
import dill as pickle

In [2]:
df = pd.read_csv("data/clean_data.csv")

In [3]:
df.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,mod_title,isbn_index,user_id,book_rating,location,age
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,classical mythology,25028,2.0,0.0,"stockton, california, usa",18.0
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,clara callan,73,8.0,5.0,"timmins, ontario, canada",24.0
2,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,clara callan,73,11400.0,0.0,"ottawa, ontario, canada",49.0
3,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,clara callan,73,11676.0,8.0,"n/a, n/a, n/a",24.0
4,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,clara callan,73,41385.0,0.0,"sudbury, ontario, canada",24.0


In [5]:
#User-Book matrix
user_book_df = pd.pivot_table(
    data = df,
    index="isbn_index", 
    columns="user_id", 
    values="book_rating"
    ).sort_index()

# Some of the books have 0 rating, indicating that the books are liked by certain users but never rated. 
# We need to differentiate liked books and rated books.
# We add 1 to the rating, and fill null value with 0 (liked but unrated)
user_book_df += 1

user_book_df.fillna(0, inplace = True)

user_book_df.head()

ValueError: negative dimensions are not allowed

In [None]:
# convert the dataframe to sparse matrix
user_book_sparse = csr_matrix(user_book_df.values)

In [None]:
# calculate the cosine similarity between books according to ratings from users
similarity_scores = cosine_similarity(user_book_sparse)
similarity_scores.shape

(2161, 2161)

In [None]:
# create a mapping dictionary
sparse_to_df_map = {j:i for i,j in zip(range(user_book_sparse.shape[0]),user_book_df.index.unique())}

In [None]:
# make a function to recommend the books
def recommend(isbn_index):
    # index fetch
    index = np.where(user_book_df.index==isbn_index)[0][0]
    similar_items = sorted(list(enumerate(similarity_scores[index])),key=lambda x:x[1],reverse=True)[0:6]
    
    data = [user_book_df.index[i[0]] for i in similar_items]
    
    return data

In [None]:
df[df['isbn_index'].isin(recommend(806))][["isbn","book_title","book_author","year_of_publication"]].drop_duplicates()

Unnamed: 0,isbn,book_title,book_author,year_of_publication
27237,000649840X,Angelas Ashes,Frank Mccourt,0
53548,1844262553,Free,Paul Vincent,2003
83517,0330375253,Bridget Jones's Diary,Helen Fielding,2001
85894,0860074382,84 Charing Cross Road,Helene Hanff,0
92367,0684859734,Grave Secrets (Temperance Brennan Novel (Hardcover)),Kathy Reichs,2002
105076,0375758232,Paris to the Moon,Adam Gopnik,2001


## PICKLE RIICKKKK!!!

In [10]:
# make a function to recommend the books
def new_recommend(isbn_index):
    index = sparse_to_df_map[isbn_index]
    similar_items = sorted(list(enumerate(similarity_scores[index])),key=lambda x:x[1],reverse=True)[0:6]
    
    data = [list(sparse_to_df_map.keys())[list(sparse_to_df_map.values()).index(i[0])] for i in similar_items]
    
    return data

In [11]:
pickle.dump(similarity_scores, open('pickles/similarity_scores.pkl','wb'))
pickle.dump(sparse_to_df_map, open('pickles/sparse_to_df_map.pkl','wb'))
pickle.dump(new_recommend, open('pickles/recommender.pkl','wb'))

In [12]:
sim_score = pickle.load(open('pickles/similarity_scores.pkl', 'rb'))
sparse_to_df_map = pickle.load(open('pickles/sparse_to_df_map.pkl', 'rb'))
recommender = pickle.load(open('pickles/recommender.pkl', 'rb'))

In [13]:
df[df['isbn_index'].isin(recommender(806))][["isbn","book_title","book_author","year_of_publication"]].drop_duplicates()

Unnamed: 0,isbn,book_title,book_author,year_of_publication
27237,000649840X,Angelas Ashes,Frank Mccourt,0
53548,1844262553,Free,Paul Vincent,2003
83517,0330375253,Bridget Jones's Diary,Helen Fielding,2001
85894,0860074382,84 Charing Cross Road,Helene Hanff,0
92367,0684859734,Grave Secrets (Temperance Brennan Novel (Hardcover)),Kathy Reichs,2002
105076,0375758232,Paris to the Moon,Adam Gopnik,2001


In [14]:
import sys
sys.getsizeof(sparse_to_df_map)

73816