# Recommender system
## Books for mystery, thriller, and crime
https://sites.google.com/eng.ucsd.edu/ucsdbookgraph/home

In [2]:
import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

import pandas as pd
import numpy as np

## Recommendations using LightFM model on subset of entire dataset

In [3]:
books_df = pd.read_csv('goodreads_books_mystery_thriller_crime.csv')
books_df.head(5)

Unnamed: 0,asin,authors,average_rating,book_id,country_code,description,edition_information,format,image_url,is_ebook,...,publication_year,publisher,ratings_count,series,similar_books,text_reviews_count,title,title_without_series,url,work_id
0,,"[{'author_id': '37778', 'role': ''}]",3.93,6066814,US,"London, 1196. At the command of Richard the Li...",,Hardcover,https://images.gr-assets.com/books/1328724803m...,False,...,2009.0,Simon & Schuster UK,186,['169353'],"['439108', '522621', '116770', '1275927', '620...",15,"Crowner Royal (Crowner John Mystery, #13)","Crowner Royal (Crowner John Mystery, #13)",https://www.goodreads.com/book/show/6066814-cr...,6243149
1,B01NCIKAQX,"[{'author_id': '242185', 'role': ''}]",4.33,33394837,US,,,,https://images.gr-assets.com/books/1493114742m...,True,...,,,269,['1052227'],[],60,The House of Memory (Pluto's Snitch #2),The House of Memory (Pluto's Snitch #2),https://www.goodreads.com/book/show/33394837-t...,54143148
2,B01ALOWJN0,"[{'author_id': '15104629', 'role': ''}, {'auth...",3.49,29074697,US,"BATHS, BANKS AND ROMAN INSURRECTION\nDetective...",,,https://s.gr-assets.com/assets/nophoto/book/11...,True,...,,Amazon Digital Services,192,['953679'],[],23,The Slaughtered Virgin of Zenopolis (Inspector...,The Slaughtered Virgin of Zenopolis (Inspector...,https://www.goodreads.com/book/show/29074697-t...,49305010
3,,"[{'author_id': '190988', 'role': ''}]",3.3,1902202,US,"Gerald breezily introduced his wife, Helen, to...",Large Print,Hardcover,https://s.gr-assets.com/assets/nophoto/book/11...,False,...,1975.0,Ulverscroft,52,['408775'],[],8,"Dead in the Morning (Patrick Grant, #1)","Dead in the Morning (Patrick Grant, #1)",https://www.goodreads.com/book/show/1902202.De...,1903897
4,,"[{'author_id': '337108', 'role': ''}, {'author...",3.54,9671977,US,"""I misteri di Eleusi"" e il quinto romanzo di A...",,Paperback,https://images.gr-assets.com/books/1474788304m...,False,...,2006.0,Sellerio,22,['274410'],[],3,Aristotele e i misteri di Eleusi,Aristotele e i misteri di Eleusi,https://www.goodreads.com/book/show/9671977-ar...,2152906


In [4]:
#Select desired metadata fields
books_metadata = books_df[['book_id', 'average_rating', 'is_ebook', 'num_pages', 
'publication_year', 'ratings_count', 'language_code']]

In [5]:
# replace blank cells with NaN
books_metadata.replace('', np.nan, inplace=True)

In [6]:
# replace NaN num_pages with -1 as it is numeric field
books_metadata['num_pages'].replace(np.nan, -1, inplace=True)
books_metadata['num_pages'] = pd.to_numeric(books_metadata['num_pages'])
books_metadata.describe()

Unnamed: 0,book_id,average_rating,num_pages,publication_year,ratings_count
count,219235.0,219235.0,219235.0,164741.0,219235.0
mean,15172140.0,3.831517,216.239341,2007.673445,523.9024
std,10572620.0,0.332023,237.765462,146.558403,9129.935
min,164.0,0.0,-1.0,2.0,0.0
25%,6339338.0,3.65,-1.0,2005.0,10.0
50%,15734380.0,3.85,240.0,2011.0,35.0
75%,23603790.0,4.04,352.0,2014.0,128.0
max,36517160.0,5.0,52015.0,20158.0,2046499.0


In [7]:
#Removing Outliers in num_pages column

# IQR
Q1 = np.percentile(books_metadata['num_pages'], 25, 
                   interpolation = 'midpoint') 
  
Q3 = np.percentile(books_metadata['num_pages'], 75,
                   interpolation = 'midpoint') 
IQR = Q3 - Q1 

# Upper bound
upper = np.where(books_metadata['num_pages'] >= (Q3+1.5*IQR))
# Lower bound
lower = np.where(books_metadata['num_pages'] <= (Q1-1.5*IQR))
  
''' Removing the Outliers '''
books_metadata.drop(upper[0], inplace = True)
books_metadata.drop(lower[0], inplace = True)

books_metadata.describe()

Unnamed: 0,book_id,average_rating,num_pages,publication_year,ratings_count
count,218702.0,218702.0,218702.0,164274.0,218702.0
mean,15179390.0,3.830714,213.201873,2007.571612,520.5538
std,10572640.0,0.331726,184.349848,139.888883,9105.118
min,164.0,0.0,-1.0,2.0,0.0
25%,6342954.0,3.65,-1.0,2005.0,10.0
50%,15737860.0,3.85,240.0,2011.0,35.0
75%,23609270.0,4.04,352.0,2014.0,128.0
max,36517160.0,5.0,881.0,20158.0,2046499.0


In [8]:
# Bin num_pages into discrete intervals 
# books_metadata['num_pages'] = pd.cut(books_metadata['num_pages'], bins=25)
books_metadata.describe()

Unnamed: 0,book_id,average_rating,num_pages,publication_year,ratings_count
count,218702.0,218702.0,218702.0,164274.0,218702.0
mean,15179390.0,3.830714,213.201873,2007.571612,520.5538
std,10572640.0,0.331726,184.349848,139.888883,9105.118
min,164.0,0.0,-1.0,2.0,0.0
25%,6342954.0,3.65,-1.0,2005.0,10.0
50%,15737860.0,3.85,240.0,2011.0,35.0
75%,23609270.0,4.04,352.0,2014.0,128.0
max,36517160.0,5.0,881.0,20158.0,2046499.0


In [9]:
#Replacing Invalid publication_year values with valid values in the range 1000 - 2021
books_metadata['publication_year'].mask(books_metadata['publication_year'] >= 2021, 2021, inplace=True)
books_metadata['publication_year'].mask(books_metadata['publication_year'] <= 1000, 1000, inplace=True)
#Replace missing values with year 2100
books_metadata['publication_year'].replace(np.nan, 2100, inplace=True)
books_metadata['publication_year'] = pd.to_numeric(books_metadata['publication_year'])

In [10]:
books_metadata.describe()

Unnamed: 0,book_id,average_rating,num_pages,publication_year,ratings_count
count,218702.0,218702.0,218702.0,218702.0,218702.0
mean,15179390.0,3.830714,213.201873,2030.084023,520.5538
std,10572640.0,0.331726,184.349848,45.137346,9105.118
min,164.0,0.0,-1.0,1000.0,0.0
25%,6342954.0,3.65,-1.0,2008.0,10.0
50%,15737860.0,3.85,240.0,2013.0,35.0
75%,23609270.0,4.04,352.0,2018.0,128.0
max,36517160.0,5.0,881.0,2100.0,2046499.0


In [11]:
# rounding ratings to nearest .5 score
books_metadata['average_rating'] = books_metadata['average_rating'].apply(lambda x: round(x*2)/2)
# using pandas qcut method to convert fields into quantile-based discrete intervals
books_metadata['ratings_count'] = pd.qcut(books_metadata['ratings_count'], 25)
# replacing missing language values to 'unknown'
books_metadata['language_code'].replace(np.nan, 'unknown', inplace=True)
# convert is_ebook column into 1/0 where true=1 and false=0
books_metadata['is_ebook'] = books_metadata.is_ebook.map(
    lambda x: 1.0*(x == 'true'))

In [12]:
books_metadata.head()

Unnamed: 0,book_id,average_rating,is_ebook,num_pages,publication_year,ratings_count,language_code
0,6066814,4.0,0.0,400.0,2009.0,"(182.0, 253.0]",unknown
1,33394837,4.5,0.0,318.0,2100.0,"(253.0, 381.0]",eng
2,29074697,3.5,0.0,-1.0,2100.0,"(182.0, 253.0]",eng
3,1902202,3.5,0.0,-1.0,1975.0,"(46.0, 55.0]",unknown
4,9671977,3.5,0.0,659.0,2006.0,"(18.0, 22.0]",ita


In [13]:
#Create Book ID to Title Mapping
item_dict ={}
df = books_df[['book_id', 'title']].sort_values('book_id').reset_index()
for i in range(df.shape[0]):
    item_dict[(df.loc[i,'book_id'])] = df.loc[i,'title']

In [14]:
# Convert Categorical columns to numerical
books_metadata_transformed = pd.get_dummies(books_metadata, columns = ['average_rating', 'is_ebook', 'num_pages','publication_year', 'ratings_count', 'language_code'])
books_metadata_transformed = books_metadata_transformed.sort_values('book_id').reset_index().drop('index', axis=1)
books_metadata_transformed.head(5)

Unnamed: 0,book_id,average_rating_0.0,average_rating_1.0,average_rating_1.5,average_rating_2.0,average_rating_2.5,average_rating_3.0,average_rating_3.5,average_rating_4.0,average_rating_4.5,...,language_code_tel,language_code_tgl,language_code_tha,language_code_tur,language_code_ukr,language_code_unknown,language_code_urd,language_code_vie,language_code_vls,language_code_zho
0,164,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,205,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,213,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,214,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,215,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0


In [28]:
from scipy.sparse import csr_matrix
# Convert to CSR matrix
books_metadata_csr = csr_matrix(books_metadata_transformed.drop('book_id', axis=1).values)

In [16]:
train_df = pd.read_csv('train.csv')
train_df.head()

Unnamed: 0,user_id,book_id,rating
0,eabd6e8c6dc181d8fb5bd65573e9b51b,25365,3
1,f6bf54164ddeba9eb812c024aff1902c,8467921,3
2,7322c5cb2d2dc8a289374e8ae5c4903a,29960804,2
3,8c4334a0524be2b2464200c8b9330b19,3663410,5
4,a0d3687116b658aba8e527550643ac9e,26025580,5


In [17]:
train_df.shape

(9876021, 3)

In [18]:
from lightfm import LightFM
model = LightFM(loss='warp',
                random_state=2016,
                learning_rate=0.90,
                no_components=150,
                user_alpha=0.000005)

In [18]:
#Tried Batch training but did not work work as expected
# BATCH_SIZE = 10
# train_size = train_df.shape[0]
# n_batch = int(train_df.shape[0]/BATCH_SIZE)
# for n in range(1,n_batch+1):
#         if n == n_batch + 1:
#             user_book_interaction = pd.pivot_table(train_df[n-1*BATCH_SIZE: train_size], index='user_id', columns='book_id', values='rating')
#         else:
#             user_book_interaction = pd.pivot_table(train_df[n-1*BATCH_SIZE: n*BATCH_SIZE], index='user_id', columns='book_id', values='rating')
#         user_book_interaction = user_book_interaction.fillna(0)
#         # convert to csr matrix
#         user_book_interaction_csr = csr_matrix(user_book_interaction)
#         model = model.fit_partial(user_book_interaction_csr,user_features=None,epochs=100,num_threads=16, verbose=False)
        
        

In [19]:
# This was the main bottleneck as generating pivot table is very compute heavy operation
user_book_interaction = pd.pivot_table(train_df[0:80000], index='user_id', columns='book_id', values='rating')
# fill missing values with 0
user_book_interaction = user_book_interaction.fillna(0)
user_id = list(user_book_interaction.index)
user_dict = {}
counter = 0 
for i in user_id:
    user_dict[i] = counter
    counter += 1

In [20]:
# convert to csr matrix
user_book_interaction_csr = csr_matrix(user_book_interaction)

In [21]:
model = model.fit(user_book_interaction_csr,user_features=None,epochs=100,num_threads=16, verbose=False)

In [22]:
def generate_recommendations(model, interactions_df, user_id, user_dict, 
                               item_dict,threshold = 0,nrec_items = 5):
    
    n_users, n_items = interactions_df.shape
    user_x = user_dict[user_id]
    scores = pd.Series(model.predict(user_x,np.arange(n_items), item_features=books_metadata_csr))
    scores.index = interactions_df.columns
    scores = list(pd.Series(scores.sort_values(ascending=False).index))
    known_items = list(pd.Series(interactions_df.loc[user_id,:] \
                                 [interactions_df.loc[user_id,:] > threshold].index).sort_values(ascending=False))
    
    scores = [x for x in scores if x not in known_items]
    return_score_list = scores[0:nrec_items]
    known_items = list(pd.Series(known_items).apply(lambda x: item_dict[x]))
    scores = list(pd.Series(return_score_list).apply(lambda x: item_dict[x]))
    print ("User: " + str(user_id))
    print("\n Recommended Items:")
    counter = 1
    for i in scores:
        print(str(counter) + '- ' + i)
        counter+=1

In [23]:
generate_recommendations(model, user_book_interaction, '8842281e1d1347389f2ab93d60773d4d', user_dict, item_dict)

User: 8842281e1d1347389f2ab93d60773d4d

 Recommended Items:
1- Murder by the Seaside
2- Something in the Blood (Honey Driver Mystery, #1)
3- Swimming Lessons
4- House Rules
5- The Lion's Game (John Corey, #2)
