# Recommender system
## Books for mystery, thriller, and crime
Data source: UCSD Book Graph (Goodreads datasets) — https://sites.google.com/eng.ucsd.edu/ucsdbookgraph/home

In [21]:
import sys
import warnings

# Silence every warning category unless the interpreter was started with
# explicit -W options (in which case the user's choice is respected).
# NOTE(review): this blanket filter also hides pandas FutureWarning /
# DeprecationWarning messages raised by later cells.
if not sys.warnoptions:
    warnings.simplefilter("ignore")

import pandas as pd
import numpy as np

### Load Data

#### 1) General information for all books

In [2]:
# Load the book-id mapping table (columns: book_id_csv, book_id).
# Presumably maps the integer ids used by goodreads_interactions.csv to the
# original Goodreads book ids — confirm against the dataset documentation.
book_id_df = pd.read_csv('book_id_map.csv')
print(book_id_df.shape)
book_id_df.head(3)

(2360650, 2)


Unnamed: 0,book_id_csv,book_id
0,0,34684622
1,1,34536488
2,2,34017076


In [3]:
# Load the user-id mapping table (columns: user_id_csv, user_id).
# Presumably maps the integer ids used by goodreads_interactions.csv to the
# hashed user ids used in the genre-specific files — confirm.
user_id_df = pd.read_csv('user_id_map.csv')
print(user_id_df.shape)
user_id_df.head(3)

(876145, 2)


Unnamed: 0,user_id_csv,user_id
0,0,8842281e1d1347389f2ab93d60773d4d
1,1,72fb0d0087d28c832f15776b0d936598
2,2,ab2923b738ea3082f5f3efcbbfacb218


In [4]:
# Load the author table: one row per author_id with the author's name,
# average rating, ratings count and text-review count.
authors_df = pd.read_csv('goodreads_book_authors.csv')
print(authors_df.shape)
authors_df.head(3)

(829529, 5)


Unnamed: 0,author_id,average_rating,name,ratings_count,text_reviews_count
0,604031,3.98,Ronald J. Fields,49,7
1,626222,4.08,Anita Diamant,546796,28716
2,10333,3.92,Barbara Hambly,122118,5075


In [5]:
# Load the user-book interactions for all genres (integer user_id / book_id).
# NOTE(review): the recorded output reports roughly 228 million rows, so this
# read is slow and memory-hungry; nothing in the visible part of the notebook
# uses all_interactions_df afterwards — consider skipping or sub-sampling it.
all_interactions_df = pd.read_csv('goodreads_interactions.csv')
print(all_interactions_df.shape)
all_interactions_df.head(3)

(228170405, 5)


Unnamed: 0,user_id,book_id,is_read,rating,is_reviewed
0,0,948,1,5.0,0.0
1,0,947,1,5.0,1.0
2,0,946,1,5.0,0.0


#### 2) Specific information for mystery, thriller, and crime books

In [24]:
# Load the metadata for the mystery / thriller / crime books.
books_df = pd.read_csv('goodreads_books_mystery_thriller_crime.csv')

# Reduce the raw `authors` text to the id of the first listed author:
# keep the text before the first comma, then the part after the first colon,
# then the content between the first pair of single quotes.
# (Presumably the raw value is a stringified list of {'author_id': ..., 'role': ...}
# dicts — confirm against the CSV.) The extracted id stays a string.
books_df['authors'] = (
    books_df['authors']
    .str.split(pat=",").str[0]
    .str.split(pat=":").str[1]
    .str.split(pat="'").str[1]
)
books_df = books_df.rename(columns={'authors': 'author_id'})

# Strip the enclosing square brackets from the `similar_books` text.
# The pattern is a regular expression, so `regex=True` is passed explicitly:
# newer pandas versions treat the pattern as a literal string by default,
# which would silently leave the brackets in place. The raw string avoids the
# invalid "\[" escape sequence.
books_df['similar_books'] = books_df['similar_books'].str.replace(r"^\[|\]$", "", regex=True)

print(books_df.shape)
books_df.head(3)

(219235, 29)


Unnamed: 0,asin,author_id,average_rating,book_id,country_code,description,edition_information,format,image_url,is_ebook,...,publication_year,publisher,ratings_count,series,similar_books,text_reviews_count,title,title_without_series,url,work_id
0,,37778,3.93,6066814,US,"London, 1196. At the command of Richard the Li...",,Hardcover,https://images.gr-assets.com/books/1328724803m...,False,...,2009.0,Simon & Schuster UK,186,['169353'],"'439108', '522621', '116770', '1275927', '6202...",15,"Crowner Royal (Crowner John Mystery, #13)","Crowner Royal (Crowner John Mystery, #13)",https://www.goodreads.com/book/show/6066814-cr...,6243149
1,B01NCIKAQX,242185,4.33,33394837,US,,,,https://images.gr-assets.com/books/1493114742m...,True,...,,,269,['1052227'],,60,The House of Memory (Pluto's Snitch #2),The House of Memory (Pluto's Snitch #2),https://www.goodreads.com/book/show/33394837-t...,54143148
2,B01ALOWJN0,15104629,3.49,29074697,US,"BATHS, BANKS AND ROMAN INSURRECTION\nDetective...",,,https://s.gr-assets.com/assets/nophoto/book/11...,True,...,,Amazon Digital Services,192,['953679'],,23,The Slaughtered Virgin of Zenopolis (Inspector...,The Slaughtered Virgin of Zenopolis (Inspector...,https://www.goodreads.com/book/show/29074697-t...,49305010


In [7]:
# Load the genre-specific interactions: one row per (user_id, book_id) pair
# with rating, is_read flag and the date columns shown in the preview.
interactions_df = pd.read_csv('goodreads_interactions_mystery_thriller_crime.csv')
print(interactions_df.shape)
interactions_df.head(3)

(24799896, 10)


Unnamed: 0,book_id,date_added,date_updated,is_read,rating,read_at,review_id,review_text_incomplete,started_at,user_id
0,6392944,Mon Jul 24 02:48:17 -0700 2017,Sun Jul 30 09:28:03 -0700 2017,True,3,Tue Jul 25 00:00:00 -0700 2017,5e212a62bced17b4dbe41150e5bb9037,I haven't read a fun mystery book in a while a...,Mon Jul 24 00:00:00 -0700 2017,8842281e1d1347389f2ab93d60773d4d
1,2279538,Wed Mar 29 00:27:14 -0700 2017,Wed Mar 29 00:27:14 -0700 2017,False,0,,556e9e3f00fb5b7e4eaa116764b771b0,,,8842281e1d1347389f2ab93d60773d4d
2,20821043,Mon Mar 27 22:52:11 -0700 2017,Mon Mar 27 22:52:12 -0700 2017,False,0,,4b15f8a08f04a17f9a47f2b76fec21e5,,,8842281e1d1347389f2ab93d60773d4d


In [8]:
# Load the genre-specific reviews: full review text plus rating, vote and
# comment counts, and the date columns shown in the preview.
reviews_df = pd.read_csv('goodreads_reviews_mystery_thriller_crime.csv')
print(reviews_df.shape)
reviews_df.head(3)

(1849236, 11)


Unnamed: 0,book_id,date_added,date_updated,n_comments,n_votes,rating,read_at,review_id,review_text,started_at,user_id
0,6392944,Mon Jul 24 02:48:17 -0700 2017,Sun Jul 30 09:28:03 -0700 2017,0,6,3,Tue Jul 25 00:00:00 -0700 2017,5e212a62bced17b4dbe41150e5bb9037,I haven't read a fun mystery book in a while a...,Mon Jul 24 00:00:00 -0700 2017,8842281e1d1347389f2ab93d60773d4d
1,28684704,Tue Nov 15 11:29:22 -0800 2016,Mon Mar 20 23:40:27 -0700 2017,0,22,3,Sat Mar 18 23:22:42 -0700 2017,2ede853b14dc4583f96cf5d120af636f,"A fun, fast paced science fiction thriller. I ...",Fri Mar 17 23:45:40 -0700 2017,8842281e1d1347389f2ab93d60773d4d
2,32283133,Tue Nov 01 11:09:18 -0700 2016,Tue Nov 01 11:09:44 -0700 2016,0,9,0,,8e4d61801907e591018bdc3442a9cf2b,http://www.telegraph.co.uk/culture/10...,,8842281e1d1347389f2ab93d60773d4d


### Data Exploration

In [25]:
# Keep only the book columns used in the exploration below.
books = books_df[['book_id', 'author_id', 'average_rating', 'num_pages', 'description', 'similar_books', 'title']]
# Unseeded random preview: the displayed rows change on every run.
books.sample(5)

Unnamed: 0,book_id,author_id,average_rating,num_pages,description,similar_books,title
91273,12624023,26819,3.93,210.0,"Nore Roberts didn't ask for a new life, but no...","'497769', '464102', '514108', '93783', '137964...",Locked in Time
1362,12372717,3912,4.09,352.0,The defenestration of a ruthless theatre impre...,"'13531847', '122555', '8537082', '15715073', '...",Bryant & May and the Memory of Blood (Bryant &...
177059,1897049,43626,3.71,376.0,"Para o Dr. Jeremy Carter, psicologo no City Ce...","'760054', '848962', '296960'",The Conspiracy Club
206230,823200,55948,4.0,48.0,With her series of books that feature the spir...,"'710853', '166677', '1834583', '284368', '1525...","Blackwater Spirits (Glynis Tryon, #3)"
184194,529285,50818,3.5,200.0,Many people in America think that gang violenc...,,Gangbangers: Understanding the Deadly Minds of...


In [27]:
# Summary statistics for the numeric columns. book_id is an identifier, so its
# statistics carry no meaning; the recorded output shows num_pages ranging
# from 0 to 52015, which suggests outliers worth checking.
books.describe()

Unnamed: 0,book_id,average_rating,num_pages
count,219235.0,219235.0,149407.0
mean,15172140.0,3.831517,317.769984
std,10572620.0,0.332023,224.919711
min,164.0,0.0,0.0
25%,6339338.0,3.65,236.0
50%,15734380.0,3.85,313.0
75%,23603790.0,4.04,391.0
max,36517160.0,5.0,52015.0


In [28]:
# Column dtypes, non-null counts and memory footprint of the book subset.
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219235 entries, 0 to 219234
Data columns (total 7 columns):
book_id           219235 non-null int64
author_id         219235 non-null object
average_rating    219235 non-null float64
num_pages         149407 non-null float64
description       198488 non-null object
similar_books     219235 non-null object
title             219235 non-null object
dtypes: float64(2), int64(1), object(4)
memory usage: 11.7+ MB


In [29]:
# Missing values per column.
# NOTE(review): similar_books reports 0 missing although the previews show
# blank entries — those appear to be empty strings rather than NaN; confirm.
books.isnull().sum(axis=0)

book_id               0
author_id             0
average_rating        0
num_pages         69828
description       20747
similar_books         0
title                 0
dtype: int64

In [30]:
# Distinct books and distinct (first-listed) authors in the book subset.
print("Number of books: ", books['book_id'].nunique())
print("Number of authors: ", books['author_id'].nunique())

Number of books:  219235
Number of authors:  31761


In [53]:
# Keep only the interaction columns used in the exploration below.
# NOTE(review): the recorded outputs of this cell and of the info()/isnull()
# cells that follow list `started_at`, not `read_at`, so they were produced by
# an earlier version of this selection — re-run the cells to refresh them.
interactions = interactions_df[['book_id', 'user_id', 'rating', 'is_read', 'read_at']]
# Unseeded random preview: the displayed rows change on every run.
interactions.sample(5)

Unnamed: 0,book_id,user_id,rating,is_read,started_at
10230047,775346,142c34f9adb566019745273c0b1fc836,3,True,Fri Apr 08 05:47:47 -0700 2016
913094,25196498,a1d3f510b1c20c9f156f4f3d5da2ad69,0,False,
23203667,12140024,b886fc11f088920fea65c7be44208080,0,False,
11040132,3100718,02658f9d21744a7d9a8ccb370b78e25a,0,True,Mon Jul 15 14:22:48 -0700 2013
8499235,14889679,7dfab3ac4ac4b66bdd43532d628dd4d1,0,False,


In [54]:
# Summary statistics for the numeric columns (book_id is an identifier).
# In the preview, rating 0 appears on rows with is_read False, so 0 presumably
# means "not rated" rather than a real score — confirm before averaging.
interactions.describe()

Unnamed: 0,book_id,rating
count,24799900.0,24799900.0
mean,10753710.0,1.819287
std,10813140.0,2.029919
min,164.0,0.0
25%,224520.0,0.0
50%,7738919.0,0.0
75%,19155230.0,4.0
max,36496900.0,5.0


In [55]:
# Column dtypes and memory footprint of the interaction subset.
interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24799896 entries, 0 to 24799895
Data columns (total 5 columns):
book_id       int64
user_id       object
rating        int64
is_read       bool
started_at    object
dtypes: bool(1), int64(2), object(2)
memory usage: 780.5+ MB


In [56]:
# Missing values per column of the interaction subset.
interactions.isnull().sum(axis=0)

book_id              0
user_id              0
rating               0
is_read              0
started_at    20520521
dtype: int64

In [58]:
# Distinct books and distinct users that appear in the interactions.
print("Number of books: ", interactions['book_id'].nunique())
print("Number of users: ", interactions['user_id'].nunique())

Number of books:  219234
Number of users:  676075


In [45]:
# Keep only the review columns used in the exploration below.
reviews = reviews_df[['book_id', 'user_id', 'rating', 'read_at', 'review_text']]
# Unseeded random preview: the displayed rows change on every run.
reviews.sample(5)

Unnamed: 0,book_id,user_id,rating,read_at,review_text
1469301,102113,7e229fe18a9766316c78d2a8565189da,2,Sat Jul 22 10:31:02 -0700 2017,Decent book. It started off strong. It became ...
889201,32437,f64fb926050c4b6ffe96cebaeba8b58d,2,Thu May 24 22:06:48 -0700 2012,"I have been a Koontz fan for many, many years ..."
1153378,206196,43f0c04fe19f4b6864ef685fa0c69503,4,Mon Apr 29 00:00:00 -0700 2013,The writing and story improved as I read. I en...
412587,16101917,d8529650760407c9c48ee3190bc1cca1,4,Fri May 16 19:57:31 -0700 2014,"I highly recommend this book, Sarah buys the h..."
746852,28815474,4b114ef2d8f57c9d169446b5ffdf6b9d,3,Fri Aug 05 00:00:00 -0700 2016,My Summary: When Anne and Marco Conti decide t...


In [46]:
# Summary statistics for the numeric columns (book_id is an identifier).
# The recorded minimum rating is 0, which presumably marks reviews submitted
# without a star rating — confirm.
reviews.describe()

Unnamed: 0,book_id,rating
count,1849236.0,1849236.0
mean,13353710.0,3.683981
std,10630120.0,1.208838
min,164.0,0.0
25%,1914973.0,3.0
50%,13094200.0,4.0
75%,22557270.0,5.0
max,36496900.0,5.0


In [47]:
# Column dtypes and memory footprint of the review subset.
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1849236 entries, 0 to 1849235
Data columns (total 5 columns):
book_id        int64
user_id        object
rating         int64
read_at        object
review_text    object
dtypes: int64(2), object(3)
memory usage: 70.5+ MB


In [48]:
# Missing values per column of the review subset.
reviews.isnull().sum(axis=0)

book_id             0
user_id             0
rating              0
read_at        281802
review_text       426
dtype: int64

In [59]:
# Distinct books and distinct users that appear in the reviews.
print("Number of books: ", reviews['book_id'].nunique())
print("Number of users: ", reviews['user_id'].nunique())

Number of books:  218987
Number of users:  203655


### Data Visualization

In [60]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

In [None]:
#Book information

In [None]:
#Interaction information

In [61]:
#Review information