In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

## Step 1: Importing Data

### Import poetry book and review JSON files

In [2]:
# Both inputs are read with lines=True, i.e. one JSON record per line.
books_path = "poetry_data/goodreads_books_poetry.json"
reviews_path = "poetry_data/goodreads_reviews_poetry_cleaned.json"
books = pd.read_json(books_path, lines=True)
reviews = pd.read_json(reviews_path, lines=True)

### Displaying first 5 books (show some sample data 😊)

In [3]:
# Preview the first five rows of the books table.
books.head(5)

Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series
0,,1,[],US,eng,"[{'count': '8', 'name': 'to-read'}, {'count': ...",,False,3.83,,...,11,,1887,https://www.goodreads.com/book/show/16037549-v...,https://images.gr-assets.com/books/1348176637m...,16037549,3,5212748,Vision of Sir Launfal and Other Poems,Vision of Sir Launfal and Other Poems
1,811223981.0,2,[],US,,"[{'count': '100', 'name': 'to-read'}, {'count'...",,False,3.83,B00U2WY9U8,...,4,,2015,https://www.goodreads.com/book/show/22466716-f...,https://images.gr-assets.com/books/1404958407m...,22466716,37,41905435,Fairy Tales: Dramolettes,Fairy Tales: Dramolettes
2,374428115.0,7,[],US,,"[{'count': '32', 'name': 'to-read'}, {'count':...",,False,4.38,,...,7,,2008,https://www.goodreads.com/book/show/926662.Gro...,https://s.gr-assets.com/assets/nophoto/book/11...,926662,45,911665,Growltiger's Last Stand and Other Poems,Growltiger's Last Stand and Other Poems
3,156182890.0,12,[],US,,"[{'count': '554', 'name': 'to-read'}, {'count'...",,False,3.71,B00IWTRB1W,...,3,,1964,https://www.goodreads.com/book/show/926667.The...,https://images.gr-assets.com/books/1382939971m...,926667,115,995066,The Cocktail Party,The Cocktail Party
4,1942004192.0,4,[],US,eng,"[{'count': '228', 'name': 'to-read'}, {'count'...",,False,5.0,,...,12,First,2015,https://www.goodreads.com/book/show/29065952-l...,https://images.gr-assets.com/books/1455198396m...,29065952,9,49294781,Louder Than Everything You Love,Louder Than Everything You Love


### Displaying first 5 reviews (show some sample data 😊)

In [4]:
# Preview the first five rows of the reviews table.
reviews.head(5)

Unnamed: 0,user_id,book_id,review_id,review_text,rating,date
0,3ca7375dba942a760e53b726c472a7dd,402128,28423ff309bc896c071a8d9df4a10e8a,I have three younger siblings and we grew up w...,5,Fri Jun 15 11:41:12 -0700 2012
1,0ef32090550901ead25cb0ea21c4d36b,92270,2db1180992e2b0b1631a3ac5644bde84,This is my favorite collection of poetry.,5,Mon Apr 14 18:43:05 -0700 2014
2,d37b46b2190ed7c518259f29b47a9b36,253264,cb1ebc02d8b2aff15735d513877463ce,I just reread this play for a class I am takin...,5,Sat Sep 30 06:39:45 -0700 2017
3,f4c6fe33ef61c38f7f4aeb5224c259a5,13105527,22a588275798c15ed7757db0c526e296,"This ain't a book with to die for characters, ...",5,Tue May 07 10:26:08 -0700 2013
4,4672eb229c808b792b8ea95f01f19784,1420,ca9536ecc762a465f8d220ff87cd88b5,This is why kids don't like to read.,1,Sat Oct 11 01:45:42 -0700 2014


### Total number of poetry books in the dataset

In [5]:
# Row count of the books table (one row per book record).
books.shape[0]

36514

### Total number of reviews in the dataset

In [6]:
# Row count of the reviews table (one row per review).
reviews.shape[0]

109748

## Step 2: Clean the data

Looking into the language codes available in the books dataset - notice an empty one.

In [7]:
# Distinct language codes present in the books table, in order of first appearance.
books['language_code'].unique()

array(['eng', '', 'tur', 'per', 'ara', 'en-US', 'fre', 'cze', 'nl', 'gre',
       'ger', 'ben', 'spa', 'ukr', 'mul', 'rum', 'dan', 'nno', 'ind',
       'por', 'enm', 'fin', 'ita', 'rus', 'bul', 'scr', 'lit', 'msa',
       'swe', 'slv', 'srp', 'pol', 'en-CA', 'nor', 'hin', 'en-GB', 'slo',
       'kat', 'jpn', 'pes', 'lav', 'cat', 'urd', 'vie', 'tlh', 'est',
       'fil', 'heb', 'tam', 'mar', 'zho', 'grc', 'fao', 'mal', 'hun',
       'bos', 'arw', 'lat', 'isl', 'ang', 'glg', 'pan', 'tgl', 'gmh',
       'ira', 'hye', '--', 'tha', 'mon', 'dum', 'sin', 'kur', 'san',
       'peo', 'nob', 'amh', 'aze', 'frm', 'sco', 'kor', 'afr', 'bel',
       'nep', 'dut', 'gla', 'tel', 'dgr', 'ota', 'fro', 'aus', 'sqi',
       'mkd', 'snd'], dtype=object)

We will only be looking at English books for the purpose of this assignment - those language codes are: `en-US, eng, en-GB, and en-CA`

In [8]:
# Keep a book only when (a) its language code is one of the English codes and
# (b) its `similar_books` value is truthy, which drops empty entries.
english_codes = ['en-US', 'eng', 'en-GB', 'en-CA']
is_english = books['language_code'].isin(english_codes)
has_similar_books = books['similar_books'].astype(bool)
books = books[is_english & has_similar_books]

For the reviews data, we are using the already cleaned dataset. We will now merge the reviews with the books on book_id, and afterwards group the result by book_id so each book has a single entry with all of its reviews combined.

### Merge both books and reviews based on the book_id

In [9]:
# Join every review to its book record on book_id. pandas' default join is
# "inner", so reviews of books removed by the filters above are dropped.
books_and_reviews = reviews.merge(books, on='book_id')

### Display the first few rows of new dataframe

In [10]:
# Preview the first five rows of the merged review/book table.
books_and_reviews.head(5)

Unnamed: 0,user_id,book_id,review_id,review_text,rating,date,isbn,text_reviews_count,series,country_code,...,isbn13,publication_month,edition_information,publication_year,url,image_url,ratings_count,work_id,title,title_without_series
0,3ca7375dba942a760e53b726c472a7dd,402128,28423ff309bc896c071a8d9df4a10e8a,I have three younger siblings and we grew up w...,5,Fri Jun 15 11:41:12 -0700 2012,151686564,626,[],US,...,9780151686568,8,Illustrated Edition,1982,https://www.goodreads.com/book/show/402128.Old...,https://images.gr-assets.com/books/1327882662m...,15716,372536,Old Possum's Book of Practical Cats,Old Possum's Book of Practical Cats
1,408ad43536dd5340cf6f955fc44f8014,402128,b4499b5b99b4462b2ebdb883561cf6b3,Although T.S. Eliot displays his complete mast...,3,Sat Dec 28 07:35:21 -0800 2013,151686564,626,[],US,...,9780151686568,8,Illustrated Edition,1982,https://www.goodreads.com/book/show/402128.Old...,https://images.gr-assets.com/books/1327882662m...,15716,372536,Old Possum's Book of Practical Cats,Old Possum's Book of Practical Cats
2,a93cf495a6ec60a24ed11c1ae6f45aa9,402128,a87527ad1fe429dc41cc332723fb91de,"Eh, as an avid cat lover I was a little disapp...",3,Wed Jun 12 14:32:42 -0700 2013,151686564,626,[],US,...,9780151686568,8,Illustrated Edition,1982,https://www.goodreads.com/book/show/402128.Old...,https://images.gr-assets.com/books/1327882662m...,15716,372536,Old Possum's Book of Practical Cats,Old Possum's Book of Practical Cats
3,8d742d4c7a925de61e1574884bd2985a,402128,5e77a73030136445dcc359de17bd3e6a,This one was given to me for my graduation fro...,5,Sun May 18 21:15:35 -0700 2008,151686564,626,[],US,...,9780151686568,8,Illustrated Edition,1982,https://www.goodreads.com/book/show/402128.Old...,https://images.gr-assets.com/books/1327882662m...,15716,372536,Old Possum's Book of Practical Cats,Old Possum's Book of Practical Cats
4,d4f1cec123612b1bfc04752fa0c190ab,402128,8521d4c827321716b81d1d36bf9a6f6e,I love cats and T.S Elliot. Why I have never r...,5,Thu Aug 18 20:26:59 -0700 2016,151686564,626,[],US,...,9780151686568,8,Illustrated Edition,1982,https://www.goodreads.com/book/show/402128.Old...,https://images.gr-assets.com/books/1327882662m...,15716,372536,Old Possum's Book of Practical Cats,Old Possum's Book of Practical Cats


### Total Number of Rows in the New Dataframe

In [11]:
# Number of review rows that survived the merge.
books_and_reviews.shape[0]

50014

In [12]:
# Collapse to one row per book by space-joining each text column across all of
# that book's reviews. Only columns whose values are all `str` can be joined:
# older pandas silently dropped the rest, while pandas >= 2.0 raises a
# TypeError on them, so the joinable columns are selected explicitly.
joinable_columns = [
    column
    for column in books_and_reviews.columns
    if column != 'book_id'
    and books_and_reviews[column].map(lambda value: isinstance(value, str)).all()
]
books_and_reviews = (
    books_and_reviews
    .groupby('book_id', as_index=False)[joinable_columns]
    .agg(' '.join)
)
books_and_reviews.head()

Unnamed: 0,book_id,user_id,review_id,review_text,date,isbn,country_code,language_code,asin,is_ebook,...,num_pages,publication_day,isbn13,publication_month,edition_information,publication_year,url,image_url,title,title_without_series
0,289,9fb72b46fb5e148a8a085b359f83420c 274923c607c1d...,c3700d2dbd14b55f1f03b49d49902816 64144beaf3890...,"Of course, the various stories and poems in th...",Tue Jun 23 12:14:41 -0700 2015 Wed Sep 29 09:5...,014019553X 014019553X 014019553X 014019553X,US US US US,eng eng eng eng,,false false false false,...,102 102 102 102,1 1 1 1,9780140195538 9780140195538 9780140195538 9780...,1 1 1 1,,1998 1998 1998 1998,https://www.goodreads.com/book/show/289.The_Be...,https://s.gr-assets.com/assets/nophoto/book/11...,The Beloved: Reflections on the Path of the He...,The Beloved: Reflections on the Path of the He...
1,290,669d7e4cac34818ef3bedb0b258b85bd c24e8a7a4c2c3...,5ae101f2c913c22af90898cc60ecd840 d15fe6439e2bd...,kahlil gibran always makes me speechless... An...,Fri Sep 28 01:36:16 -0700 2007 Sat Dec 28 09:3...,0394431243 0394431243 0394431243 0394431243 03...,US US US US US,eng eng eng eng eng,,false false false false false,...,216 216 216 216 216,21 21 21 21 21,9780394431246 9780394431246 9780394431246 9780...,2 2 2 2 2,,1995 1995 1995 1995 1995,https://www.goodreads.com/book/show/290.Jesus_...,https://images.gr-assets.com/books/1374680606m...,Jesus the Son of Man Jesus the Son of Man Jesu...,Jesus the Son of Man Jesus the Son of Man Jesu...
2,291,98ff24d29e10421bc146a4f4628a44e8 12c322a9fb854...,2f5b2512a17cfe49cca787317d0a5493 f6e06696f79d1...,That's Why I'm Proud To Be Lebanese . Awesome ...,Sat Feb 08 11:02:26 -0800 2014 Fri Sep 23 05:0...,0806501901 0806501901 0806501901 0806501901 08...,US US US US US US US US US US US US US US US U...,eng eng eng eng eng eng eng eng eng eng eng en...,,false false false false false false false fals...,...,132 132 132 132 132 132 132 132 132 132 132 13...,3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 ...,9780806501901 9780806501901 9780806501901 9780...,3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 ...,,2003 2003 2003 2003 2003 2003 2003 2003 2003 2...,https://www.goodreads.com/book/show/291.The_Br...,https://s.gr-assets.com/assets/nophoto/book/11...,The Broken Wings The Broken Wings The Broken W...,The Broken Wings The Broken Wings The Broken W...
3,292,81be7ae2d750bf2fa5dd66cfb8e2beae 3de98571562b1...,0eb3bdf942c051525d8ce1db01f98a5d 6912388623fb1...,It's a good book if you need guidance when you...,Thu Dec 04 08:33:44 -0800 2014 Wed Feb 03 19:1...,067943920X 067943920X 067943920X 067943920X 06...,US US US US US US US US US US US US US US US U...,eng eng eng eng eng eng eng eng eng eng eng en...,,false false false false false false false fals...,...,100 100 100 100 100 100 100 100 100 100 100 10...,14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 1...,9780679439202 9780679439202 9780679439202 9780...,6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6,Kahlil Gibran Pocket Library Kahlil Gibran Poc...,2011 2011 2011 2011 2011 2011 2011 2011 2011 2...,https://www.goodreads.com/book/show/292.Sand_a...,https://s.gr-assets.com/assets/nophoto/book/11...,Sand and Foam Sand and Foam Sand and Foam Sand...,Sand and Foam Sand and Foam Sand and Foam Sand...
4,484,4a18ad470b98d44009a07351584548b2,3db564bc1ebf91f3f6f5aeb01f99c933,I vote for kitchen tables as an excellent plac...,Sat Nov 24 00:35:23 -0800 2007,0393318281,US,eng,,false,...,576,17,9780393318289,9,,1998,https://www.goodreads.com/book/show/484.Reinve...,https://images.gr-assets.com/books/1450753166m...,Reinventing the Enemy's Language: Contemporary...,Reinventing the Enemy's Language: Contemporary...


In [13]:
# Add one free-text column per book (title, description and review text).
# The parts are separated by a space so the last word of one field does not
# fuse with the first word of the next and become a bogus TF-IDF token.
# NOTE(review): after the per-book aggregation, title (and presumably
# description) is repeated once per review, which weights those fields more
# heavily for frequently reviewed books - confirm this is intended.
books_and_reviews['text_data'] = (
    books_and_reviews['title']
    + ' ' + books_and_reviews['description']
    + ' ' + books_and_reviews['review_text']
)

In [14]:
# Number of books left after collapsing to one row per book.
books_and_reviews.shape[0]

4430

## Step 3: Initialize module

### Get the system started based only on text descriptions

In [15]:
# TF-IDF model from sklearn; English stop words are ignored.
tf_idf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and produce one TF-IDF row vector per book.
corpus = books_and_reviews['text_data']
vectors = tf_idf.fit_transform(corpus)

# Book-by-book similarity matrix. TfidfVectorizer L2-normalises each row by
# default, so the plain dot product computed here equals cosine similarity.
cosine_sim_matrix = linear_kernel(vectors, Y=vectors)

## Step 4: Decision module

### Given a book_id, we will calculate the cosine similarities and sort them to get the top recommended books

In [16]:
# Map each book_id to its row label in books_and_reviews; that label is used
# below as the row number in cosine_sim_matrix.
indices = pd.Series(books_and_reviews.index, index=books_and_reviews['book_id'])


def recommend_book(book_id, top_n=7):
    """Return the row positions of the books most similar to `book_id`.

    Parameters
    ----------
    book_id
        A book id present in the index of `indices`.
    top_n : int, default 7
        Maximum number of recommendations to return (non-negative).

    Returns
    -------
    list of int
        Row positions into `cosine_sim_matrix` (used by callers with
        `books_and_reviews.iloc`), ordered from most to least similar.
        The query book itself is never included.

    Raises
    ------
    KeyError
        If `book_id` is not in `indices`.
    """
    # row of the similarity matrix that belongs to the query book
    index = indices[book_id]

    # (position, score) pairs ordered by descending similarity
    ranked = sorted(
        enumerate(cosine_sim_matrix[index]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Exclude the query book by position instead of assuming it is ranked
    # first: with tied scores (e.g. another book with identical text, or an
    # all-zero vector) a different book can sort ahead of it.
    return [position for position, _ in ranked if position != index][:top_n]

### Select a random book to generate recommendations for

In [17]:
# Draw one row at random from the per-book table and remember its book_id.
random_book = books_and_reviews.sample()
book_selected = random_book['book_id'].iloc[0]

# Look the title up in `books` and print it.
selected_title = books.loc[books['book_id'] == book_selected, 'title'].iloc[0]
print(selected_title)

On Poetry


### Now, generate the top 7 book recommendations based on the book selected

In [40]:
# The dataset's own `similar_books` entry for the selected book. The result is
# a pandas Series (one element per matching row in `books`), so the loop
# prints each row's value.
similar_books = books.loc[books['book_id'] == book_selected, 'similar_books']
for book in similar_books:
    print(book)

['1322596', '12030685', '202158', '80992', '10332278', '1830161', '440025', '81705', '540174', '20763601', '3998688', '896812', '13166604', '3878581', '13571581', '7517330', '23719365', '206633']


In [None]:
# pass the book_id to recommend_book to get the top 7 book recommendations
top_7 = recommend_book(book_selected)

# `similar_books` is a pandas Series whose values are lists of book ids.
# `x in series` tests the Series' index labels, not its values, so the lists
# are flattened into a set of id strings before checking membership.
similar_ids = {
    str(similar_id)
    for id_list in similar_books
    for similar_id in id_list
}

# print every recommended book_id and count how many of them also appear in
# the dataset's similar-books list for the selected book
count = 0
for book_index in top_7:
    recommended_id = books_and_reviews.iloc[book_index].book_id
    if str(recommended_id) in similar_ids:
        count += 1
    print(recommended_id)

# number of recommendations confirmed by the similar-books list
count

In [31]:

# Sanity check: is book 80992 listed among the selected book's similar books?
# The id lists live in the Series *values*; `in` applied to the Series itself
# would only look at its index labels, so each list is checked instead.
if any(str(80992) in id_list for id_list in similar_books):
    print("yea~")

### In the approach above, we could only generate recommendations from a single selected book, and we can do better than that. For this next approach, we will attempt to improve our recommender by creating a user profile and generating recommendations for that user!

In [None]:
# number of unique users
# After the per-book aggregation each `user_id` cell holds the ids of all
# reviewers of that book separated by spaces, so the ids are split apart
# before counting; counting the raw cells would count reviewer combinations.
books_and_reviews['user_id'].str.split().explode().nunique()

In [None]:
# Draw one row at random and take its `user_id` value.
# NOTE(review): after the per-book aggregation a `user_id` cell can hold
# several space-separated ids, so this value may name more than one user -
# confirm that a single user is intended here.
sampled_row = books_and_reviews.sample()
user_selected = sampled_row['user_id'].iloc[0]
user_selected

In [None]:
# pull all reviewed books to build a user profile
# `user_id` holds every reviewer of a book joined by spaces, so an exact
# string comparison only matches books with an identical reviewer list.
# Match on membership instead. If `user_selected` itself contains several
# ids, the first one is used as the profile's user.
profile_user = user_selected.split()[0]
reviewed_by_user = books_and_reviews['user_id'].str.split().map(
    lambda reviewer_ids: profile_user in reviewer_ids
)

# .copy() so later column assignments act on an independent frame
# NOTE(review): `review_text` in these rows is the combined text of all
# reviewers of the book, not only this user's reviews.
user_profile = books_and_reviews[reviewed_by_user].copy()
len(user_profile)

In [None]:
# build text data for user profile
# `assign` returns a new frame, which avoids pandas' SettingWithCopyWarning
# when `user_profile` is a filtered slice. The fields are separated by a
# space so adjacent words from different fields do not fuse into one token.
user_profile = user_profile.assign(
    text_data=(
        user_profile['title']
        + ' ' + user_profile['description']
        + ' ' + user_profile['review_text']
    )
)

user_profile.head()