In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

## Step 1: Importing Data

### Import poetry book and review JSON files

In [3]:
books = pd.read_json("data/goodreads_books_poetry.json", lines = True)
reviews = pd.read_json("data/goodreads_reviews_poetry_cleaned.json", lines = True)

### Displaying first 5 books  (show some sample data 😊)

In [4]:
books.head()

Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series
0,,1,[],US,eng,"[{'count': '8', 'name': 'to-read'}, {'count': ...",,False,3.83,,...,11,,1887,https://www.goodreads.com/book/show/16037549-v...,https://images.gr-assets.com/books/1348176637m...,16037549,3,5212748,Vision of Sir Launfal and Other Poems,Vision of Sir Launfal and Other Poems
1,811223981.0,2,[],US,,"[{'count': '100', 'name': 'to-read'}, {'count'...",,False,3.83,B00U2WY9U8,...,4,,2015,https://www.goodreads.com/book/show/22466716-f...,https://images.gr-assets.com/books/1404958407m...,22466716,37,41905435,Fairy Tales: Dramolettes,Fairy Tales: Dramolettes
2,374428115.0,7,[],US,,"[{'count': '32', 'name': 'to-read'}, {'count':...",,False,4.38,,...,7,,2008,https://www.goodreads.com/book/show/926662.Gro...,https://s.gr-assets.com/assets/nophoto/book/11...,926662,45,911665,Growltiger's Last Stand and Other Poems,Growltiger's Last Stand and Other Poems
3,156182890.0,12,[],US,,"[{'count': '554', 'name': 'to-read'}, {'count'...",,False,3.71,B00IWTRB1W,...,3,,1964,https://www.goodreads.com/book/show/926667.The...,https://images.gr-assets.com/books/1382939971m...,926667,115,995066,The Cocktail Party,The Cocktail Party
4,1942004192.0,4,[],US,eng,"[{'count': '228', 'name': 'to-read'}, {'count'...",,False,5.0,,...,12,First,2015,https://www.goodreads.com/book/show/29065952-l...,https://images.gr-assets.com/books/1455198396m...,29065952,9,49294781,Louder Than Everything You Love,Louder Than Everything You Love


### Displaying first 5 reviews (show some sample data 😊)

In [5]:
reviews.head()

Unnamed: 0,user_id,book_id,review_id,review_text,rating,date
0,3ca7375dba942a760e53b726c472a7dd,402128,28423ff309bc896c071a8d9df4a10e8a,I have three younger siblings and we grew up w...,5,Fri Jun 15 11:41:12 -0700 2012
1,0ef32090550901ead25cb0ea21c4d36b,92270,2db1180992e2b0b1631a3ac5644bde84,This is my favorite collection of poetry.,5,Mon Apr 14 18:43:05 -0700 2014
2,d37b46b2190ed7c518259f29b47a9b36,253264,cb1ebc02d8b2aff15735d513877463ce,I just reread this play for a class I am takin...,5,Sat Sep 30 06:39:45 -0700 2017
3,f4c6fe33ef61c38f7f4aeb5224c259a5,13105527,22a588275798c15ed7757db0c526e296,"This ain't a book with to die for characters, ...",5,Tue May 07 10:26:08 -0700 2013
4,4672eb229c808b792b8ea95f01f19784,1420,ca9536ecc762a465f8d220ff87cd88b5,This is why kids don't like to read.,1,Sat Oct 11 01:45:42 -0700 2014


### Total number of poetry books in the dataset

In [6]:
len(books)

36514

### Total number of reviews in the dataset

In [7]:
len(reviews)

109748

### Merge both books and reviews based on the book_id

In [8]:
books_and_reviews = pd.merge(reviews, books, on=['book_id'])

### Display the first few rows of new dataframe

In [9]:
books_and_reviews.head()

Unnamed: 0,user_id,book_id,review_id,review_text,rating,date,isbn,text_reviews_count,series,country_code,...,isbn13,publication_month,edition_information,publication_year,url,image_url,ratings_count,work_id,title,title_without_series
0,3ca7375dba942a760e53b726c472a7dd,402128,28423ff309bc896c071a8d9df4a10e8a,I have three younger siblings and we grew up w...,5,Fri Jun 15 11:41:12 -0700 2012,151686564,626,[],US,...,9780151686568,8,Illustrated Edition,1982,https://www.goodreads.com/book/show/402128.Old...,https://images.gr-assets.com/books/1327882662m...,15716,372536,Old Possum's Book of Practical Cats,Old Possum's Book of Practical Cats
1,408ad43536dd5340cf6f955fc44f8014,402128,b4499b5b99b4462b2ebdb883561cf6b3,Although T.S. Eliot displays his complete mast...,3,Sat Dec 28 07:35:21 -0800 2013,151686564,626,[],US,...,9780151686568,8,Illustrated Edition,1982,https://www.goodreads.com/book/show/402128.Old...,https://images.gr-assets.com/books/1327882662m...,15716,372536,Old Possum's Book of Practical Cats,Old Possum's Book of Practical Cats
2,a93cf495a6ec60a24ed11c1ae6f45aa9,402128,a87527ad1fe429dc41cc332723fb91de,"Eh, as an avid cat lover I was a little disapp...",3,Wed Jun 12 14:32:42 -0700 2013,151686564,626,[],US,...,9780151686568,8,Illustrated Edition,1982,https://www.goodreads.com/book/show/402128.Old...,https://images.gr-assets.com/books/1327882662m...,15716,372536,Old Possum's Book of Practical Cats,Old Possum's Book of Practical Cats
3,8d742d4c7a925de61e1574884bd2985a,402128,5e77a73030136445dcc359de17bd3e6a,This one was given to me for my graduation fro...,5,Sun May 18 21:15:35 -0700 2008,151686564,626,[],US,...,9780151686568,8,Illustrated Edition,1982,https://www.goodreads.com/book/show/402128.Old...,https://images.gr-assets.com/books/1327882662m...,15716,372536,Old Possum's Book of Practical Cats,Old Possum's Book of Practical Cats
4,d4f1cec123612b1bfc04752fa0c190ab,402128,8521d4c827321716b81d1d36bf9a6f6e,I love cats and T.S Elliot. Why I have never r...,5,Thu Aug 18 20:26:59 -0700 2016,151686564,626,[],US,...,9780151686568,8,Illustrated Edition,1982,https://www.goodreads.com/book/show/402128.Old...,https://images.gr-assets.com/books/1327882662m...,15716,372536,Old Possum's Book of Practical Cats,Old Possum's Book of Practical Cats


### Total Number of Rows in the New Dataframe

In [10]:
len(books_and_reviews)

109748

## Step 2: Clean Data (further)

In [11]:
# making one entry per book - essentially merging all reviews together - we can drop columns that we dont need
books_and_reviews = books_and_reviews.groupby(['book_id', 'title', 'description'], as_index=False).agg(' '.join)
books_and_reviews.head()

Unnamed: 0,book_id,title,description,user_id,review_id,review_text,date,isbn,country_code,language_code,...,publisher,num_pages,publication_day,isbn13,publication_month,edition_information,publication_year,url,image_url,title_without_series
0,234,The Complete Verse and Other Nonsense,This collection demonstrates the varied ways i...,b0ed09d14a462cb2d27327b87edf6f68 440348b13a770...,ac69e58a4c11b7620230b537abef3447 1d8b0a10d3ad6...,"Well, he's not PC, but some of the poems are q...",Fri Oct 14 12:33:32 -0700 2016 Tue Oct 23 07:0...,0142002275 0142002275 0142002275 0142002275 01...,US US US US US,,...,,,,9780142002278 9780142002278 9780142002278 9780...,,,,https://www.goodreads.com/book/show/234.The_Co...,https://s.gr-assets.com/assets/nophoto/book/11...,The Complete Verse and Other Nonsense The Comp...
1,236,The Complete Nonsense of Edward Lear,"The Owl and the Pussy-Cat, Calico Pie and The ...",0f6b8c04f811e05c8978bd6b66ce7685 16552ce64bf04...,59489408a21800b9b9f8e2854334d425 6ce6f4dcb673d...,I'm not sure which edition of Edward Lear's no...,Mon Dec 08 07:42:41 -0800 2014 Mon Dec 08 07:4...,089190090X 089190090X 089190090X 089190090X 08...,US US US US US US,,...,Amereon Limited Amereon Limited Amereon Limite...,287 287 287 287 287 287,1 1 1 1 1 1,9780891900900 9780891900900 9780891900900 9780...,6 6 6 6 6 6,,1940 1940 1940 1940 1940 1940,https://www.goodreads.com/book/show/236.The_Co...,https://s.gr-assets.com/assets/nophoto/book/11...,The Complete Nonsense of Edward Lear The Compl...
2,241,A Was Once an Apple Pie,Hand painted cut paper artwork by Caldecott wi...,ff6d428e075e07702736c9d047b2c046 9e776d13dbc6d...,c1794a9d79e3af528ef09bff506b8e6f 702ff640b190a...,Handpainted cut paper artwork by Caldecott win...,Mon Feb 22 11:29:08 -0800 2016 Tue Apr 09 18:3...,0439660564 0439660564 0439660564 0439660564 04...,US US US US US US US US US US US,,...,Orchard Books Orchard Books Orchard Books Orch...,32 32 32 32 32 32 32 32 32 32 32,1 1 1 1 1 1 1 1 1 1 1,9780439660563 9780439660563 9780439660563 9780...,9 9 9 9 9 9 9 9 9 9 9,,2005 2005 2005 2005 2005 2005 2005 2005 2005 2...,https://www.goodreads.com/book/show/241.A_Was_...,https://images.gr-assets.com/books/1328868141m...,A Was Once an Apple Pie A Was Once an Apple Pi...
3,244,The Puffin Book Of Nonsense Verse,Ever eaten Poodle Strudel? Slain a Jabberwock?...,0d7f985364140270f38979a08b54ce64 f1c65b0db8a70...,9884a91605b165994bb902e8f1c1a00b b236bf469b5e5...,"notable - ogden nash, -the purist-. Laura Rich...",Mon Dec 31 18:22:40 -0800 2012 Fri May 16 03:3...,0140366601 0140366601,US US,eng eng,...,Puffin Puffin,287 287,3 3,9780140366600 9780140366600,10 10,,1996 1996,https://www.goodreads.com/book/show/244.The_Pu...,https://images.gr-assets.com/books/1356458077m...,The Puffin Book Of Nonsense Verse The Puffin B...
4,254,The Time of the Assassins: a Study of Rimbaud,The social function of the creative personalit...,2f1a60ed9be2d8a674d4280123f0b181 ae34bdae58ab6...,31f08519c9cd14b8bb3b5b61f00e84ca 1ac3873237276...,This was my first Miller book and I wasn't qui...,Fri Sep 09 12:35:30 -0700 2016 Thu Dec 17 04:3...,0811201155 0811201155 0811201155 0811201155 08...,US US US US US US US US US US US US US US,,...,New Directions New Directions New Directions N...,160 160 160 160 160 160 160 160 160 160 160 16...,17 17 17 17 17 17 17 17 17 17 17 17 17 17,9780811201155 9780811201155 9780811201155 9780...,1 1 1 1 1 1 1 1 1 1 1 1 1 1,,1962 1962 1962 1962 1962 1962 1962 1962 1962 1...,https://www.goodreads.com/book/show/254.The_Ti...,https://s.gr-assets.com/assets/nophoto/book/11...,The Time of the Assassins: a Study of Rimbaud...


In [12]:
# add extra column of only text (title, description, and review text)
books_and_reviews['text_data'] = books_and_reviews['title'] + books_and_reviews['description'] + books_and_reviews['review_text'] 

In [13]:
len(books_and_reviews)

29112

## Step 3: Initialize module

### Get the system started based only on text descriptions

In [14]:
# initialize vectorizer from sklearn and remove stopwords
tf_idf = TfidfVectorizer(stop_words='english')

# generate vectors for text data
vectors = tf_idf.fit_transform(books_and_reviews['text_data'])

# calculate cosine similarities (matrix)
cosine_sim_matrix = linear_kernel(vectors, vectors)

## Step 4: Decision module

### Given a book_id, we will calculate the cosine similarities and sort them to get the top recommended books

In [15]:
indices = pd.Series(books_and_reviews.index, index=books_and_reviews['book_id']).drop_duplicates()
def recommend_book(book_id):
    # get the index of the book we're referencing/comparing to for recommendations
    index = indices[book_id]
    
    # get pairwise cosine similarities
    cosine_sim = list(enumerate(cosine_sim_matrix[index]))
    
    # sort on cosine similarity scores (descending order)
    cosine_sim = sorted(cosine_sim, key=lambda x: x[1], reverse=True)
    
    # get the top 10 books
    cosine_sim = cosine_sim[1:11]
    top_10 = [score[0] for score in cosine_sim]
    return top_10

### Generate a random book to generate recommendations for

In [16]:
#select a random book to use and generate recommendations on
random_book = books_and_reviews.sample()
book_selected = random_book['book_id'].values[0]
print('=================book selected=================')
print(random_book['title'].values[0])

The Monkey's Straw Raincoat and Other Poetry of the Basho School


### Now, generate the top 10 book recommendations based on the book selected

In [17]:
print('=================testing - generating recommendations=================')
# pass the book_id to recommend_book to get the top 10 book recommendations
top_10 = recommend_book(book_selected)
for book_index in top_10:
    book = books_and_reviews.iloc[book_index]
    id = book.book_id
    print(book.title)

The Heart of Haiku
On Love and Barley: Haiku of Basho
Basho: The Complete Haiku
The Narrow Road to the Deep North and Other Travel Sketches
Lips Too Chilled
Narrow Road to the Interior
The Essential Basho
Basho's Narrow Road: Spring and Autumn Passages
Narrow Road to the Interior: And Other Writings
The Essential Haiku: Versions of Basho, Buson, and Issa
