In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

## Step 1: Importing Data

### Import poetry book and review JSON files

In [2]:
books = pd.read_json("poetry_data/goodreads_books_poetry.json", lines = True)
reviews = pd.read_json("poetry_data/goodreads_reviews_poetry.json", lines = True)

### Displaying first 5 books  (show some sample data 😊)

In [3]:
books.head()

Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series
0,,1,[],US,eng,"[{'count': '8', 'name': 'to-read'}, {'count': ...",,False,3.83,,...,11,,1887,https://www.goodreads.com/book/show/16037549-v...,https://images.gr-assets.com/books/1348176637m...,16037549,3,5212748,Vision of Sir Launfal and Other Poems,Vision of Sir Launfal and Other Poems
1,811223981.0,2,[],US,,"[{'count': '100', 'name': 'to-read'}, {'count'...",,False,3.83,B00U2WY9U8,...,4,,2015,https://www.goodreads.com/book/show/22466716-f...,https://images.gr-assets.com/books/1404958407m...,22466716,37,41905435,Fairy Tales: Dramolettes,Fairy Tales: Dramolettes
2,374428115.0,7,[],US,,"[{'count': '32', 'name': 'to-read'}, {'count':...",,False,4.38,,...,7,,2008,https://www.goodreads.com/book/show/926662.Gro...,https://s.gr-assets.com/assets/nophoto/book/11...,926662,45,911665,Growltiger's Last Stand and Other Poems,Growltiger's Last Stand and Other Poems
3,156182890.0,12,[],US,,"[{'count': '554', 'name': 'to-read'}, {'count'...",,False,3.71,B00IWTRB1W,...,3,,1964,https://www.goodreads.com/book/show/926667.The...,https://images.gr-assets.com/books/1382939971m...,926667,115,995066,The Cocktail Party,The Cocktail Party
4,1942004192.0,4,[],US,eng,"[{'count': '228', 'name': 'to-read'}, {'count'...",,False,5.0,,...,12,First,2015,https://www.goodreads.com/book/show/29065952-l...,https://images.gr-assets.com/books/1455198396m...,29065952,9,49294781,Louder Than Everything You Love,Louder Than Everything You Love


### Displaying first 5 reviews (show some sample data 😊)

In [4]:
reviews.head()

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
0,3ca7375dba942a760e53b726c472a7dd,402128,28423ff309bc896c071a8d9df4a10e8a,5,I have three younger siblings and we grew up w...,Tue Jun 12 08:59:04 -0700 2012,Fri Jun 15 11:41:12 -0700 2012,,,0,0
1,0ef32090550901ead25cb0ea21c4d36b,92270,2db1180992e2b0b1631a3ac5644bde84,5,This is my favorite collection of poetry.,Mon Apr 14 18:42:40 -0700 2014,Mon Apr 14 18:43:05 -0700 2014,Wed Jan 01 00:00:00 -0800 1997,,0,0
2,0ef32090550901ead25cb0ea21c4d36b,908708,bca57fa40e92c9261b00b03dbebd96fe,4,"He's so disturbing. So very, very disturbing.",Tue Apr 22 13:58:10 -0700 2008,Tue Apr 22 13:58:33 -0700 2008,,,0,0
3,d37b46b2190ed7c518259f29b47a9b36,253264,cb1ebc02d8b2aff15735d513877463ce,5,I just reread this play for a class I am takin...,Wed Sep 27 19:08:08 -0700 2017,Sat Sep 30 06:39:45 -0700 2017,Wed Sep 27 00:00:00 -0700 2017,Tue Sep 26 00:00:00 -0700 2017,1,0
4,af157d0205b8a901dee6d4a2aed7e6ad,70885,8dca128b8e869048a7442c18659dbece,5,"Cuanto mas leo, mas me gusta. Su poesia es env...",Thu Jun 18 20:00:03 -0700 2015,Thu Jun 18 20:01:29 -0700 2015,Thu Jun 18 00:00:00 -0700 2015,Tue Jun 16 00:00:00 -0700 2015,0,0


### Total number of poetry books in the dataset

In [5]:
len(books)

36514

### Total number of reviews in the dataset

In [6]:
len(reviews)

154555

### Merge both book and review dataset on book_id

In [7]:
books_and_reviews = pd.merge(reviews, books, on=['book_id'])

### Displaying first 5 rows of data

In [8]:
books_and_reviews.head()

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,...,isbn13,publication_month,edition_information,publication_year,url,image_url,ratings_count,work_id,title,title_without_series
0,3ca7375dba942a760e53b726c472a7dd,402128,28423ff309bc896c071a8d9df4a10e8a,5,I have three younger siblings and we grew up w...,Tue Jun 12 08:59:04 -0700 2012,Fri Jun 15 11:41:12 -0700 2012,,,0,...,9780151686568,8,Illustrated Edition,1982,https://www.goodreads.com/book/show/402128.Old...,https://images.gr-assets.com/books/1327882662m...,15716,372536,Old Possum's Book of Practical Cats,Old Possum's Book of Practical Cats
1,408ad43536dd5340cf6f955fc44f8014,402128,b4499b5b99b4462b2ebdb883561cf6b3,3,Although T.S. Eliot displays his complete mast...,Sat Dec 28 07:30:50 -0800 2013,Sat Dec 28 07:35:21 -0800 2013,Fri Dec 27 00:00:00 -0800 2013,,0,...,9780151686568,8,Illustrated Edition,1982,https://www.goodreads.com/book/show/402128.Old...,https://images.gr-assets.com/books/1327882662m...,15716,372536,Old Possum's Book of Practical Cats,Old Possum's Book of Practical Cats
2,a93cf495a6ec60a24ed11c1ae6f45aa9,402128,a87527ad1fe429dc41cc332723fb91de,3,"Eh, as an avid cat lover I was a little disapp...",Thu Jun 21 18:01:08 -0700 2012,Wed Jun 12 14:32:42 -0700 2013,Wed Jun 12 14:32:42 -0700 2013,Wed Jun 12 00:00:00 -0700 2013,0,...,9780151686568,8,Illustrated Edition,1982,https://www.goodreads.com/book/show/402128.Old...,https://images.gr-assets.com/books/1327882662m...,15716,372536,Old Possum's Book of Practical Cats,Old Possum's Book of Practical Cats
3,8d742d4c7a925de61e1574884bd2985a,402128,5e77a73030136445dcc359de17bd3e6a,5,This one was given to me for my graduation fro...,Sun May 18 21:10:26 -0700 2008,Sun May 18 21:15:35 -0700 2008,,,0,...,9780151686568,8,Illustrated Edition,1982,https://www.goodreads.com/book/show/402128.Old...,https://images.gr-assets.com/books/1327882662m...,15716,372536,Old Possum's Book of Practical Cats,Old Possum's Book of Practical Cats
4,d4f1cec123612b1bfc04752fa0c190ab,402128,8521d4c827321716b81d1d36bf9a6f6e,5,I love cats and T.S Elliot. Why I have never r...,Sat Jul 16 11:33:49 -0700 2016,Thu Aug 18 20:26:59 -0700 2016,Thu Aug 18 00:00:00 -0700 2016,Tue Aug 16 00:00:00 -0700 2016,3,...,9780151686568,8,Illustrated Edition,1982,https://www.goodreads.com/book/show/402128.Old...,https://images.gr-assets.com/books/1327882662m...,15716,372536,Old Possum's Book of Practical Cats,Old Possum's Book of Practical Cats


### Total Number of Rows in Dataframe

In [9]:
len(books_and_reviews)

154555

## Step 2: Clean the data

Looking into the language codes available in the dataset - notice an empty one.

In [10]:
books_and_reviews.language_code.unique()

array(['en-US', 'eng', '', 'ara', 'fin', 'ind', 'swe', 'en-GB', 'en-CA',
       'spa', 'msa', 'por', 'nl', 'mul', 'fre', 'ger', 'cze', 'per',
       'fil', 'srp', 'ita', 'bul', 'tur', 'lit', 'rum', 'hin', 'pol',
       'ben', 'nor', 'nno', 'nob', 'vie', 'rus', 'ukr', 'est', 'scr',
       'tlh', 'nep', 'enm', 'dan', 'lav', 'slo', 'gre', 'urd', 'afr',
       'mon', 'arw', 'heb', 'lat', 'sco', 'grc', 'dum', 'hye', '--',
       'slv', 'kat', 'tel', 'pes', 'tam', 'mkd', 'peo', 'sqi', 'hun',
       'jpn', 'tha', 'ota', 'glg', 'ang', 'sin', 'aus', 'bel', 'snd',
       'tgl', 'ira', 'mal', 'cat', 'isl', 'aze', 'amh', 'zho', 'dgr',
       'pan', 'fro', 'bos', 'gmh', 'kur', 'san', 'frm', 'mar', 'fao',
       'dut', 'gla', 'kor'], dtype=object)

We will only be looking at english books for the purpose of this assignment - those language codes are: `en-US, eng, en-GB, and en-CA`

In [11]:
books_and_reviews = books_and_reviews.loc[books_and_reviews['language_code'].isin(['en-US', 'eng', 'en-GB', 'en-CA'])]

### New number of rows after removing books not in english

In [12]:
len(books_and_reviews)

66394

In [13]:
# making one entry per book - essentially merging all reviews together (arely) - we can drop columns that we dont need
temp = books_and_reviews.groupby(['book_id', 'title', 'description'], as_index=False).agg(' '.join)
temp.head()

Unnamed: 0,book_id,title,description,user_id,review_id,review_text,date_added,date_updated,read_at,started_at,...,publisher,num_pages,publication_day,isbn13,publication_month,edition_information,publication_year,url,image_url,title_without_series
0,244,The Puffin Book Of Nonsense Verse,Ever eaten Poodle Strudel? Slain a Jabberwock?...,0d7f985364140270f38979a08b54ce64 f1c65b0db8a70...,9884a91605b165994bb902e8f1c1a00b b236bf469b5e5...,"notable - ogden nash, -the purist-. Laura Rich...",Sun Dec 23 22:42:27 -0800 2012 Thu Feb 20 16:1...,Mon Dec 31 18:22:40 -0800 2012 Fri May 16 03:3...,Mon Dec 31 00:00:00 -0800 2012 Sat May 10 00:0...,Mon Dec 24 00:00:00 -0800 2012 Thu Feb 20 00:0...,...,Puffin Puffin,287 287,3 3,9780140366600 9780140366600,10 10,,1996 1996,https://www.goodreads.com/book/show/244.The_Pu...,https://images.gr-assets.com/books/1356458077m...,The Puffin Book Of Nonsense Verse The Puffin B...
1,289,The Beloved: Reflections on the Path of the Heart,"Exquisite writings on love, marriage, and the ...",9fb72b46fb5e148a8a085b359f83420c 274923c607c1d...,c3700d2dbd14b55f1f03b49d49902816 64144beaf3890...,"Of course, the various stories and poems in th...",Fri Jul 25 09:17:36 -0700 2014 Sat Aug 30 16:1...,Tue Jun 23 12:14:41 -0700 2015 Wed Sep 29 09:5...,Fri Jul 25 09:23:37 -0700 2014 Sat Apr 16 00...,Thu Jul 24 00:00:00 -0700 2014 Sat Apr 02 00...,...,Penguin Books Penguin Books Penguin Books Peng...,102 102 102 102 102,1 1 1 1 1,9780140195538 9780140195538 9780140195538 9780...,1 1 1 1 1,,1998 1998 1998 1998 1998,https://www.goodreads.com/book/show/289.The_Be...,https://s.gr-assets.com/assets/nophoto/book/11...,The Beloved: Reflections on the Path of the He...
2,290,Jesus the Son of Man,In Jesus Son of Man Jesus is portrayed through...,da3b9a5ea91c12e5cfa098588afa737d ec54bc6493644...,9c4fa7b9913339fdec91473847e241f2 d6cb5b14612b4...,n kn lkl dyn khlq ...fkhlq rsl@ `ys~ hy lmHb@ ...,Tue Nov 05 03:52:17 -0800 2013 Wed Mar 12 19:3...,Sun Jan 05 08:41:27 -0800 2014 Tue Dec 30 15:0...,Tue Mar 11 00:00:00 -0700 2014 Thu Feb 09 16:...,Thu Feb 09 00:00:00 -0800 2017 Tue Feb 11 00...,...,Knopf Knopf Knopf Knopf Knopf Knopf Knopf Knop...,216 216 216 216 216 216 216 216 216 216 216 21...,21 21 21 21 21 21 21 21 21 21 21 21 21 21,9780394431246 9780394431246 9780394431246 9780...,2 2 2 2 2 2 2 2 2 2 2 2 2 2,,1995 1995 1995 1995 1995 1995 1995 1995 1995 1...,https://www.goodreads.com/book/show/290.Jesus_...,https://images.gr-assets.com/books/1374680606m...,Jesus the Son of Man Jesus the Son of Man Jesu...
3,291,The Broken Wings,This is the exquisitely tender story of love t...,a2643d991f999dfb99752b56d2f45d53 d716956e51e09...,7b0db8717fbd37fe4ff7c214185a3271 4f2e5f58f04d2...,fy hdhh lrwy@ km hy'l mn lmsh`r lrqyq@ lSdq@ S...,Thu Oct 10 11:15:09 -0700 2013 Sun Feb 10 07:0...,Thu Dec 05 14:24:07 -0800 2013 Fri Aug 02 03:5...,Fri Dec 06 00:00:00 -0800 2013 Tue May 14 00:0...,Thu Dec 05 00:00:00 -0800 2013 Thu May 09 00:0...,...,Citadel Citadel Citadel Citadel Citadel Citade...,132 132 132 132 132 132 132 132 132 132 132 13...,3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 ...,9780806501901 9780806501901 9780806501901 9780...,3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 ...,...,2003 2003 2003 2003 2003 2003 2003 2003 2003 2...,https://www.goodreads.com/book/show/291.The_Br...,https://s.gr-assets.com/assets/nophoto/book/11...,The Broken Wings The Broken Wings The Broken W...
4,292,Sand and Foam,"A book of aphorisms, poems, and parables by th...",55ee51ab3a7247d6e736b82a3e4de91b e00e45b1c3f58...,8b33b1979cf17b352c8c2a60ab7b79a6 d11d6e45a5c93...,Hzyn wlkn@ kthr mn ry'` hn ydhkrny jbrn 'n 'sm...,Tue Aug 05 12:29:13 -0700 2014 Mon Mar 09 17:5...,Mon Aug 18 14:06:47 -0700 2014 Mon Mar 09 20:1...,Mon Aug 18 14:06:47 -0700 2014 Tue Oct 21 00...,Tue Aug 05 00:00:00 -0700 2014 Mon Dec 29 0...,...,Knopf Knopf Knopf Knopf Knopf Knopf Knopf Knop...,100 100 100 100 100 100 100 100 100 100 100 10...,14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 1...,9780679439202 9780679439202 9780679439202 9780...,6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 ...,Kahlil Gibran Pocket Library Kahlil Gibran Poc...,2011 2011 2011 2011 2011 2011 2011 2011 2011 2...,https://www.goodreads.com/book/show/292.Sand_a...,https://s.gr-assets.com/assets/nophoto/book/11...,Sand and Foam Sand and Foam Sand and Foam Sand...


In [14]:
# add extra column of only text (title, description, and review text)
temp['text_data'] = temp['title'] + temp['description'] + temp['review_text'] 

In [15]:
len(temp)

9620

## Step 3: Initialize module

### Get the system started based only on text descriptions

In [16]:
# initialize vectorizer from sklearn and remove stopwords
tf_idf = TfidfVectorizer(stop_words='english')

# generate vectors for text data
vectors = tf_idf.fit_transform(temp['text_data'])

# calculate cosine similarities (matrix)
cosine_sim_matrix = linear_kernel(vectors, vectors)

## Step 4: Decision module

### Given a book_id, we will calculate the cosine similarities and sort them to get the top recommended books

In [17]:
indices = pd.Series(temp.index, index=temp['book_id'])
def recommend_book(book_id):
    # get the index of the book we're referencing/comparing to for recommendations
    index = indices[book_id]
    
    # get pairwise cosine similarities
    cosine_sim = list(enumerate(cosine_sim_matrix[index]))
    
    # sort on cosine similarity scores (descending order)
    cosine_sim = sorted(cosine_sim, key=lambda x: x[1], reverse=True)
    
    # get the top 7 books
    cosine_sim = cosine_sim[1:8]
    top_7 = [score[0] for score in cosine_sim]
    return top_7

### Generate a random book to generate recommendations for

In [18]:
#select a random book to use and generate recommendations on 
random_book = temp.sample()
book_selected = random_book['book_id'].values[0]
print(random_book['title'].values[0])

Ascent of Mount Carmel


### Now, generate the top 7 book recommendations

In [19]:
# pass the book_id to recommend_book to get the top 7 book recommendations
top_7 = recommend_book(book_selected)
for book_index in top_7:
    print(temp.iloc[book_index]['title'])

The Collected Works of St. John of the Cross
The Poems of St John of the Cross
Love Poems from God: Twelve Sacred Voices from the East and West
Moonlight
Heavenly City Earthly City
Overpass
St. Peter's B-List: Contemporary Poems Inspired by the Saints


# Ignore this for now - work in progress for building a user profile

### instead of recommending based on content only; this would recommend based on what the user likes

In [20]:
# number of unique users
len(books_and_reviews.user_id.unique())

30085

In [21]:
# select a random user to use
user_selected = books_and_reviews.sample()['user_id'].values[0]
user_selected

'520534fe4762cd8160d019417d89daf0'

In [22]:
# pull all reviews to build a user profile
user_profile = books_and_reviews[books_and_reviews['user_id'] == user_selected]
len(user_profile)

7

In [23]:
# build text data for user profile
user_profile['text_data'] = user_profile['title'] + user_profile['description'] + user_profile['review_text'] 

#tf idf
'''df_user_profile = '''

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_profile['text_data'] = user_profile['title'] + user_profile['description'] + user_profile['review_text']


'df_user_profile = '