In [30]:
import pandas as pd
import numpy as np
import pymongo
import re
import json
import pprint
import warnings

In [31]:
#Connect to mongodb

myclient = pymongo.MongoClient('mongodb://localhost:27017/')
mydb = myclient['goodreads_books']
mycol = mydb['book data']

In [32]:
#Pull data from database

myquery = {'language_code': 'eng'}

In [33]:
#Execute query

cursor = mycol.find(myquery)

In [34]:
list_cur = list(cursor[0:50000])

In [35]:
df = pd.DataFrame(list_cur)

In [36]:
df.head()

Unnamed: 0,_id,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,...,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series
0,623e55ba01c96d282f264672,,7,[189911],US,eng,"[{'count': '58', 'name': 'to-read'}, {'count':...",B00071IKUY,False,4.03,...,,Book Club Edition,1987.0,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,7327624,140,8948723,"The Unschooled Wizard (Sun Wolf and Starhawk, ...","The Unschooled Wizard (Sun Wolf and Starhawk, ..."
1,623e55ba01c96d282f264673,0743294297,3282,[],US,eng,"[{'count': '7615', 'name': 'to-read'}, {'count...",,False,3.49,...,7.0,,2009.0,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,6066819,51184,6243154,Best Friends Forever,Best Friends Forever
2,623e55ba01c96d282f26467b,,60,[1052227],US,eng,"[{'count': '54', 'name': 'currently-reading'},...",B01NCIKAQX,True,4.33,...,,,,https://www.goodreads.com/book/show/33394837-t...,https://images.gr-assets.com/books/1493114742m...,33394837,269,54143148,The House of Memory (Pluto's Snitch #2),The House of Memory (Pluto's Snitch #2)
3,623e55ba01c96d282f26467e,555118000X,19,[],US,eng,"[{'count': '3488', 'name': 'to-read'}, {'count...",,False,3.82,...,,,,https://www.goodreads.com/book/show/89373.The_...,https://s.gr-assets.com/assets/nophoto/book/11...,89373,77,1080201,The Bonfire of the Vanities,The Bonfire of the Vanities
4,623e55ba01c96d282f264680,0842379428,566,[],US,eng,"[{'count': '6393', 'name': 'to-read'}, {'count...",,False,4.26,...,,,,https://www.goodreads.com/book/show/89376.Heaven,https://images.gr-assets.com/books/1406508230m...,89376,7345,86257,Heaven,Heaven


In [37]:
#Subset df

df = df[['title', 'description', 'average_rating', 'num_pages']]

In [38]:
df['num_pages'] = pd.to_numeric(df['num_pages'])

In [39]:
#Remove empty strings and null values

df.replace('', np.nan, inplace=True)
df.dropna(inplace=True)
df.drop_duplicates(keep="last")

Unnamed: 0,title,description,average_rating,num_pages
0,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",Omnibus book club edition containing the Ladie...,4.03,600.0
1,Best Friends Forever,Addie Downs and Valerie Adler were eight when ...,3.49,368.0
4,Heaven,What is Heaven really going to be like? What w...,4.26,533.0
5,Dog Heaven,In Newbery Medalist Cynthia Rylant's classic b...,4.43,40.0
8,Untold Secrets: Fire & Ice,Arrianna Williams is an ordinary 25 yr. old wo...,0.00,168.0
...,...,...,...,...
49990,"Eros (Olympiana, #1)","William Shakespeare once wrote that, ""Cupid is...",3.49,332.0
49991,Fire Play,Fire demiurge Lau had it made until the Mother...,3.39,190.0
49996,Geisha,"In this classic best-seller, Liza Dalby, the o...",3.99,367.0
49997,The Book of Nod,Legends of the Undead\nThe first vampires reme...,3.86,134.0


In [40]:
df['num_pages'] = df['num_pages']>=200

In [41]:
df.shape

(32719, 4)

Tokenization

In [42]:
import nltk

stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    # lower case and remove special characters\whitespaces AND remove null fields
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()

    # tokenize document
    tokens = nltk.word_tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

norm_corpus = normalize_corpus(list(df['description']))
len(norm_corpus)

32719

Preprocessing

In [43]:
#Extract TF-IDF Features

from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(ngram_range=(1, 2), min_df=2)
tfidf_matrix = tf.fit_transform(norm_corpus)
tfidf_matrix.shape

(32719, 317068)

In [44]:
#Pairwise Similarity

from sklearn.metrics.pairwise import cosine_similarity

doc_sim = cosine_similarity(tfidf_matrix)
doc_sim_df = pd.DataFrame(doc_sim)
doc_sim_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32709,32710,32711,32712,32713,32714,32715,32716,32717,32718
0,1.0,0.0,0.005981,0.00437,0.014802,0.0,0.005056,0.0,0.00692,0.0,...,0.008026,0.0,0.0,0.013299,0.0,0.0,0.0,0.0,0.026608,0.0
1,0.0,1.0,0.002182,0.003718,0.005749,0.0,0.00668,0.009121,0.005101,0.0,...,0.0,0.004916,0.004498,0.015013,0.014073,0.003637,0.008619,0.0,0.00291,0.006097
2,0.005981,0.002182,1.0,0.085938,0.022028,0.0,0.049752,0.006239,0.002752,0.0,...,0.00546,0.003548,0.015108,0.01696,0.002255,0.018128,0.013087,0.010381,0.005844,0.007399
3,0.00437,0.003718,0.085938,1.0,0.009635,0.0,0.026515,0.011033,0.007503,0.0,...,0.002332,0.006754,0.0,0.005366,0.011157,0.011132,0.001596,0.049253,0.004967,0.010517
4,0.014802,0.005749,0.022028,0.009635,1.0,0.0,0.013786,0.005105,0.009344,0.0,...,0.0079,0.008748,0.003867,0.019142,0.0,0.017924,0.021673,0.002541,0.010339,0.007452


In [45]:
#Extract book titles

book_list = df['title'].values
book_list, book_list.shape

(array(['The Unschooled Wizard (Sun Wolf and Starhawk, #1-2)',
        'Best Friends Forever', 'Heaven', ..., 'Geisha', 'The Book of Nod',
        'Claimed Beauty (The Cubi, #2)'], dtype=object),
 (32719,))

In [46]:
#Top 5 Most Similar Titles

#Find ID

book_idx = np.where(book_list == 'Untold Secrets: Fire & Ice')[0][0]
book_idx

4

In [47]:
book_similarities = doc_sim_df.iloc[book_idx].values
book_similarities

array([0.01480158, 0.00574926, 0.0220278 , ..., 0.00254134, 0.01033887,
       0.0074517 ])

In [48]:
similar_book_idxs = np.argsort(-book_similarities)[1:6]
similar_book_idxs

array([ 9399, 26301, 19383, 14348, 23668], dtype=int64)

In [49]:
similar_books = book_list[similar_book_idxs]
similar_books

array(['City of Secrets (Saskia Trilogy, #2)', 'Hermit: A Novella',
       'Bublos', 'Her Fallen Angel (Her Angel, #2)',
       'The Wind-up Forest  (Archangel Chronicles #4)'], dtype=object)

In [50]:
def book_recommender(book_title, books=book_list, doc_sims=doc_sim_df):
    # find book id
    book_idx = np.where(books == book_title)[0][0]
    # get book similarities
    book_similarities = doc_sims.iloc[book_idx].values
    # get top 5 similar book IDs
    similar_book_idxs = np.argsort(-book_similarities)[1:6]
    # get top 5 books
    similar_books = books[similar_book_idxs]
    # return the top 5 books
    return similar_books

In [51]:
#Most Popular

pop_books = df.sort_values(by='average_rating', ascending=False)
pop_books.head()

Unnamed: 0,title,description,average_rating,num_pages
18099,The Silent Soldiers: Back To School (The Silen...,"Luke continues to follow his destiny, and fulf...",5.0,True
2275,The Successful Author's Handbook,The Successful Author's Handbook is a collecti...,5.0,False
39594,Chickenhood,poetry chapbook,5.0,False
30526,The Next Victim,"After her mother's brutal murder, Elle finds h...",5.0,True
30530,"The 'Fishy' Adventures of Gus, Toby and TooToo...","Hi, it's me, Gus, again! Your rainbow trout fr...",5.0,False


In [52]:
popular_books = pop_books.title[:10]

In [53]:
for book in popular_books:
    print('Book Title:', book)
    print('Top 5 recommended Books:\n', book_recommender(book_title=book))
    print()

Book Title: The Silent Soldiers: Back To School (The Silent Soldiers, #2)
Top 5 recommended Books:
 ['Desert Chimera (Stone Quest, #1)' 'Anton and the Battle' 'Hold'
 'Staying For Good' 'Homecoming Ranch (Pine River, #1)']

Book Title: The Successful Author's Handbook
Top 5 recommended Books:
 ['Public Speaking for Authors, Creatives and Other Introverts'
 'Public Speaking Boot Camp: A practical guide to delivering talks with substance, clarity & confidence'
 "Is Self-Publishing For  You?: A Quick Guide For Navigating Today's Book Publishing Maze"
 'Finding Author Success'
 'Do You Talk Funny? 7 Comedy Habits to Become a Better (and Funnier) Public Speaker']

Book Title: Chickenhood
Top 5 recommended Books:
 ['Navigating the Old Road' 'Aphasia' 'The Writing'
 'On the Kitchen Table From Which Everything Has Been Hastily Removed'
 'Correspondence in D Minor']

Book Title: The Next Victim
Top 5 recommended Books:
 ['What Came Before'
 "The Billionaire's Unlikely Bride (The Billionaire's B