In [6]:
import pandas as pd
import numpy as np

# keyword extraction and clean up
import nltk
from rake_nltk import Rake
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
import re


#Text Blob for sentiment analysis
from textblob import TextBlob


#tf-idf model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer


from sklearn.metrics.pairwise import linear_kernel


import string
import random
from PIL import Image
import requests
from io import BytesIO

from langdetect import detect


In [7]:
df = pd.read_csv("../data/goodreads_final_cleaned.csv")
df.head()

Unnamed: 0,url,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,pageCount,Author,lang
0,https://www.goodreads.com/book/show/2767052-th...,The Hunger Games,https://i.gr-assets.com/images/S/compressed.ph...,"Could you survive on your own in the wild, wit...",4.32,6717635,176054,Young Adult,374,Suzanne Collins,en
1,https://www.goodreads.com/book/show/2.Harry_Po...,Harry Potter and the Order of the Phoenix,https://i.gr-assets.com/images/S/compressed.ph...,There is a door at the end of a silent corrido...,4.5,2668409,45724,Fantasy,870,J.K. Rowling,en
2,https://www.goodreads.com/book/show/2657.To_Ki...,To Kill a Mockingbird,https://i.gr-assets.com/images/S/compressed.ph...,The unforgettable novel of a childhood in a sl...,4.28,4772918,95595,Classics,324,Harper Lee,en
3,https://www.goodreads.com/book/show/1885.Pride...,Pride and Prejudice,https://i.gr-assets.com/images/S/compressed.ph...,Alternate cover edition of ISBN 9780679783268S...,4.27,3206070,74020,Classics,279,Jane Austen,en
4,https://www.goodreads.com/book/show/41865.Twil...,Twilight,https://i.gr-assets.com/images/S/compressed.ph...,About three things I was absolutely positive.F...,3.61,5231000,107619,Young Adult,501,Stephenie Meyer,en


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8863 entries, 0 to 8862
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   url          8863 non-null   object 
 1   bookTitle    8863 non-null   object 
 2   bookImage    8863 non-null   object 
 3   bookDesc     8863 non-null   object 
 4   bookRating   8863 non-null   float64
 5   ratingCount  8863 non-null   int64  
 6   reviewCount  8863 non-null   int64  
 7   Genre        8863 non-null   object 
 8   pageCount    8863 non-null   int64  
 9   Author       8863 non-null   object 
 10  lang         8863 non-null   object 
dtypes: float64(1), int64(3), object(7)
memory usage: 761.8+ KB


### key words processing

In [9]:
#data pre-processing to remove stop words, punctuation, white space, and convert all words to lower case from book Description
df_new = df.copy()
key_words = []
title_key_words = []

r = Rake()

for index, row in df_new.iterrows():
    r.extract_keywords_from_text(row['bookDesc'])
    key_words_dict_scores = r.get_word_degrees()
    keywords = list(key_words_dict_scores.keys())
    
    
    key_words.append(keywords)

    r.extract_keywords_from_text(row['bookTitle'])
    key_words_dict_scores = r.get_word_degrees()
    title_keywords = list(key_words_dict_scores.keys())
    
    title_key_words.append(title_keywords)
    
df_new["key_words"] = key_words
df_new["title_key_words"] = title_key_words

df_new.head()

Unnamed: 0,url,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,pageCount,Author,lang,key_words,title_key_words
0,https://www.goodreads.com/book/show/2767052-th...,The Hunger Games,https://i.gr-assets.com/images/S/compressed.ph...,"Could you survive on your own in the wild, wit...",4.32,6717635,176054,Young Adult,374,Suzanne Collins,en,"[could, survive, wild, every, one, make, sure,...","[hunger, games]"
1,https://www.goodreads.com/book/show/2.Harry_Po...,Harry Potter and the Order of the Phoenix,https://i.gr-assets.com/images/S/compressed.ph...,There is a door at the end of a silent corrido...,4.5,2668409,45724,Fantasy,870,J.K. Rowling,en,"[door, end, silent, corridor, ’, haunting, har...","[harry, potter, order, phoenix]"
2,https://www.goodreads.com/book/show/2657.To_Ki...,To Kill a Mockingbird,https://i.gr-assets.com/images/S/compressed.ph...,The unforgettable novel of a childhood in a sl...,4.28,4772918,95595,Classics,324,Harper Lee,en,"[unforgettable, novel, childhood, sleepy, sout...","[kill, mockingbird]"
3,https://www.goodreads.com/book/show/1885.Pride...,Pride and Prejudice,https://i.gr-assets.com/images/S/compressed.ph...,Alternate cover edition of ISBN 9780679783268S...,4.27,3206070,74020,Classics,279,Jane Austen,en,"[alternate, cover, edition, isbn, 978067978326...","[pride, prejudice]"
4,https://www.goodreads.com/book/show/41865.Twil...,Twilight,https://i.gr-assets.com/images/S/compressed.ph...,About three things I was absolutely positive.F...,3.61,5231000,107619,Young Adult,501,Stephenie Meyer,en,"[three, things, absolutely, positive, first, e...",[twilight]


In [10]:
df_new2 = df_new.copy()

In [11]:
# clean up keywords: drop symbols and numbers, as well as find root words

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer() 

def preprocess(sentence):
    sentence=str(sentence)
    sentence = sentence.lower()
    sentence=sentence.replace('{html}',"") 
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)
    rem_url=re.sub(r'http\S+', '',cleantext)
    rem_num = re.sub('[0-9]+', '', rem_url)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(rem_num)  
    filtered_words = [w for w in tokens if len(w) > 2 if not w in stopwords.words('english')]
    stem_words=[stemmer.stem(w) for w in filtered_words]
    lemma_words=[lemmatizer.lemmatize(w) for w in stem_words]
    return " ".join(filtered_words)


df_new2['clean_keywords']=df_new2['key_words'].map(lambda s:preprocess(s)) 

In [12]:
df_new2.head()

Unnamed: 0,url,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,pageCount,Author,lang,key_words,title_key_words,clean_keywords
0,https://www.goodreads.com/book/show/2767052-th...,The Hunger Games,https://i.gr-assets.com/images/S/compressed.ph...,"Could you survive on your own in the wild, wit...",4.32,6717635,176054,Young Adult,374,Suzanne Collins,en,"[could, survive, wild, every, one, make, sure,...","[hunger, games]",could survive wild every one make sure live se...
1,https://www.goodreads.com/book/show/2.Harry_Po...,Harry Potter and the Order of the Phoenix,https://i.gr-assets.com/images/S/compressed.ph...,There is a door at the end of a silent corrido...,4.5,2668409,45724,Fantasy,870,J.K. Rowling,en,"[door, end, silent, corridor, ’, haunting, har...","[harry, potter, order, phoenix]",door end silent corridor haunting harry pottte...
2,https://www.goodreads.com/book/show/2657.To_Ki...,To Kill a Mockingbird,https://i.gr-assets.com/images/S/compressed.ph...,The unforgettable novel of a childhood in a sl...,4.28,4772918,95595,Classics,324,Harper Lee,en,"[unforgettable, novel, childhood, sleepy, sout...","[kill, mockingbird]",unforgettable novel childhood sleepy southern ...
3,https://www.goodreads.com/book/show/1885.Pride...,Pride and Prejudice,https://i.gr-assets.com/images/S/compressed.ph...,Alternate cover edition of ISBN 9780679783268S...,4.27,3206070,74020,Classics,279,Jane Austen,en,"[alternate, cover, edition, isbn, 978067978326...","[pride, prejudice]",alternate cover edition isbn since immediate s...
4,https://www.goodreads.com/book/show/41865.Twil...,Twilight,https://i.gr-assets.com/images/S/compressed.ph...,About three things I was absolutely positive.F...,3.61,5231000,107619,Young Adult,501,Stephenie Meyer,en,"[three, things, absolutely, positive, first, e...",[twilight],three things absolutely positive first edward ...


In [13]:
df_new2 = df_new2.drop(columns=['key_words'])

In [14]:
df_new2.head()

Unnamed: 0,url,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,pageCount,Author,lang,title_key_words,clean_keywords
0,https://www.goodreads.com/book/show/2767052-th...,The Hunger Games,https://i.gr-assets.com/images/S/compressed.ph...,"Could you survive on your own in the wild, wit...",4.32,6717635,176054,Young Adult,374,Suzanne Collins,en,"[hunger, games]",could survive wild every one make sure live se...
1,https://www.goodreads.com/book/show/2.Harry_Po...,Harry Potter and the Order of the Phoenix,https://i.gr-assets.com/images/S/compressed.ph...,There is a door at the end of a silent corrido...,4.5,2668409,45724,Fantasy,870,J.K. Rowling,en,"[harry, potter, order, phoenix]",door end silent corridor haunting harry pottte...
2,https://www.goodreads.com/book/show/2657.To_Ki...,To Kill a Mockingbird,https://i.gr-assets.com/images/S/compressed.ph...,The unforgettable novel of a childhood in a sl...,4.28,4772918,95595,Classics,324,Harper Lee,en,"[kill, mockingbird]",unforgettable novel childhood sleepy southern ...
3,https://www.goodreads.com/book/show/1885.Pride...,Pride and Prejudice,https://i.gr-assets.com/images/S/compressed.ph...,Alternate cover edition of ISBN 9780679783268S...,4.27,3206070,74020,Classics,279,Jane Austen,en,"[pride, prejudice]",alternate cover edition isbn since immediate s...
4,https://www.goodreads.com/book/show/41865.Twil...,Twilight,https://i.gr-assets.com/images/S/compressed.ph...,About three things I was absolutely positive.F...,3.61,5231000,107619,Young Adult,501,Stephenie Meyer,en,[twilight],three things absolutely positive first edward ...


### Sentiment Analysis with Text Blob

In [15]:
# test Text Blob program
# Preparing an input sentence
sentence = '''The platform provides universal access to the world's best education, partnering with top universities and organizations to offer courses online.'''

# Creating a textblob object and assigning the sentiment property
analysis = TextBlob(sentence).sentiment
print(analysis)
print(analysis[0])
print(analysis[1])

Sentiment(polarity=0.5, subjectivity=0.26666666666666666)
0.5
0.26666666666666666


In [16]:
df_sub = df_new2.copy()

In [17]:
# run text blob in a loop

polarities = []
subjectivities = []

for index, row in df_sub.iterrows():
    sentence = row['bookDesc']
    analysis = TextBlob(sentence).sentiment
    polarity = analysis[0]
    subjectivity = analysis[1]

    # populate lists
    polarities.append(polarity)
    subjectivities.append(subjectivity)


df_sub['sentiment_polarity'] = polarities
df_sub['sentiment_subjectivity']= subjectivities

df_sub.head()

Unnamed: 0,url,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,pageCount,Author,lang,title_key_words,clean_keywords,sentiment_polarity,sentiment_subjectivity
0,https://www.goodreads.com/book/show/2767052-th...,The Hunger Games,https://i.gr-assets.com/images/S/compressed.ph...,"Could you survive on your own in the wild, wit...",4.32,6717635,176054,Young Adult,374,Suzanne Collins,en,"[hunger, games]",could survive wild every one make sure live se...,0.101623,0.484921
1,https://www.goodreads.com/book/show/2.Harry_Po...,Harry Potter and the Order of the Phoenix,https://i.gr-assets.com/images/S/compressed.ph...,There is a door at the end of a silent corrido...,4.5,2668409,45724,Fantasy,870,J.K. Rowling,en,"[harry, potter, order, phoenix]",door end silent corridor haunting harry pottte...,-0.087273,0.420909
2,https://www.goodreads.com/book/show/2657.To_Ki...,To Kill a Mockingbird,https://i.gr-assets.com/images/S/compressed.ph...,The unforgettable novel of a childhood in a sl...,4.28,4772918,95595,Classics,324,Harper Lee,en,"[kill, mockingbird]",unforgettable novel childhood sleepy southern ...,0.165686,0.368067
3,https://www.goodreads.com/book/show/1885.Pride...,Pride and Prejudice,https://i.gr-assets.com/images/S/compressed.ph...,Alternate cover edition of ISBN 9780679783268S...,4.27,3206070,74020,Classics,279,Jane Austen,en,"[pride, prejudice]",alternate cover edition isbn since immediate s...,0.475556,0.68
4,https://www.goodreads.com/book/show/41865.Twil...,Twilight,https://i.gr-assets.com/images/S/compressed.ph...,About three things I was absolutely positive.F...,3.61,5231000,107619,Young Adult,501,Stephenie Meyer,en,[twilight],three things absolutely positive first edward ...,0.24,0.62


### TF-IDF feature columns

In [18]:
df_tf = df_sub.copy()

In [19]:

from sklearn.feature_extraction.text import TfidfVectorizer


vect = TfidfVectorizer()
tfidf_matrix = vect.fit_transform(df_tf['clean_keywords'])
df_data = pd.DataFrame(tfidf_matrix.toarray(), columns = vect.get_feature_names())
df_data.head()

Unnamed: 0,_____,__________,________________________alternate,__________years,aahz,aang,aap,aaron,aarons,aarti,...,گدھ,मपद,美しさと哀しみと,ﬁerce,ﬁnd,ﬁred,ﬁrst,ﬂame,ﬂed,ﬂoors
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8863 entries, 0 to 8862
Columns: 46216 entries, _____ to ﬂoors
dtypes: float64(46216)
memory usage: 3.1 GB


In [21]:
#save dataframe to csv
#df_data.to_csv("../data/goodreads_tfidf_data.csv", index=False)

#file too large for github: 1.65 GB

In [23]:
df_knn =  pd.concat([df_tf, df_data], axis=0)
df_knn.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17726 entries, 0 to 8862
Columns: 46229 entries, url to ﬂoors
dtypes: float64(46220), object(9)
memory usage: 6.1+ GB


In [24]:
df_knn.head()

Unnamed: 0,url,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,pageCount,Author,...,گدھ,मपद,美しさと哀しみと,ﬁerce,ﬁnd,ﬁred,ﬁrst,ﬂame,ﬂed,ﬂoors
0,https://www.goodreads.com/book/show/2767052-th...,The Hunger Games,https://i.gr-assets.com/images/S/compressed.ph...,"Could you survive on your own in the wild, wit...",4.32,6717635.0,176054.0,Young Adult,374.0,Suzanne Collins,...,,,,,,,,,,
1,https://www.goodreads.com/book/show/2.Harry_Po...,Harry Potter and the Order of the Phoenix,https://i.gr-assets.com/images/S/compressed.ph...,There is a door at the end of a silent corrido...,4.5,2668409.0,45724.0,Fantasy,870.0,J.K. Rowling,...,,,,,,,,,,
2,https://www.goodreads.com/book/show/2657.To_Ki...,To Kill a Mockingbird,https://i.gr-assets.com/images/S/compressed.ph...,The unforgettable novel of a childhood in a sl...,4.28,4772918.0,95595.0,Classics,324.0,Harper Lee,...,,,,,,,,,,
3,https://www.goodreads.com/book/show/1885.Pride...,Pride and Prejudice,https://i.gr-assets.com/images/S/compressed.ph...,Alternate cover edition of ISBN 9780679783268S...,4.27,3206070.0,74020.0,Classics,279.0,Jane Austen,...,,,,,,,,,,
4,https://www.goodreads.com/book/show/41865.Twil...,Twilight,https://i.gr-assets.com/images/S/compressed.ph...,About three things I was absolutely positive.F...,3.61,5231000.0,107619.0,Young Adult,501.0,Stephenie Meyer,...,,,,,,,,,,


In [25]:
#save dataframe to csv
#df_knn.to_csv("../data/goodreads_final_knn.csv", index=False)
#file 2.06GB and too big for github

### Bag of Words DF

In [26]:
df_bog = df_sub.copy()

In [27]:
#make columns lower case

df_bog['genre_bag'] = df_bog['Genre'].str.lower()

#make columns one string
df_bog['genre_bag'] = df_bog['genre_bag'].str.replace(" ","")

#add brackets for processing in next step
df_bog['genre_bag'] = df_bog['genre_bag'].apply(lambda x: x.split(','))

df_bog.head()

Unnamed: 0,url,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,pageCount,Author,lang,title_key_words,clean_keywords,sentiment_polarity,sentiment_subjectivity,genre_bag
0,https://www.goodreads.com/book/show/2767052-th...,The Hunger Games,https://i.gr-assets.com/images/S/compressed.ph...,"Could you survive on your own in the wild, wit...",4.32,6717635,176054,Young Adult,374,Suzanne Collins,en,"[hunger, games]",could survive wild every one make sure live se...,0.101623,0.484921,[youngadult]
1,https://www.goodreads.com/book/show/2.Harry_Po...,Harry Potter and the Order of the Phoenix,https://i.gr-assets.com/images/S/compressed.ph...,There is a door at the end of a silent corrido...,4.5,2668409,45724,Fantasy,870,J.K. Rowling,en,"[harry, potter, order, phoenix]",door end silent corridor haunting harry pottte...,-0.087273,0.420909,[fantasy]
2,https://www.goodreads.com/book/show/2657.To_Ki...,To Kill a Mockingbird,https://i.gr-assets.com/images/S/compressed.ph...,The unforgettable novel of a childhood in a sl...,4.28,4772918,95595,Classics,324,Harper Lee,en,"[kill, mockingbird]",unforgettable novel childhood sleepy southern ...,0.165686,0.368067,[classics]
3,https://www.goodreads.com/book/show/1885.Pride...,Pride and Prejudice,https://i.gr-assets.com/images/S/compressed.ph...,Alternate cover edition of ISBN 9780679783268S...,4.27,3206070,74020,Classics,279,Jane Austen,en,"[pride, prejudice]",alternate cover edition isbn since immediate s...,0.475556,0.68,[classics]
4,https://www.goodreads.com/book/show/41865.Twil...,Twilight,https://i.gr-assets.com/images/S/compressed.ph...,About three things I was absolutely positive.F...,3.61,5231000,107619,Young Adult,501,Stephenie Meyer,en,[twilight],three things absolutely positive first edward ...,0.24,0.62,[youngadult]


In [28]:
df_bog.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8863 entries, 0 to 8862
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   url                     8863 non-null   object 
 1   bookTitle               8863 non-null   object 
 2   bookImage               8863 non-null   object 
 3   bookDesc                8863 non-null   object 
 4   bookRating              8863 non-null   float64
 5   ratingCount             8863 non-null   int64  
 6   reviewCount             8863 non-null   int64  
 7   Genre                   8863 non-null   object 
 8   pageCount               8863 non-null   int64  
 9   Author                  8863 non-null   object 
 10  lang                    8863 non-null   object 
 11  title_key_words         8863 non-null   object 
 12  clean_keywords          8863 non-null   object 
 13  sentiment_polarity      8863 non-null   float64
 14  sentiment_subjectivity  8863 non-null   

In [29]:
#make book description column a list of words
clean_keywords= []
for index, row in df_bog.iterrows():
    txt = row['clean_keywords']
    clean_keyword_list =  list(txt.split())
    
    clean_keywords.append(clean_keyword_list)
    
df_bog['clean_keywords'] = clean_keywords
df_bog.head()                               

Unnamed: 0,url,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,pageCount,Author,lang,title_key_words,clean_keywords,sentiment_polarity,sentiment_subjectivity,genre_bag
0,https://www.goodreads.com/book/show/2767052-th...,The Hunger Games,https://i.gr-assets.com/images/S/compressed.ph...,"Could you survive on your own in the wild, wit...",4.32,6717635,176054,Young Adult,374,Suzanne Collins,en,"[hunger, games]","[could, survive, wild, every, one, make, sure,...",0.101623,0.484921,[youngadult]
1,https://www.goodreads.com/book/show/2.Harry_Po...,Harry Potter and the Order of the Phoenix,https://i.gr-assets.com/images/S/compressed.ph...,There is a door at the end of a silent corrido...,4.5,2668409,45724,Fantasy,870,J.K. Rowling,en,"[harry, potter, order, phoenix]","[door, end, silent, corridor, haunting, harry,...",-0.087273,0.420909,[fantasy]
2,https://www.goodreads.com/book/show/2657.To_Ki...,To Kill a Mockingbird,https://i.gr-assets.com/images/S/compressed.ph...,The unforgettable novel of a childhood in a sl...,4.28,4772918,95595,Classics,324,Harper Lee,en,"[kill, mockingbird]","[unforgettable, novel, childhood, sleepy, sout...",0.165686,0.368067,[classics]
3,https://www.goodreads.com/book/show/1885.Pride...,Pride and Prejudice,https://i.gr-assets.com/images/S/compressed.ph...,Alternate cover edition of ISBN 9780679783268S...,4.27,3206070,74020,Classics,279,Jane Austen,en,"[pride, prejudice]","[alternate, cover, edition, isbn, since, immed...",0.475556,0.68,[classics]
4,https://www.goodreads.com/book/show/41865.Twil...,Twilight,https://i.gr-assets.com/images/S/compressed.ph...,About three things I was absolutely positive.F...,3.61,5231000,107619,Young Adult,501,Stephenie Meyer,en,[twilight],"[three, things, absolutely, positive, first, e...",0.24,0.62,[youngadult]


In [30]:
#join columns to make bag
#create a bag of words columns with processed columns
df_bog2 = df_bog.copy()

df_bog2['bag_of_words'] = ''

columns = ['genre_bag', 'title_key_words', 'clean_keywords']

bagWords = []

for index, row in df_bog2.iterrows():
    words = ''
    for col in columns:
        words += ' '.join(row[col]) + ' '
    bagWords.append(words) 
    
df_bog2['bag_of_words'] = bagWords
df_bog2.head()

Unnamed: 0,url,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,pageCount,Author,lang,title_key_words,clean_keywords,sentiment_polarity,sentiment_subjectivity,genre_bag,bag_of_words
0,https://www.goodreads.com/book/show/2767052-th...,The Hunger Games,https://i.gr-assets.com/images/S/compressed.ph...,"Could you survive on your own in the wild, wit...",4.32,6717635,176054,Young Adult,374,Suzanne Collins,en,"[hunger, games]","[could, survive, wild, every, one, make, sure,...",0.101623,0.484921,[youngadult],youngadult hunger games could survive wild eve...
1,https://www.goodreads.com/book/show/2.Harry_Po...,Harry Potter and the Order of the Phoenix,https://i.gr-assets.com/images/S/compressed.ph...,There is a door at the end of a silent corrido...,4.5,2668409,45724,Fantasy,870,J.K. Rowling,en,"[harry, potter, order, phoenix]","[door, end, silent, corridor, haunting, harry,...",-0.087273,0.420909,[fantasy],fantasy harry potter order phoenix door end si...
2,https://www.goodreads.com/book/show/2657.To_Ki...,To Kill a Mockingbird,https://i.gr-assets.com/images/S/compressed.ph...,The unforgettable novel of a childhood in a sl...,4.28,4772918,95595,Classics,324,Harper Lee,en,"[kill, mockingbird]","[unforgettable, novel, childhood, sleepy, sout...",0.165686,0.368067,[classics],classics kill mockingbird unforgettable novel ...
3,https://www.goodreads.com/book/show/1885.Pride...,Pride and Prejudice,https://i.gr-assets.com/images/S/compressed.ph...,Alternate cover edition of ISBN 9780679783268S...,4.27,3206070,74020,Classics,279,Jane Austen,en,"[pride, prejudice]","[alternate, cover, edition, isbn, since, immed...",0.475556,0.68,[classics],classics pride prejudice alternate cover editi...
4,https://www.goodreads.com/book/show/41865.Twil...,Twilight,https://i.gr-assets.com/images/S/compressed.ph...,About three things I was absolutely positive.F...,3.61,5231000,107619,Young Adult,501,Stephenie Meyer,en,[twilight],"[three, things, absolutely, positive, first, e...",0.24,0.62,[youngadult],youngadult twilight three things absolutely po...


In [31]:
#save dataframe to csv
df_bog2.to_csv("../data/goodreads_final_bagowords.csv", index=False)