In [1]:
import pandas as pd
import numpy as np

# keyword extraction and clean up
import nltk
from rake_nltk import Rake
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
import re


#Text Blob for sentiment analysis
from textblob import TextBlob


#tf-idf model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer


from sklearn.metrics.pairwise import linear_kernel


import string
import random
from PIL import Image
import requests
from io import BytesIO

from langdetect import detect


In [2]:
# Load the Goodreads book table from the relative data directory.
df = pd.read_csv("../data/goodreads_final_seven.csv")
# Preview the first rows.
df.head()

Unnamed: 0,url,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,pageCount,Author,lang
0,https://www.goodreads.com/book/show/987587.The...,The Crippled Lamb,https://i.gr-assets.com/images/S/compressed.ph...,"Because Joshua the Lamb was different, he ofte...",4.46,6670,224,Other,32,Max Lucado,en
1,https://www.goodreads.com/book/show/73931.Hood,Hood,https://i.gr-assets.com/images/S/compressed.ph...,"Robin HoodThe Legend Begins AnewFor centuries,...",3.88,13570,1389,Fantasy,490,Stephen R. Lawhead,en
2,https://www.goodreads.com/book/show/10065.Ways...,Wayside School Is Falling Down,https://i.gr-assets.com/images/S/compressed.ph...,Louis yard teacher starts off 30 tales of unus...,4.15,58464,1059,Childrens,152,Louis Sachar,en
3,https://www.goodreads.com/book/show/26114523-t...,The Summer that Melted Everything,https://i.gr-assets.com/images/S/compressed.ph...,Fielding Bliss has never forgotten the summer ...,3.97,7209,1581,Fiction,310,Tiffany McDaniel,en
4,https://www.goodreads.com/book/show/7736182-th...,The Lost Hero,https://i.gr-assets.com/images/S/compressed.ph...,JASON HAS A PROBLEM. He doesn’t remember anyth...,4.32,670659,23261,Fantasy,553,Rick Riordan,en


In [3]:
# Show column dtypes and non-null counts for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   url          5000 non-null   object 
 1   bookTitle    5000 non-null   object 
 2   bookImage    5000 non-null   object 
 3   bookDesc     5000 non-null   object 
 4   bookRating   5000 non-null   float64
 5   ratingCount  5000 non-null   int64  
 6   reviewCount  5000 non-null   int64  
 7   Genre        5000 non-null   object 
 8   pageCount    5000 non-null   int64  
 9   Author       5000 non-null   object 
 10  lang         5000 non-null   object 
dtypes: float64(1), int64(3), object(7)
memory usage: 429.8+ KB


### Keyword processing

In [4]:
# Keyword extraction with RAKE: for every book, collect the words RAKE scored
# (the keys of its word-degree table) from the description and from the title.
df_new = df.copy()

r = Rake()


def _rake_words(text):
    """Run RAKE on `text` and return the keys of its word-degree table as a list."""
    r.extract_keywords_from_text(text)
    return list(r.get_word_degrees().keys())


key_words = [_rake_words(description) for description in df_new['bookDesc']]
title_key_words = [_rake_words(title) for title in df_new['bookTitle']]

df_new["key_words"] = key_words
df_new["title_key_words"] = title_key_words

df_new.head()

Unnamed: 0,url,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,pageCount,Author,lang,key_words,title_key_words
0,https://www.goodreads.com/book/show/987587.The...,The Crippled Lamb,https://i.gr-assets.com/images/S/compressed.ph...,"Because Joshua the Lamb was different, he ofte...",4.46,6670,224,Other,32,Max Lucado,en,"[joshua, lamb, different, often, felt, left, l...","[crippled, lamb]"
1,https://www.goodreads.com/book/show/73931.Hood,Hood,https://i.gr-assets.com/images/S/compressed.ph...,"Robin HoodThe Legend Begins AnewFor centuries,...",3.88,13570,1389,Fantasy,490,Stephen R. Lawhead,en,"[robin, hoodthe, legend, begins, anewfor, cent...",[hood]
2,https://www.goodreads.com/book/show/10065.Ways...,Wayside School Is Falling Down,https://i.gr-assets.com/images/S/compressed.ph...,Louis yard teacher starts off 30 tales of unus...,4.15,58464,1059,Childrens,152,Louis Sachar,en,"[louis, yard, teacher, starts, 30, tales, unus...","[wayside, school, falling]"
3,https://www.goodreads.com/book/show/26114523-t...,The Summer that Melted Everything,https://i.gr-assets.com/images/S/compressed.ph...,Fielding Bliss has never forgotten the summer ...,3.97,7209,1581,Fiction,310,Tiffany McDaniel,en,"[fielding, bliss, never, forgotten, summer, 19...","[summer, melted, everything]"
4,https://www.goodreads.com/book/show/7736182-th...,The Lost Hero,https://i.gr-assets.com/images/S/compressed.ph...,JASON HAS A PROBLEM. He doesn’t remember anyth...,4.32,670659,23261,Fantasy,553,Rick Riordan,en,"[jason, problem, ’, remember, anything, waking...","[lost, hero]"


In [5]:
# Work on a copy so later column changes to df_new2 leave df_new untouched.
df_new2 = df_new.copy()

In [6]:
# clean up keywords: drop symbols and numbers
# NOTE(review): the stated intent was also to reduce words to their roots, but
# preprocess() returns the filtered tokens, so clean_keywords is neither
# stemmed nor lemmatized.

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer() 

# Filled on first use so the NLTK stop-word corpus is read at most once per kernel.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them on first use."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess(sentence, stop_words=None):
    """Normalize `sentence` into a space-separated string of cleaned tokens.

    Steps, in order: cast to ``str`` and lower-case; remove the literal
    ``{html}``; strip ``<...>`` tags; strip ``http...`` URLs; strip digits;
    split into ``\\w+`` tokens; drop tokens of length <= 2 and stop words.

    Args:
        sentence: Any object; it is converted with ``str()`` first, so a list
            of keywords is processed through its printed representation.
        stop_words: Optional collection of words to exclude. Defaults to the
            NLTK English stop-word list (loaded once and cached).

    Returns:
        The surviving tokens joined by single spaces ("" when none survive).

    Note:
        Tokens are returned as-is (no stemming or lemmatization). The stemmed
        and lemmatized lists that used to be computed here were never
        returned, so that dead work has been removed without changing output.
    """
    if stop_words is None:
        stop_words = _english_stopwords()

    text = str(sentence).lower()
    text = text.replace('{html}', "")
    text = re.sub('<.*?>', '', text)       # HTML-like tags
    text = re.sub(r'http\S+', '', text)    # URLs
    text = re.sub('[0-9]+', '', text)      # digits

    # Same tokens as RegexpTokenizer(r'\w+').tokenize(text), which delegates to
    # the compiled pattern's findall(), without building a tokenizer per call.
    tokens = re.findall(r'\w+', text)

    kept = [w for w in tokens if len(w) > 2 and w not in stop_words]
    return " ".join(kept)


# Clean every keyword list into one space-separated string per book.
df_new2['clean_keywords'] = df_new2['key_words'].map(preprocess)

In [7]:
# Preview the frame with the new clean_keywords column.
df_new2.head()

Unnamed: 0,url,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,pageCount,Author,lang,key_words,title_key_words,clean_keywords
0,https://www.goodreads.com/book/show/987587.The...,The Crippled Lamb,https://i.gr-assets.com/images/S/compressed.ph...,"Because Joshua the Lamb was different, he ofte...",4.46,6670,224,Other,32,Max Lucado,en,"[joshua, lamb, different, often, felt, left, l...","[crippled, lamb]",joshua lamb different often felt left like win...
1,https://www.goodreads.com/book/show/73931.Hood,Hood,https://i.gr-assets.com/images/S/compressed.ph...,"Robin HoodThe Legend Begins AnewFor centuries,...",3.88,13570,1389,Fantasy,490,Stephen R. Lawhead,en,"[robin, hoodthe, legend, begins, anewfor, cent...",[hood],robin hoodthe legend begins anewfor centuries ...
2,https://www.goodreads.com/book/show/10065.Ways...,Wayside School Is Falling Down,https://i.gr-assets.com/images/S/compressed.ph...,Louis yard teacher starts off 30 tales of unus...,4.15,58464,1059,Childrens,152,Louis Sachar,en,"[louis, yard, teacher, starts, 30, tales, unus...","[wayside, school, falling]",louis yard teacher starts tales unusual studen...
3,https://www.goodreads.com/book/show/26114523-t...,The Summer that Melted Everything,https://i.gr-assets.com/images/S/compressed.ph...,Fielding Bliss has never forgotten the summer ...,3.97,7209,1581,Fiction,310,Tiffany McDaniel,en,"[fielding, bliss, never, forgotten, summer, 19...","[summer, melted, everything]",fielding bliss never forgotten summer year hea...
4,https://www.goodreads.com/book/show/7736182-th...,The Lost Hero,https://i.gr-assets.com/images/S/compressed.ph...,JASON HAS A PROBLEM. He doesn’t remember anyth...,4.32,670659,23261,Fantasy,553,Rick Riordan,en,"[jason, problem, ’, remember, anything, waking...","[lost, hero]",jason problem remember anything waking bus ful...


In [8]:
# The raw RAKE lists are superseded by clean_keywords.
# errors='ignore' keeps this self-reassigning cell safe to re-run: a second run
# would otherwise raise KeyError because key_words has already been dropped.
df_new2 = df_new2.drop(columns=['key_words'], errors='ignore')

In [9]:
# Preview the frame after dropping key_words.
df_new2.head()

Unnamed: 0,url,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,pageCount,Author,lang,title_key_words,clean_keywords
0,https://www.goodreads.com/book/show/987587.The...,The Crippled Lamb,https://i.gr-assets.com/images/S/compressed.ph...,"Because Joshua the Lamb was different, he ofte...",4.46,6670,224,Other,32,Max Lucado,en,"[crippled, lamb]",joshua lamb different often felt left like win...
1,https://www.goodreads.com/book/show/73931.Hood,Hood,https://i.gr-assets.com/images/S/compressed.ph...,"Robin HoodThe Legend Begins AnewFor centuries,...",3.88,13570,1389,Fantasy,490,Stephen R. Lawhead,en,[hood],robin hoodthe legend begins anewfor centuries ...
2,https://www.goodreads.com/book/show/10065.Ways...,Wayside School Is Falling Down,https://i.gr-assets.com/images/S/compressed.ph...,Louis yard teacher starts off 30 tales of unus...,4.15,58464,1059,Childrens,152,Louis Sachar,en,"[wayside, school, falling]",louis yard teacher starts tales unusual studen...
3,https://www.goodreads.com/book/show/26114523-t...,The Summer that Melted Everything,https://i.gr-assets.com/images/S/compressed.ph...,Fielding Bliss has never forgotten the summer ...,3.97,7209,1581,Fiction,310,Tiffany McDaniel,en,"[summer, melted, everything]",fielding bliss never forgotten summer year hea...
4,https://www.goodreads.com/book/show/7736182-th...,The Lost Hero,https://i.gr-assets.com/images/S/compressed.ph...,JASON HAS A PROBLEM. He doesn’t remember anyth...,4.32,670659,23261,Fantasy,553,Rick Riordan,en,"[lost, hero]",jason problem remember anything waking bus ful...


### Sentiment Analysis with Text Blob

In [10]:
# Smoke-test TextBlob on a single sample sentence.
sentence = '''The platform provides universal access to the world's best education, partnering with top universities and organizations to offer courses online.'''

# The sentiment result is indexable: [0] is polarity, [1] is subjectivity.
analysis = TextBlob(sentence).sentiment
for value in (analysis, analysis[0], analysis[1]):
    print(value)

Sentiment(polarity=0.5, subjectivity=0.26666666666666666)
0.5
0.26666666666666666


In [11]:
# Separate copy that will receive the sentiment columns.
df_sub = df_new2.copy()

In [12]:
# Score every book description with TextBlob and store both sentiment parts.
sentiments = [TextBlob(description).sentiment for description in df_sub['bookDesc']]

# Each sentiment is indexable: [0] is polarity, [1] is subjectivity.
polarities = [sentiment[0] for sentiment in sentiments]
subjectivities = [sentiment[1] for sentiment in sentiments]

df_sub['sentiment_polarity'] = polarities
df_sub['sentiment_subjectivity'] = subjectivities

df_sub.head()

Unnamed: 0,url,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,pageCount,Author,lang,title_key_words,clean_keywords,sentiment_polarity,sentiment_subjectivity
0,https://www.goodreads.com/book/show/987587.The...,The Crippled Lamb,https://i.gr-assets.com/images/S/compressed.ph...,"Because Joshua the Lamb was different, he ofte...",4.46,6670,224,Other,32,Max Lucado,en,"[crippled, lamb]",joshua lamb different often felt left like win...,0.067857,0.485714
1,https://www.goodreads.com/book/show/73931.Hood,Hood,https://i.gr-assets.com/images/S/compressed.ph...,"Robin HoodThe Legend Begins AnewFor centuries,...",3.88,13570,1389,Fantasy,490,Stephen R. Lawhead,en,[hood],robin hoodthe legend begins anewfor centuries ...,0.142803,0.427121
2,https://www.goodreads.com/book/show/10065.Ways...,Wayside School Is Falling Down,https://i.gr-assets.com/images/S/compressed.ph...,Louis yard teacher starts off 30 tales of unus...,4.15,58464,1059,Childrens,152,Louis Sachar,en,"[wayside, school, falling]",louis yard teacher starts tales unusual studen...,0.29375,0.4625
3,https://www.goodreads.com/book/show/26114523-t...,The Summer that Melted Everything,https://i.gr-assets.com/images/S/compressed.ph...,Fielding Bliss has never forgotten the summer ...,3.97,7209,1581,Fiction,310,Tiffany McDaniel,en,"[summer, melted, everything]",fielding bliss never forgotten summer year hea...,0.028214,0.514821
4,https://www.goodreads.com/book/show/7736182-th...,The Lost Hero,https://i.gr-assets.com/images/S/compressed.ph...,JASON HAS A PROBLEM. He doesn’t remember anyth...,4.32,670659,23261,Fantasy,553,Rick Riordan,en,"[lost, hero]",jason problem remember anything waking bus ful...,0.016883,0.41961


In [13]:
# Save the frame with sentiment columns to CSV (row index omitted).
df_sub.to_csv("../data/goodreads_sentiment.csv", index=False)

### TF-IDF feature columns

In [14]:
#df_tf = df_sub.copy()

In [15]:

#from sklearn.feature_extraction.text import TfidfVectorizer


#vect = TfidfVectorizer()
#tfidf_matrix = vect.fit_transform(df_tf['clean_keywords'])
#df_data = pd.DataFrame(tfidf_matrix.toarray(), columns = vect.get_feature_names())
#df_data.head()

In [16]:
#df_data.max()

In [17]:
#df_data.info()

In [18]:
#save dataframe to csv
#df_data.to_csv("../data/goodreads_tfidf_data.csv", index=False)

#file too large for github: 1.65 GB

In [19]:
#df_data.to_parquet("../data/goodreads_tfidf_data.parquet", index=False, compression=None, engine="fastparquet")

In [20]:
#df_knn =  pd.concat([df_tf, df_data], axis=0)
#df_knn.info()

In [21]:
#df_knn.head()

In [22]:
#save dataframe to csv
#df_knn.to_csv("../data/goodreads_final_knn.csv", index=False)
#file 2.06GB and too big for github

### Bag of Words DF

In [23]:
# Separate copy used to build the bag-of-words columns.
df_bog = df_sub.copy()

In [24]:
# Build a per-book list of genre tokens from the Genre string:
#   1. lower-case it,
#   2. remove spaces so a multi-word genre stays a single token,
#   3. split on commas into a list.
df_bog['genre_bag'] = (
    df_bog['Genre']
    .str.lower()
    # regex=False: the space is a literal; this pins the behavior regardless of
    # the pandas version's default for `regex`.
    .str.replace(" ", "", regex=False)
    .str.split(',')
)

df_bog.head()

Unnamed: 0,url,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,pageCount,Author,lang,title_key_words,clean_keywords,sentiment_polarity,sentiment_subjectivity,genre_bag
0,https://www.goodreads.com/book/show/987587.The...,The Crippled Lamb,https://i.gr-assets.com/images/S/compressed.ph...,"Because Joshua the Lamb was different, he ofte...",4.46,6670,224,Other,32,Max Lucado,en,"[crippled, lamb]",joshua lamb different often felt left like win...,0.067857,0.485714,[other]
1,https://www.goodreads.com/book/show/73931.Hood,Hood,https://i.gr-assets.com/images/S/compressed.ph...,"Robin HoodThe Legend Begins AnewFor centuries,...",3.88,13570,1389,Fantasy,490,Stephen R. Lawhead,en,[hood],robin hoodthe legend begins anewfor centuries ...,0.142803,0.427121,[fantasy]
2,https://www.goodreads.com/book/show/10065.Ways...,Wayside School Is Falling Down,https://i.gr-assets.com/images/S/compressed.ph...,Louis yard teacher starts off 30 tales of unus...,4.15,58464,1059,Childrens,152,Louis Sachar,en,"[wayside, school, falling]",louis yard teacher starts tales unusual studen...,0.29375,0.4625,[childrens]
3,https://www.goodreads.com/book/show/26114523-t...,The Summer that Melted Everything,https://i.gr-assets.com/images/S/compressed.ph...,Fielding Bliss has never forgotten the summer ...,3.97,7209,1581,Fiction,310,Tiffany McDaniel,en,"[summer, melted, everything]",fielding bliss never forgotten summer year hea...,0.028214,0.514821,[fiction]
4,https://www.goodreads.com/book/show/7736182-th...,The Lost Hero,https://i.gr-assets.com/images/S/compressed.ph...,JASON HAS A PROBLEM. He doesn’t remember anyth...,4.32,670659,23261,Fantasy,553,Rick Riordan,en,"[lost, hero]",jason problem remember anything waking bus ful...,0.016883,0.41961,[fantasy]


In [25]:
# Check dtypes and non-null counts after adding genre_bag.
df_bog.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   url                     5000 non-null   object 
 1   bookTitle               5000 non-null   object 
 2   bookImage               5000 non-null   object 
 3   bookDesc                5000 non-null   object 
 4   bookRating              5000 non-null   float64
 5   ratingCount             5000 non-null   int64  
 6   reviewCount             5000 non-null   int64  
 7   Genre                   5000 non-null   object 
 8   pageCount               5000 non-null   int64  
 9   Author                  5000 non-null   object 
 10  lang                    5000 non-null   object 
 11  title_key_words         5000 non-null   object 
 12  clean_keywords          5000 non-null   object 
 13  sentiment_polarity      5000 non-null   float64
 14  sentiment_subjectivity  5000 non-null   

In [26]:
# Turn each space-separated clean_keywords string into a list of words.
# The column is overwritten in place, so on a re-run of this cell the values are
# already lists; those are passed through instead of failing on `.split()`.
clean_keywords = [
    txt.split() if isinstance(txt, str) else list(txt)
    for txt in df_bog['clean_keywords']
]

df_bog['clean_keywords'] = clean_keywords
df_bog.head()

Unnamed: 0,url,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,pageCount,Author,lang,title_key_words,clean_keywords,sentiment_polarity,sentiment_subjectivity,genre_bag
0,https://www.goodreads.com/book/show/987587.The...,The Crippled Lamb,https://i.gr-assets.com/images/S/compressed.ph...,"Because Joshua the Lamb was different, he ofte...",4.46,6670,224,Other,32,Max Lucado,en,"[crippled, lamb]","[joshua, lamb, different, often, felt, left, l...",0.067857,0.485714,[other]
1,https://www.goodreads.com/book/show/73931.Hood,Hood,https://i.gr-assets.com/images/S/compressed.ph...,"Robin HoodThe Legend Begins AnewFor centuries,...",3.88,13570,1389,Fantasy,490,Stephen R. Lawhead,en,[hood],"[robin, hoodthe, legend, begins, anewfor, cent...",0.142803,0.427121,[fantasy]
2,https://www.goodreads.com/book/show/10065.Ways...,Wayside School Is Falling Down,https://i.gr-assets.com/images/S/compressed.ph...,Louis yard teacher starts off 30 tales of unus...,4.15,58464,1059,Childrens,152,Louis Sachar,en,"[wayside, school, falling]","[louis, yard, teacher, starts, tales, unusual,...",0.29375,0.4625,[childrens]
3,https://www.goodreads.com/book/show/26114523-t...,The Summer that Melted Everything,https://i.gr-assets.com/images/S/compressed.ph...,Fielding Bliss has never forgotten the summer ...,3.97,7209,1581,Fiction,310,Tiffany McDaniel,en,"[summer, melted, everything]","[fielding, bliss, never, forgotten, summer, ye...",0.028214,0.514821,[fiction]
4,https://www.goodreads.com/book/show/7736182-th...,The Lost Hero,https://i.gr-assets.com/images/S/compressed.ph...,JASON HAS A PROBLEM. He doesn’t remember anyth...,4.32,670659,23261,Fantasy,553,Rick Riordan,en,"[lost, hero]","[jason, problem, remember, anything, waking, b...",0.016883,0.41961,[fantasy]


In [27]:
# Build the bag_of_words column: for each book, concatenate the tokens of the
# processed list columns, in the order given by `columns`.
df_bog2 = df_bog.copy()

columns = ['genre_bag', 'title_key_words', 'clean_keywords']

# Every column's tokens are space-joined and followed by one space, so each
# resulting string ends with a trailing space.
bagWords = [
    ''.join(' '.join(tokens) + ' ' for tokens in token_lists)
    for token_lists in zip(*(df_bog2[col] for col in columns))
]

df_bog2['bag_of_words'] = bagWords
df_bog2.head()

Unnamed: 0,url,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,pageCount,Author,lang,title_key_words,clean_keywords,sentiment_polarity,sentiment_subjectivity,genre_bag,bag_of_words
0,https://www.goodreads.com/book/show/987587.The...,The Crippled Lamb,https://i.gr-assets.com/images/S/compressed.ph...,"Because Joshua the Lamb was different, he ofte...",4.46,6670,224,Other,32,Max Lucado,en,"[crippled, lamb]","[joshua, lamb, different, often, felt, left, l...",0.067857,0.485714,[other],other crippled lamb joshua lamb different ofte...
1,https://www.goodreads.com/book/show/73931.Hood,Hood,https://i.gr-assets.com/images/S/compressed.ph...,"Robin HoodThe Legend Begins AnewFor centuries,...",3.88,13570,1389,Fantasy,490,Stephen R. Lawhead,en,[hood],"[robin, hoodthe, legend, begins, anewfor, cent...",0.142803,0.427121,[fantasy],fantasy hood robin hoodthe legend begins anewf...
2,https://www.goodreads.com/book/show/10065.Ways...,Wayside School Is Falling Down,https://i.gr-assets.com/images/S/compressed.ph...,Louis yard teacher starts off 30 tales of unus...,4.15,58464,1059,Childrens,152,Louis Sachar,en,"[wayside, school, falling]","[louis, yard, teacher, starts, tales, unusual,...",0.29375,0.4625,[childrens],childrens wayside school falling louis yard te...
3,https://www.goodreads.com/book/show/26114523-t...,The Summer that Melted Everything,https://i.gr-assets.com/images/S/compressed.ph...,Fielding Bliss has never forgotten the summer ...,3.97,7209,1581,Fiction,310,Tiffany McDaniel,en,"[summer, melted, everything]","[fielding, bliss, never, forgotten, summer, ye...",0.028214,0.514821,[fiction],fiction summer melted everything fielding blis...
4,https://www.goodreads.com/book/show/7736182-th...,The Lost Hero,https://i.gr-assets.com/images/S/compressed.ph...,JASON HAS A PROBLEM. He doesn’t remember anyth...,4.32,670659,23261,Fantasy,553,Rick Riordan,en,"[lost, hero]","[jason, problem, remember, anything, waking, b...",0.016883,0.41961,[fantasy],fantasy lost hero jason problem remember anyth...


In [28]:
# Save the final frame, including bag_of_words, to CSV (row index omitted).
# NOTE(review): title_key_words, clean_keywords and genre_bag hold Python lists
# at this point; presumably they are written as their string form, so a reader
# of this CSV would need to parse them back — confirm against the consumer.
df_bog2.to_csv("../data/goodreads_final_bagowords.csv", index=False)