In [1]:
import pandas as pd
import numpy as np

# keyword extraction and clean up
import nltk
from rake_nltk import Rake
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
import re


#Text Blob for sentiment analysis
from textblob import TextBlob


#tf-idf model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer


from sklearn.metrics.pairwise import linear_kernel


import string
import random
from PIL import Image
import requests
from io import BytesIO

from langdetect import detect


In [2]:
# Load the Goodreads book dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/goodreads_final_seven.csv")
# Preview the first rows.
df.head()

Unnamed: 0,url,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,pageCount,Author,lang
0,https://www.goodreads.com/book/show/25743.The_...,The Slave,https://i.gr-assets.com/images/S/compressed.ph...,Four years after the Chmielnicki massacres of ...,4.19,2886,259,Fiction,320,Isaac Bashevis Singer,en
1,https://www.goodreads.com/book/show/13547289-t...,Tales from a Not-So-Smart Miss Know-It-All,https://i.gr-assets.com/images/S/compressed.ph...,Nikki Maxwell authors an advice column for the...,4.39,25705,806,Education,368,Rachel Renée Russell,en
2,https://www.goodreads.com/book/show/21899546-l...,"Love, in English",https://i.gr-assets.com/images/S/compressed.ph...,He’s thirty-eight. I’m twenty-three.He speaks ...,4.07,15358,2024,Romance,439,Karina Halle,en
3,https://www.goodreads.com/book/show/31328891-s...,Stuck in the Game,https://i.gr-assets.com/images/S/compressed.ph...,After a terrible car accident puts seventeen-y...,3.84,277,59,Fantasy,226,Christopher Keene,en
4,https://www.goodreads.com/book/show/10507293-t...,The Selection,https://i.gr-assets.com/images/S/compressed.ph...,"For thirty-five girls, the Selection is the ch...",4.12,1004868,53737,Young Adult,336,Kiera Cass,en


In [3]:
# Summarize column dtypes and non-null counts of the loaded data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   url          5000 non-null   object 
 1   bookTitle    5000 non-null   object 
 2   bookImage    5000 non-null   object 
 3   bookDesc     5000 non-null   object 
 4   bookRating   5000 non-null   float64
 5   ratingCount  5000 non-null   int64  
 6   reviewCount  5000 non-null   int64  
 7   Genre        5000 non-null   object 
 8   pageCount    5000 non-null   int64  
 9   Author       5000 non-null   object 
 10  lang         5000 non-null   object 
dtypes: float64(1), int64(3), object(7)
memory usage: 429.8+ KB


### Keyword processing

In [4]:
# Extract RAKE keywords from each book's description and title.
# RAKE does the initial clean-up here (stop-word removal, lower-casing, splitting on
# punctuation); the word list is taken from the keys of its word-degree table.
df_new = df.copy()

r = Rake()


def rake_words(text):
    """Return the words RAKE keeps for `text`, in the order of its word-degree table."""
    r.extract_keywords_from_text(text)
    return list(r.get_word_degrees().keys())


# One list of words per row, for the description and for the title.
key_words = [rake_words(description) for description in df_new['bookDesc']]
title_key_words = [rake_words(title) for title in df_new['bookTitle']]

df_new["key_words"] = key_words
df_new["title_key_words"] = title_key_words

df_new.head()

Unnamed: 0,url,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,pageCount,Author,lang,key_words,title_key_words
0,https://www.goodreads.com/book/show/25743.The_...,The Slave,https://i.gr-assets.com/images/S/compressed.ph...,Four years after the Chmielnicki massacres of ...,4.19,2886,259,Fiction,320,Isaac Bashevis Singer,en,"[four, years, chmielnicki, massacres, seventee...",[slave]
1,https://www.goodreads.com/book/show/13547289-t...,Tales from a Not-So-Smart Miss Know-It-All,https://i.gr-assets.com/images/S/compressed.ph...,Nikki Maxwell authors an advice column for the...,4.39,25705,806,Education,368,Rachel Renée Russell,en,"[nikki, maxwell, authors, advice, column, scho...","[tales, smart, miss, know]"
2,https://www.goodreads.com/book/show/21899546-l...,"Love, in English",https://i.gr-assets.com/images/S/compressed.ph...,He’s thirty-eight. I’m twenty-three.He speaks ...,4.07,15358,2024,Romance,439,Karina Halle,en,"[’, thirty, eight, twenty, three, speaks, span...","[love, english]"
3,https://www.goodreads.com/book/show/31328891-s...,Stuck in the Game,https://i.gr-assets.com/images/S/compressed.ph...,After a terrible car accident puts seventeen-y...,3.84,277,59,Fantasy,226,Christopher Keene,en,"[terrible, car, accident, puts, seventeen, yea...","[stuck, game]"
4,https://www.goodreads.com/book/show/10507293-t...,The Selection,https://i.gr-assets.com/images/S/compressed.ph...,"For thirty-five girls, the Selection is the ch...",4.12,1004868,53737,Young Adult,336,Kiera Cass,en,"[thirty, five, girls, selection, chance, lifet...",[selection]


In [5]:
# Work on a copy so df_new keeps the raw RAKE keyword lists.
df_new2 = df_new.copy()

In [6]:
# clean up keywords: drop symbols, numbers, short words and stop words
# (reducing words to their roots is available via preprocess(..., root_words=True))

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Built once instead of once per token: the original comprehension called
# stopwords.words('english') for every token and scanned the returned list.
STOP_WORDS = frozenset(stopwords.words('english'))

HTML_TAG_RE = re.compile('<.*?>')
URL_RE = re.compile(r'http\S+')
DIGITS_RE = re.compile('[0-9]+')
WORD_TOKENIZER = RegexpTokenizer(r'\w+')


def preprocess(sentence, root_words=False):
    """Normalize text into a space-separated string of cleaned, lower-case words.

    Steps, in order: lower-case, remove the literal '{html}', strip '<...>' tags,
    strip anything starting with 'http' up to the next whitespace, remove digits,
    tokenize on runs of word characters, then drop tokens of two characters or
    fewer and English stop words.

    Parameters
    ----------
    sentence : any
        Input to clean; it is converted with str() first.
        NOTE(review): the call below passes the key_words lists, so the text
        being cleaned is the list's string form; the tokenizer's \\w+ pattern
        does not match the brackets, quotes and commas, so they are discarded.
    root_words : bool, default False
        When True, each remaining word is Porter-stemmed and then
        WordNet-lemmatized before joining. The previous version computed this
        on every call but never returned it; the default keeps that output.

    Returns
    -------
    str
        The surviving words joined by single spaces.
    """
    text = str(sentence).lower().replace('{html}', "")
    text = HTML_TAG_RE.sub('', text)
    text = URL_RE.sub('', text)
    text = DIGITS_RE.sub('', text)

    tokens = WORD_TOKENIZER.tokenize(text)
    filtered_words = [w for w in tokens if len(w) > 2 and w not in STOP_WORDS]

    if root_words:
        stem_words = [stemmer.stem(w) for w in filtered_words]
        return " ".join(lemmatizer.lemmatize(w) for w in stem_words)
    return " ".join(filtered_words)


df_new2['clean_keywords'] = df_new2['key_words'].map(preprocess)

In [7]:
# Preview the frame with the new clean_keywords column.
df_new2.head()

Unnamed: 0,url,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,pageCount,Author,lang,key_words,title_key_words,clean_keywords
0,https://www.goodreads.com/book/show/25743.The_...,The Slave,https://i.gr-assets.com/images/S/compressed.ph...,Four years after the Chmielnicki massacres of ...,4.19,2886,259,Fiction,320,Isaac Bashevis Singer,en,"[four, years, chmielnicki, massacres, seventee...",[slave],four years chmielnicki massacres seventeenth c...
1,https://www.goodreads.com/book/show/13547289-t...,Tales from a Not-So-Smart Miss Know-It-All,https://i.gr-assets.com/images/S/compressed.ph...,Nikki Maxwell authors an advice column for the...,4.39,25705,806,Education,368,Rachel Renée Russell,en,"[nikki, maxwell, authors, advice, column, scho...","[tales, smart, miss, know]",nikki maxwell authors advice column school new...
2,https://www.goodreads.com/book/show/21899546-l...,"Love, in English",https://i.gr-assets.com/images/S/compressed.ph...,He’s thirty-eight. I’m twenty-three.He speaks ...,4.07,15358,2024,Romance,439,Karina Halle,en,"[’, thirty, eight, twenty, three, speaks, span...","[love, english]",thirty eight twenty three speaks spanish speak...
3,https://www.goodreads.com/book/show/31328891-s...,Stuck in the Game,https://i.gr-assets.com/images/S/compressed.ph...,After a terrible car accident puts seventeen-y...,3.84,277,59,Fantasy,226,Christopher Keene,en,"[terrible, car, accident, puts, seventeen, yea...","[stuck, game]",terrible car accident puts seventeen year old ...
4,https://www.goodreads.com/book/show/10507293-t...,The Selection,https://i.gr-assets.com/images/S/compressed.ph...,"For thirty-five girls, the Selection is the ch...",4.12,1004868,53737,Young Adult,336,Kiera Cass,en,"[thirty, five, girls, selection, chance, lifet...",[selection],thirty five girls selection chance lifetime op...


In [8]:
# Drop the raw RAKE keyword lists; clean_keywords is the column used from here on.
# errors='ignore' keeps this cell re-runnable: without it a second run raises KeyError
# because the column has already been removed.
df_new2 = df_new2.drop(columns=['key_words'], errors='ignore')

In [9]:
# Preview the frame after dropping key_words.
df_new2.head()

Unnamed: 0,url,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,pageCount,Author,lang,title_key_words,clean_keywords
0,https://www.goodreads.com/book/show/25743.The_...,The Slave,https://i.gr-assets.com/images/S/compressed.ph...,Four years after the Chmielnicki massacres of ...,4.19,2886,259,Fiction,320,Isaac Bashevis Singer,en,[slave],four years chmielnicki massacres seventeenth c...
1,https://www.goodreads.com/book/show/13547289-t...,Tales from a Not-So-Smart Miss Know-It-All,https://i.gr-assets.com/images/S/compressed.ph...,Nikki Maxwell authors an advice column for the...,4.39,25705,806,Education,368,Rachel Renée Russell,en,"[tales, smart, miss, know]",nikki maxwell authors advice column school new...
2,https://www.goodreads.com/book/show/21899546-l...,"Love, in English",https://i.gr-assets.com/images/S/compressed.ph...,He’s thirty-eight. I’m twenty-three.He speaks ...,4.07,15358,2024,Romance,439,Karina Halle,en,"[love, english]",thirty eight twenty three speaks spanish speak...
3,https://www.goodreads.com/book/show/31328891-s...,Stuck in the Game,https://i.gr-assets.com/images/S/compressed.ph...,After a terrible car accident puts seventeen-y...,3.84,277,59,Fantasy,226,Christopher Keene,en,"[stuck, game]",terrible car accident puts seventeen year old ...
4,https://www.goodreads.com/book/show/10507293-t...,The Selection,https://i.gr-assets.com/images/S/compressed.ph...,"For thirty-five girls, the Selection is the ch...",4.12,1004868,53737,Young Adult,336,Kiera Cass,en,[selection],thirty five girls selection chance lifetime op...


### Sentiment analysis with TextBlob

In [10]:
# Sanity-check TextBlob on a single sentence before running it over the dataset
sentence = '''The platform provides universal access to the world's best education, partnering with top universities and organizations to offer courses online.'''

# The sentiment result holds two values: polarity first, subjectivity second
analysis = TextBlob(sentence).sentiment
polarity, subjectivity = analysis

# Print the full result, then each component on its own line
print(analysis, polarity, subjectivity, sep="\n")

Sentiment(polarity=0.5, subjectivity=0.26666666666666666)
0.5
0.26666666666666666


In [11]:
# Separate copy for the sentiment columns, leaving df_new2 unchanged.
df_sub = df_new2.copy()

In [12]:
# Score every book description with TextBlob

sentiments = [TextBlob(description).sentiment for description in df_sub['bookDesc']]

# Index 0 of each result is the polarity, index 1 the subjectivity
polarities = [scores[0] for scores in sentiments]
subjectivities = [scores[1] for scores in sentiments]

df_sub['sentiment_polarity'] = polarities
df_sub['sentiment_subjectivity'] = subjectivities

df_sub.head()

Unnamed: 0,url,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,pageCount,Author,lang,title_key_words,clean_keywords,sentiment_polarity,sentiment_subjectivity
0,https://www.goodreads.com/book/show/25743.The_...,The Slave,https://i.gr-assets.com/images/S/compressed.ph...,Four years after the Chmielnicki massacres of ...,4.19,2886,259,Fiction,320,Isaac Bashevis Singer,en,[slave],four years chmielnicki massacres seventeenth c...,0.042338,0.341429
1,https://www.goodreads.com/book/show/13547289-t...,Tales from a Not-So-Smart Miss Know-It-All,https://i.gr-assets.com/images/S/compressed.ph...,Nikki Maxwell authors an advice column for the...,4.39,25705,806,Education,368,Rachel Renée Russell,en,"[tales, smart, miss, know]",nikki maxwell authors advice column school new...,-0.008807,0.450042
2,https://www.goodreads.com/book/show/21899546-l...,"Love, in English",https://i.gr-assets.com/images/S/compressed.ph...,He’s thirty-eight. I’m twenty-three.He speaks ...,4.07,15358,2024,Romance,439,Karina Halle,en,"[love, english]",thirty eight twenty three speaks spanish speak...,0.175129,0.47509
3,https://www.goodreads.com/book/show/31328891-s...,Stuck in the Game,https://i.gr-assets.com/images/S/compressed.ph...,After a terrible car accident puts seventeen-y...,3.84,277,59,Fantasy,226,Christopher Keene,en,"[stuck, game]",terrible car accident puts seventeen year old ...,-0.151667,0.438819
4,https://www.goodreads.com/book/show/10507293-t...,The Selection,https://i.gr-assets.com/images/S/compressed.ph...,"For thirty-five girls, the Selection is the ch...",4.12,1004868,53737,Young Adult,336,Kiera Cass,en,[selection],thirty five girls selection chance lifetime op...,0.126263,0.573148


In [13]:
# Save the dataframe with the sentiment columns to CSV; index=False leaves the row index out.
# NOTE(review): title_key_words holds Python lists, which CSV stores as text, so
# presumably they come back as plain strings when this file is re-read -- confirm downstream.
df_sub.to_csv("../data/goodreads_sentiment.csv", index=False)

### TF-IDF feature columns (disabled: the resulting files were too large for GitHub)

In [14]:
#df_tf = df_sub.copy()

In [15]:

#from sklearn.feature_extraction.text import TfidfVectorizer


#vect = TfidfVectorizer()
#tfidf_matrix = vect.fit_transform(df_tf['clean_keywords'])
#df_data = pd.DataFrame(tfidf_matrix.toarray(), columns = vect.get_feature_names())
#df_data.head()

In [16]:
#df_data.max()

In [17]:
#df_data.info()

In [18]:
#save dataframe to csv
#df_data.to_csv("../data/goodreads_tfidf_data.csv", index=False)

#file too large for github: 1.65 GB

In [19]:
#df_data.to_parquet("../data/goodreads_tfidf_data.parquet", index=False, compression=None, engine="fastparquet")

In [20]:
#df_knn =  pd.concat([df_tf, df_data], axis=0)
#df_knn.info()

In [21]:
#df_knn.head()

In [22]:
#save dataframe to csv
#df_knn.to_csv("../data/goodreads_final_knn.csv", index=False)
#file 2.06GB and too big for github

### Bag of Words DF

In [23]:
# Separate copy for building the bag-of-words columns, leaving df_sub unchanged.
df_bog = df_sub.copy()

In [24]:
# Build a list-valued genre column for the bag of words:
#   1. lower-case the genre text
#   2. remove spaces so a multi-word genre becomes a single token ("Young Adult" -> "youngadult")
#   3. split on commas so each row holds a list, like the other keyword columns
df_bog['genre_bag'] = (
    df_bog['Genre']
    .str.lower()
    .str.replace(" ", "")
    .apply(lambda genre: genre.split(','))
)

df_bog.head()

Unnamed: 0,url,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,pageCount,Author,lang,title_key_words,clean_keywords,sentiment_polarity,sentiment_subjectivity,genre_bag
0,https://www.goodreads.com/book/show/25743.The_...,The Slave,https://i.gr-assets.com/images/S/compressed.ph...,Four years after the Chmielnicki massacres of ...,4.19,2886,259,Fiction,320,Isaac Bashevis Singer,en,[slave],four years chmielnicki massacres seventeenth c...,0.042338,0.341429,[fiction]
1,https://www.goodreads.com/book/show/13547289-t...,Tales from a Not-So-Smart Miss Know-It-All,https://i.gr-assets.com/images/S/compressed.ph...,Nikki Maxwell authors an advice column for the...,4.39,25705,806,Education,368,Rachel Renée Russell,en,"[tales, smart, miss, know]",nikki maxwell authors advice column school new...,-0.008807,0.450042,[education]
2,https://www.goodreads.com/book/show/21899546-l...,"Love, in English",https://i.gr-assets.com/images/S/compressed.ph...,He’s thirty-eight. I’m twenty-three.He speaks ...,4.07,15358,2024,Romance,439,Karina Halle,en,"[love, english]",thirty eight twenty three speaks spanish speak...,0.175129,0.47509,[romance]
3,https://www.goodreads.com/book/show/31328891-s...,Stuck in the Game,https://i.gr-assets.com/images/S/compressed.ph...,After a terrible car accident puts seventeen-y...,3.84,277,59,Fantasy,226,Christopher Keene,en,"[stuck, game]",terrible car accident puts seventeen year old ...,-0.151667,0.438819,[fantasy]
4,https://www.goodreads.com/book/show/10507293-t...,The Selection,https://i.gr-assets.com/images/S/compressed.ph...,"For thirty-five girls, the Selection is the ch...",4.12,1004868,53737,Young Adult,336,Kiera Cass,en,[selection],thirty five girls selection chance lifetime op...,0.126263,0.573148,[youngadult]


In [25]:
# Summarize columns, non-null counts and dtypes after adding genre_bag.
df_bog.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   url                     5000 non-null   object 
 1   bookTitle               5000 non-null   object 
 2   bookImage               5000 non-null   object 
 3   bookDesc                5000 non-null   object 
 4   bookRating              5000 non-null   float64
 5   ratingCount             5000 non-null   int64  
 6   reviewCount             5000 non-null   int64  
 7   Genre                   5000 non-null   object 
 8   pageCount               5000 non-null   int64  
 9   Author                  5000 non-null   object 
 10  lang                    5000 non-null   object 
 11  title_key_words         5000 non-null   object 
 12  clean_keywords          5000 non-null   object 
 13  sentiment_polarity      5000 non-null   float64
 14  sentiment_subjectivity  5000 non-null   

In [26]:
# Turn each space-separated clean_keywords string into a list of words.
# The column is overwritten in place, so on a second run the values are already lists
# (which have no .split()); those are kept as-is so the cell stays re-runnable.
clean_keywords = [
    list(words) if isinstance(words, list) else words.split()
    for words in df_bog['clean_keywords']
]

df_bog['clean_keywords'] = clean_keywords
df_bog.head()

Unnamed: 0,url,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,pageCount,Author,lang,title_key_words,clean_keywords,sentiment_polarity,sentiment_subjectivity,genre_bag
0,https://www.goodreads.com/book/show/25743.The_...,The Slave,https://i.gr-assets.com/images/S/compressed.ph...,Four years after the Chmielnicki massacres of ...,4.19,2886,259,Fiction,320,Isaac Bashevis Singer,en,[slave],"[four, years, chmielnicki, massacres, seventee...",0.042338,0.341429,[fiction]
1,https://www.goodreads.com/book/show/13547289-t...,Tales from a Not-So-Smart Miss Know-It-All,https://i.gr-assets.com/images/S/compressed.ph...,Nikki Maxwell authors an advice column for the...,4.39,25705,806,Education,368,Rachel Renée Russell,en,"[tales, smart, miss, know]","[nikki, maxwell, authors, advice, column, scho...",-0.008807,0.450042,[education]
2,https://www.goodreads.com/book/show/21899546-l...,"Love, in English",https://i.gr-assets.com/images/S/compressed.ph...,He’s thirty-eight. I’m twenty-three.He speaks ...,4.07,15358,2024,Romance,439,Karina Halle,en,"[love, english]","[thirty, eight, twenty, three, speaks, spanish...",0.175129,0.47509,[romance]
3,https://www.goodreads.com/book/show/31328891-s...,Stuck in the Game,https://i.gr-assets.com/images/S/compressed.ph...,After a terrible car accident puts seventeen-y...,3.84,277,59,Fantasy,226,Christopher Keene,en,"[stuck, game]","[terrible, car, accident, puts, seventeen, yea...",-0.151667,0.438819,[fantasy]
4,https://www.goodreads.com/book/show/10507293-t...,The Selection,https://i.gr-assets.com/images/S/compressed.ph...,"For thirty-five girls, the Selection is the ch...",4.12,1004868,53737,Young Adult,336,Kiera Cass,en,[selection],"[thirty, five, girls, selection, chance, lifet...",0.126263,0.573148,[youngadult]


In [27]:
# Combine the genre, title keywords and description keywords into one text field per book
df_bog2 = df_bog.copy()

columns = ['genre_bag', 'title_key_words', 'clean_keywords']

# Each of these columns holds a list of words. For every row, each column's words are
# joined with spaces and followed by one space, so the resulting string always ends
# with a trailing space.
bagWords = [
    ''.join(' '.join(words) + ' ' for words in word_lists)
    for word_lists in zip(*(df_bog2[col] for col in columns))
]

df_bog2['bag_of_words'] = bagWords
df_bog2.head()

Unnamed: 0,url,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,pageCount,Author,lang,title_key_words,clean_keywords,sentiment_polarity,sentiment_subjectivity,genre_bag,bag_of_words
0,https://www.goodreads.com/book/show/25743.The_...,The Slave,https://i.gr-assets.com/images/S/compressed.ph...,Four years after the Chmielnicki massacres of ...,4.19,2886,259,Fiction,320,Isaac Bashevis Singer,en,[slave],"[four, years, chmielnicki, massacres, seventee...",0.042338,0.341429,[fiction],fiction slave four years chmielnicki massacres...
1,https://www.goodreads.com/book/show/13547289-t...,Tales from a Not-So-Smart Miss Know-It-All,https://i.gr-assets.com/images/S/compressed.ph...,Nikki Maxwell authors an advice column for the...,4.39,25705,806,Education,368,Rachel Renée Russell,en,"[tales, smart, miss, know]","[nikki, maxwell, authors, advice, column, scho...",-0.008807,0.450042,[education],education tales smart miss know nikki maxwell ...
2,https://www.goodreads.com/book/show/21899546-l...,"Love, in English",https://i.gr-assets.com/images/S/compressed.ph...,He’s thirty-eight. I’m twenty-three.He speaks ...,4.07,15358,2024,Romance,439,Karina Halle,en,"[love, english]","[thirty, eight, twenty, three, speaks, spanish...",0.175129,0.47509,[romance],romance love english thirty eight twenty three...
3,https://www.goodreads.com/book/show/31328891-s...,Stuck in the Game,https://i.gr-assets.com/images/S/compressed.ph...,After a terrible car accident puts seventeen-y...,3.84,277,59,Fantasy,226,Christopher Keene,en,"[stuck, game]","[terrible, car, accident, puts, seventeen, yea...",-0.151667,0.438819,[fantasy],fantasy stuck game terrible car accident puts ...
4,https://www.goodreads.com/book/show/10507293-t...,The Selection,https://i.gr-assets.com/images/S/compressed.ph...,"For thirty-five girls, the Selection is the ch...",4.12,1004868,53737,Young Adult,336,Kiera Cass,en,[selection],"[thirty, five, girls, selection, chance, lifet...",0.126263,0.573148,[youngadult],youngadult selection thirty five girls selecti...


In [28]:
# Save the final dataframe, including bag_of_words, to CSV; index=False leaves the row index out.
# NOTE(review): genre_bag, title_key_words and clean_keywords hold Python lists, which CSV
# stores as text, so presumably they come back as plain strings when re-read -- confirm downstream.
df_bog2.to_csv("../data/goodreads_final_bagowords.csv", index=False)