In [2]:
# data science imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

# data processing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from scipy.cluster.hierarchy import dendrogram, linkage

# unsupervised algorithms
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering

from langdetect import detect

# metrics
from sklearn.metrics import silhouette_samples, silhouette_score


In [3]:
pd.get_option('display.max_columns')

20

In [4]:
# read in file
file = 'data/goodreads_sentiment.csv'

In [5]:
# read in file 
df = pd.read_csv(file)
df.head()

Unnamed: 0,url,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,pageCount,Author,lang,title_key_words,clean_keywords,sentiment_polarity,sentiment_subjectivity
0,https://www.goodreads.com/book/show/2767052-th...,The Hunger Games,https://i.gr-assets.com/images/S/compressed.ph...,"Could you survive on your own in the wild, wit...",4.32,6717635,176054,Young Adult,374,Suzanne Collins,en,"['hunger', 'games']",could survive wild every one make sure live se...,0.101623,0.484921
1,https://www.goodreads.com/book/show/2.Harry_Po...,Harry Potter and the Order of the Phoenix,https://i.gr-assets.com/images/S/compressed.ph...,There is a door at the end of a silent corrido...,4.5,2668409,45724,Fantasy,870,J.K. Rowling,en,"['harry', 'potter', 'order', 'phoenix']",door end silent corridor haunting harry pottte...,-0.087273,0.420909
2,https://www.goodreads.com/book/show/2657.To_Ki...,To Kill a Mockingbird,https://i.gr-assets.com/images/S/compressed.ph...,The unforgettable novel of a childhood in a sl...,4.28,4772918,95595,Classics,324,Harper Lee,en,"['kill', 'mockingbird']",unforgettable novel childhood sleepy southern ...,0.165686,0.368067
3,https://www.goodreads.com/book/show/1885.Pride...,Pride and Prejudice,https://i.gr-assets.com/images/S/compressed.ph...,Alternate cover edition of ISBN 9780679783268S...,4.27,3206070,74020,Classics,279,Jane Austen,en,"['pride', 'prejudice']",alternate cover edition isbn since immediate s...,0.475556,0.68
4,https://www.goodreads.com/book/show/41865.Twil...,Twilight,https://i.gr-assets.com/images/S/compressed.ph...,About three things I was absolutely positive.F...,3.61,5231000,107619,Young Adult,501,Stephenie Meyer,en,['twilight'],three things absolutely positive first edward ...,0.24,0.62


In [6]:
print(df.Genre.nunique())

46


In [7]:
df.Genre.value_counts()


Fantasy                          1667
Fiction                          1637
Young Adult                       948
Romance                           595
Classics                          488
Historical-Historical Fiction     464
Nonfiction                        416
Science Fiction                   342
Mystery                           303
Horror                            206
Sequential Art-Graphic Novels     168
Religion                          157
Fantasy-Paranormal                107
Paranormal-Vampires               100
History                            94
Poetry                             88
Other                              87
Thriller                           86
Romance-Paranormal Romance         83
Philosophy                         82
Womens Fiction-Chick Lit           66
Childrens                          64
Short Stories                      52
Science                            43
Contemporary                       43
New Adult                          43
Plays       

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8863 entries, 0 to 8862
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   url                     8863 non-null   object 
 1   bookTitle               8863 non-null   object 
 2   bookImage               8863 non-null   object 
 3   bookDesc                8863 non-null   object 
 4   bookRating              8863 non-null   float64
 5   ratingCount             8863 non-null   int64  
 6   reviewCount             8863 non-null   int64  
 7   Genre                   8863 non-null   object 
 8   pageCount               8863 non-null   int64  
 9   Author                  8863 non-null   object 
 10  lang                    8863 non-null   object 
 11  title_key_words         8863 non-null   object 
 12  clean_keywords          8863 non-null   object 
 13  sentiment_polarity      8863 non-null   float64
 14  sentiment_subjectivity  8863 non-null   

In [9]:
df.columns

Index(['url', 'bookTitle', 'bookImage', 'bookDesc', 'bookRating',
       'ratingCount', 'reviewCount', 'Genre', 'pageCount', 'Author', 'lang',
       'title_key_words', 'clean_keywords', 'sentiment_polarity',
       'sentiment_subjectivity'],
      dtype='object')

In [10]:
# get number unique book names
df.bookTitle.nunique()

8863

In [11]:
# drop duplicate books, including books with same title but different author
df_drop_dups = df.drop_duplicates(subset = 'bookTitle')

In [12]:
df_drop_dups.head()

Unnamed: 0,url,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,pageCount,Author,lang,title_key_words,clean_keywords,sentiment_polarity,sentiment_subjectivity
0,https://www.goodreads.com/book/show/2767052-th...,The Hunger Games,https://i.gr-assets.com/images/S/compressed.ph...,"Could you survive on your own in the wild, wit...",4.32,6717635,176054,Young Adult,374,Suzanne Collins,en,"['hunger', 'games']",could survive wild every one make sure live se...,0.101623,0.484921
1,https://www.goodreads.com/book/show/2.Harry_Po...,Harry Potter and the Order of the Phoenix,https://i.gr-assets.com/images/S/compressed.ph...,There is a door at the end of a silent corrido...,4.5,2668409,45724,Fantasy,870,J.K. Rowling,en,"['harry', 'potter', 'order', 'phoenix']",door end silent corridor haunting harry pottte...,-0.087273,0.420909
2,https://www.goodreads.com/book/show/2657.To_Ki...,To Kill a Mockingbird,https://i.gr-assets.com/images/S/compressed.ph...,The unforgettable novel of a childhood in a sl...,4.28,4772918,95595,Classics,324,Harper Lee,en,"['kill', 'mockingbird']",unforgettable novel childhood sleepy southern ...,0.165686,0.368067
3,https://www.goodreads.com/book/show/1885.Pride...,Pride and Prejudice,https://i.gr-assets.com/images/S/compressed.ph...,Alternate cover edition of ISBN 9780679783268S...,4.27,3206070,74020,Classics,279,Jane Austen,en,"['pride', 'prejudice']",alternate cover edition isbn since immediate s...,0.475556,0.68
4,https://www.goodreads.com/book/show/41865.Twil...,Twilight,https://i.gr-assets.com/images/S/compressed.ph...,About three things I was absolutely positive.F...,3.61,5231000,107619,Young Adult,501,Stephenie Meyer,en,['twilight'],three things absolutely positive first edward ...,0.24,0.62


In [13]:
# drop unneeded columns
df_drop_dups.drop(['url', 'pageCount', 'title_key_words', 'clean_keywords'], axis = 1, inplace = True)


In [14]:
df_drop_dups.head()

Unnamed: 0,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,Author,lang,sentiment_polarity,sentiment_subjectivity
0,The Hunger Games,https://i.gr-assets.com/images/S/compressed.ph...,"Could you survive on your own in the wild, wit...",4.32,6717635,176054,Young Adult,Suzanne Collins,en,0.101623,0.484921
1,Harry Potter and the Order of the Phoenix,https://i.gr-assets.com/images/S/compressed.ph...,There is a door at the end of a silent corrido...,4.5,2668409,45724,Fantasy,J.K. Rowling,en,-0.087273,0.420909
2,To Kill a Mockingbird,https://i.gr-assets.com/images/S/compressed.ph...,The unforgettable novel of a childhood in a sl...,4.28,4772918,95595,Classics,Harper Lee,en,0.165686,0.368067
3,Pride and Prejudice,https://i.gr-assets.com/images/S/compressed.ph...,Alternate cover edition of ISBN 9780679783268S...,4.27,3206070,74020,Classics,Jane Austen,en,0.475556,0.68
4,Twilight,https://i.gr-assets.com/images/S/compressed.ph...,About three things I was absolutely positive.F...,3.61,5231000,107619,Young Adult,Stephenie Meyer,en,0.24,0.62


In [15]:
df2 = df_drop_dups.copy()
# columns no transformation needs to happen on
keep_col = ['bookImage', 'bookDesc', 'bookTitle', 'bookRating', 'Genre', 'Author', 'sentiment_polarity', 'sentiment_subjectivity']
df_keep = df2.loc[:, keep_col]

In [16]:
# one-hot-encode
cols = ['Genre', 'Author']
df_oh = df2.loc[:, cols]
df_oh = pd.get_dummies(df_oh, prefix = "", prefix_sep = "" )

df_oh.head()

Unnamed: 0,Adventure,Art,Autobiography-Memoir,Biography,Business,Childrens,Childrens-Picture Books,Classics,Contemporary,Culture,...,Zoe Sugg,Zoltan Andrejkovics,Zora Neale Hurston,Zoraida Córdova,Zoë Heller,kkat,pleasefindthis,Ágota Kristóf,Åsne Seierstad,Émile Zola
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# scale rating count
scaler = StandardScaler()

cols = ['bookRating','ratingCount', 'reviewCount']
df_scaled = df2.loc[:, cols]

In [18]:
# concat columns
df_con = pd.concat([df_keep, df_scaled, df_oh], axis=1)

df_con.head()

Unnamed: 0,bookImage,bookDesc,bookTitle,bookRating,Genre,Author,sentiment_polarity,sentiment_subjectivity,ratingCount,reviewCount,...,Zoe Sugg,Zoltan Andrejkovics,Zora Neale Hurston,Zoraida Córdova,Zoë Heller,kkat,pleasefindthis,Ágota Kristóf,Åsne Seierstad,Émile Zola
0,https://i.gr-assets.com/images/S/compressed.ph...,"Could you survive on your own in the wild, wit...",The Hunger Games,4.32,Young Adult,Suzanne Collins,0.101623,0.484921,6717635,176054,...,0,0,0,0,0,0,0,0,0,0
1,https://i.gr-assets.com/images/S/compressed.ph...,There is a door at the end of a silent corrido...,Harry Potter and the Order of the Phoenix,4.5,Fantasy,J.K. Rowling,-0.087273,0.420909,2668409,45724,...,0,0,0,0,0,0,0,0,0,0
2,https://i.gr-assets.com/images/S/compressed.ph...,The unforgettable novel of a childhood in a sl...,To Kill a Mockingbird,4.28,Classics,Harper Lee,0.165686,0.368067,4772918,95595,...,0,0,0,0,0,0,0,0,0,0
3,https://i.gr-assets.com/images/S/compressed.ph...,Alternate cover edition of ISBN 9780679783268S...,Pride and Prejudice,4.27,Classics,Jane Austen,0.475556,0.68,3206070,74020,...,0,0,0,0,0,0,0,0,0,0
4,https://i.gr-assets.com/images/S/compressed.ph...,About three things I was absolutely positive.F...,Twilight,3.61,Young Adult,Stephenie Meyer,0.24,0.62,5231000,107619,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Output File (CSV)
output_data_file = "knn_books_sentiment.csv"

# Export to CSV
df_con.to_csv(output_data_file,index=False)

In [None]:
# knn
def recommender(df, book_title, n_neighbors=11):
    
    df_sub = df_con.drop(["bookImage", "bookDesc","bookTitle", "Genre", "Author"], axis=1)
    model_knn = NearestNeighbors(metric='cosine', n_neighbors=n_neighbors)
    model_knn.fit(df_sub)
    
    book = df_con.loc[df["bookTitle"] == book_title]
    book = book.drop(["bookImage", "bookTitle", "bookDesc", "Genre", "Author"], axis=1)
    book = book.to_numpy()
    
    distances, indices = model_knn.kneighbors(book, n_neighbors = n_neighbors)
    
    result = df.iloc[indices.flatten()]
    result["Distance"] = distances.flatten()
    
    return result

In [None]:
# book recommender
df_result = recommender(df_con, "Harry Potter and the Sorcerer's Stone")

In [None]:
df_result