In [5]:
# data science imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

# data processing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from scipy.cluster.hierarchy import dendrogram, linkage

# unsupervised algorithms
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering

from langdetect import detect

# metrics
from sklearn.metrics import silhouette_samples, silhouette_score

In [6]:
pd.get_option('display.max_columns')

20

In [7]:
# read in file
file = 'data/goodreads_sentiment.csv'

In [8]:
# read in file 
df = pd.read_csv(file)
df.head()

Unnamed: 0,url,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,pageCount,Author,lang,title_key_words,clean_keywords,sentiment_polarity,sentiment_subjectivity
0,https://www.goodreads.com/book/show/2767052-th...,The Hunger Games,https://i.gr-assets.com/images/S/compressed.ph...,"Could you survive on your own in the wild, wit...",4.32,6717635,176054,Young Adult,374,Suzanne Collins,en,"['hunger', 'games']",could survive wild every one make sure live se...,0.101623,0.484921
1,https://www.goodreads.com/book/show/2.Harry_Po...,Harry Potter and the Order of the Phoenix,https://i.gr-assets.com/images/S/compressed.ph...,There is a door at the end of a silent corrido...,4.5,2668409,45724,Fantasy,870,J.K. Rowling,en,"['harry', 'potter', 'order', 'phoenix']",door end silent corridor haunting harry pottte...,-0.087273,0.420909
2,https://www.goodreads.com/book/show/2657.To_Ki...,To Kill a Mockingbird,https://i.gr-assets.com/images/S/compressed.ph...,The unforgettable novel of a childhood in a sl...,4.28,4772918,95595,Classics,324,Harper Lee,en,"['kill', 'mockingbird']",unforgettable novel childhood sleepy southern ...,0.165686,0.368067
3,https://www.goodreads.com/book/show/1885.Pride...,Pride and Prejudice,https://i.gr-assets.com/images/S/compressed.ph...,Alternate cover edition of ISBN 9780679783268S...,4.27,3206070,74020,Classics,279,Jane Austen,en,"['pride', 'prejudice']",alternate cover edition isbn since immediate s...,0.475556,0.68
4,https://www.goodreads.com/book/show/41865.Twil...,Twilight,https://i.gr-assets.com/images/S/compressed.ph...,About three things I was absolutely positive.F...,3.61,5231000,107619,Young Adult,501,Stephenie Meyer,en,['twilight'],three things absolutely positive first edward ...,0.24,0.62


In [9]:
print(df.Genre.nunique())

46


In [10]:
df.Genre.value_counts()


Fantasy                          1667
Fiction                          1637
Young Adult                       948
Romance                           595
Classics                          488
Historical-Historical Fiction     464
Nonfiction                        416
Science Fiction                   342
Mystery                           303
Horror                            206
Sequential Art-Graphic Novels     168
Religion                          157
Fantasy-Paranormal                107
Paranormal-Vampires               100
History                            94
Poetry                             88
Other                              87
Thriller                           86
Romance-Paranormal Romance         83
Philosophy                         82
Womens Fiction-Chick Lit           66
Childrens                          64
Short Stories                      52
Science                            43
Contemporary                       43
New Adult                          43
Plays       

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8863 entries, 0 to 8862
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   url                     8863 non-null   object 
 1   bookTitle               8863 non-null   object 
 2   bookImage               8863 non-null   object 
 3   bookDesc                8863 non-null   object 
 4   bookRating              8863 non-null   float64
 5   ratingCount             8863 non-null   int64  
 6   reviewCount             8863 non-null   int64  
 7   Genre                   8863 non-null   object 
 8   pageCount               8863 non-null   int64  
 9   Author                  8863 non-null   object 
 10  lang                    8863 non-null   object 
 11  title_key_words         8863 non-null   object 
 12  clean_keywords          8863 non-null   object 
 13  sentiment_polarity      8863 non-null   float64
 14  sentiment_subjectivity  8863 non-null   

In [12]:
df.columns

Index(['url', 'bookTitle', 'bookImage', 'bookDesc', 'bookRating',
       'ratingCount', 'reviewCount', 'Genre', 'pageCount', 'Author', 'lang',
       'title_key_words', 'clean_keywords', 'sentiment_polarity',
       'sentiment_subjectivity'],
      dtype='object')

In [13]:
# get number unique book names
df.bookTitle.nunique()

8863

In [14]:
# get the discription language and place it in new column 'lang'
df['lang'] = df.bookDesc.apply(lambda x:detect(x))
df.head()

Unnamed: 0,url,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,pageCount,Author,lang,title_key_words,clean_keywords,sentiment_polarity,sentiment_subjectivity
0,https://www.goodreads.com/book/show/2767052-th...,The Hunger Games,https://i.gr-assets.com/images/S/compressed.ph...,"Could you survive on your own in the wild, wit...",4.32,6717635,176054,Young Adult,374,Suzanne Collins,en,"['hunger', 'games']",could survive wild every one make sure live se...,0.101623,0.484921
1,https://www.goodreads.com/book/show/2.Harry_Po...,Harry Potter and the Order of the Phoenix,https://i.gr-assets.com/images/S/compressed.ph...,There is a door at the end of a silent corrido...,4.5,2668409,45724,Fantasy,870,J.K. Rowling,en,"['harry', 'potter', 'order', 'phoenix']",door end silent corridor haunting harry pottte...,-0.087273,0.420909
2,https://www.goodreads.com/book/show/2657.To_Ki...,To Kill a Mockingbird,https://i.gr-assets.com/images/S/compressed.ph...,The unforgettable novel of a childhood in a sl...,4.28,4772918,95595,Classics,324,Harper Lee,en,"['kill', 'mockingbird']",unforgettable novel childhood sleepy southern ...,0.165686,0.368067
3,https://www.goodreads.com/book/show/1885.Pride...,Pride and Prejudice,https://i.gr-assets.com/images/S/compressed.ph...,Alternate cover edition of ISBN 9780679783268S...,4.27,3206070,74020,Classics,279,Jane Austen,en,"['pride', 'prejudice']",alternate cover edition isbn since immediate s...,0.475556,0.68
4,https://www.goodreads.com/book/show/41865.Twil...,Twilight,https://i.gr-assets.com/images/S/compressed.ph...,About three things I was absolutely positive.F...,3.61,5231000,107619,Young Adult,501,Stephenie Meyer,en,['twilight'],three things absolutely positive first edward ...,0.24,0.62


In [34]:
# drop duplicate books, including books with same title but different author
df_drop_dups = df.drop_duplicates(subset = 'bookTitle')

In [35]:
df_drop_dups.head()

Unnamed: 0,url,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,pageCount,Author,lang,title_key_words,clean_keywords,sentiment_polarity,sentiment_subjectivity
0,https://www.goodreads.com/book/show/2767052-th...,The Hunger Games,https://i.gr-assets.com/images/S/compressed.ph...,"Could you survive on your own in the wild, wit...",4.32,6717635,176054,Young Adult,374,Suzanne Collins,en,"['hunger', 'games']",could survive wild every one make sure live se...,0.101623,0.484921
1,https://www.goodreads.com/book/show/2.Harry_Po...,Harry Potter and the Order of the Phoenix,https://i.gr-assets.com/images/S/compressed.ph...,There is a door at the end of a silent corrido...,4.5,2668409,45724,Fantasy,870,J.K. Rowling,en,"['harry', 'potter', 'order', 'phoenix']",door end silent corridor haunting harry pottte...,-0.087273,0.420909
2,https://www.goodreads.com/book/show/2657.To_Ki...,To Kill a Mockingbird,https://i.gr-assets.com/images/S/compressed.ph...,The unforgettable novel of a childhood in a sl...,4.28,4772918,95595,Classics,324,Harper Lee,en,"['kill', 'mockingbird']",unforgettable novel childhood sleepy southern ...,0.165686,0.368067
3,https://www.goodreads.com/book/show/1885.Pride...,Pride and Prejudice,https://i.gr-assets.com/images/S/compressed.ph...,Alternate cover edition of ISBN 9780679783268S...,4.27,3206070,74020,Classics,279,Jane Austen,en,"['pride', 'prejudice']",alternate cover edition isbn since immediate s...,0.475556,0.68
4,https://www.goodreads.com/book/show/41865.Twil...,Twilight,https://i.gr-assets.com/images/S/compressed.ph...,About three things I was absolutely positive.F...,3.61,5231000,107619,Young Adult,501,Stephenie Meyer,en,['twilight'],three things absolutely positive first edward ...,0.24,0.62


In [36]:
# drop unneeded columns
df_drop_dups.drop(['url', 'bookImage', 'bookDesc', 'pageCount', 'title_key_words', 'clean_keywords'], axis = 1, inplace = True)


In [37]:
df_drop_dups.head()

Unnamed: 0,bookTitle,bookRating,ratingCount,reviewCount,Genre,Author,lang,sentiment_polarity,sentiment_subjectivity
0,The Hunger Games,4.32,6717635,176054,Young Adult,Suzanne Collins,en,0.101623,0.484921
1,Harry Potter and the Order of the Phoenix,4.5,2668409,45724,Fantasy,J.K. Rowling,en,-0.087273,0.420909
2,To Kill a Mockingbird,4.28,4772918,95595,Classics,Harper Lee,en,0.165686,0.368067
3,Pride and Prejudice,4.27,3206070,74020,Classics,Jane Austen,en,0.475556,0.68
4,Twilight,3.61,5231000,107619,Young Adult,Stephenie Meyer,en,0.24,0.62


In [38]:
df2 = df_drop_dups.copy()

keep_col = ['bookTitle', 'bookRating', 'Genre', 'Author', 'sentiment_polarity', 'sentiment_subjectivity']
df_keep = df2.loc[:, keep_col]

# for i in range(5):
#     df2.loc[ (df2['bookRating'] >= i) & (df2['bookRating'] <= i+1), 'rating_between'] = f"between {i} and {i+1}"
    
# #making a dummy col for each rating level
# ratings = df2['rating_between'].str.get_dummies(sep=",")
# ratings.head()

In [43]:
# one-hot-encode
cols = ['Genre', 'Author']
df_oh = df2.loc[:, cols]
df_oh = pd.get_dummies(df_oh)

df_oh.head()

Unnamed: 0,Genre_Adventure,Genre_Art,Genre_Autobiography-Memoir,Genre_Biography,Genre_Business,Genre_Childrens,Genre_Childrens-Picture Books,Genre_Classics,Genre_Contemporary,Genre_Culture,...,Author_Zoe Sugg,Author_Zoltan Andrejkovics,Author_Zora Neale Hurston,Author_Zoraida Córdova,Author_Zoë Heller,Author_kkat,Author_pleasefindthis,Author_Ágota Kristóf,Author_Åsne Seierstad,Author_Émile Zola
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# scale rating count
scaler = StandardScaler()

# Scale onlt the ASK_AMT column Prof Booth
cols = ['ratingCount', 'reviewCount']
df_scaled = df2.loc[:, cols]

# scaler.fit(df_scaled)
# df_scale = pd.DataFrame(scaler.transform(df_scaled), columns= cols )
# df_scale.head()


In [26]:
df_con = pd.concat([df_keep, df_scaled, df_oh], axis=1)

df_con.head()

Unnamed: 0,bookTitle,bookRating,Genre,Author,sentiment_polarity,sentiment_subjectivity,ratingCount,reviewCount,Genre_Adventure,Genre_Art,...,Author_Zoe Sugg,Author_Zoltan Andrejkovics,Author_Zora Neale Hurston,Author_Zoraida Córdova,Author_Zoë Heller,Author_kkat,Author_pleasefindthis,Author_Ágota Kristóf,Author_Åsne Seierstad,Author_Émile Zola
0,The Hunger Games,4.32,Young Adult,Suzanne Collins,0.101623,0.484921,6717635,176054,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Harry Potter and the Order of the Phoenix,4.5,Fantasy,J.K. Rowling,-0.087273,0.420909,2668409,45724,0,0,...,0,0,0,0,0,0,0,0,0,0
2,To Kill a Mockingbird,4.28,Classics,Harper Lee,0.165686,0.368067,4772918,95595,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Pride and Prejudice,4.27,Classics,Jane Austen,0.475556,0.68,3206070,74020,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Twilight,3.61,Young Adult,Stephenie Meyer,0.24,0.62,5231000,107619,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Output File (CSV)
output_data_file = "knn_books_sentiment.csv"

# Export to CSV
df_con.to_csv(output_data_file,index=False)

In [28]:
# knn
def recommender(df, book_title, n_neighbors=11):
    
    df_sub = df_con.drop(["bookTitle", "Genre", "Author"], axis=1)
    model_knn = NearestNeighbors(metric='cosine', n_neighbors=n_neighbors)
    model_knn.fit(df_sub)
    
    book = df_con.loc[df["bookTitle"] == book_title]
    book = book.drop(["bookTitle", "Genre", "Author"], axis=1)
    book = book.to_numpy()
    
    distances, indices = model_knn.kneighbors(book, n_neighbors = n_neighbors)
    
    result = df.iloc[indices.flatten()]
    result["Distance"] = distances.flatten()
    
    return result

In [29]:

# Run this cell to get recommended anime
df_result = recommender(df_con, "Harry Potter and the Sorcerer's Stone")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result["Distance"] = distances.flatten()


In [30]:
df_result

Unnamed: 0,bookTitle,bookRating,Genre,Author,sentiment_polarity,sentiment_subjectivity,ratingCount,reviewCount,Genre_Adventure,Genre_Art,...,Author_Zoltan Andrejkovics,Author_Zora Neale Hurston,Author_Zoraida Córdova,Author_Zoë Heller,Author_kkat,Author_pleasefindthis,Author_Ágota Kristóf,Author_Åsne Seierstad,Author_Émile Zola,Distance
37,Harry Potter and the Sorcerer's Stone,4.48,Fantasy,J.K. Rowling,0.039942,0.74116,7550262,119673,0,0,...,0,0,0,0,0,0,0,0,0,1.110223e-16
184,"Oh, the Places You'll Go!",4.33,Childrens,Dr. Seuss,0.32381,0.513095,350221,5547,0,0,...,0,0,0,0,0,0,0,0,0,1.461153e-10
831,Deception Point,3.73,Fiction,Dan Brown,0.004762,0.740476,613205,9739,0,0,...,0,0,0,0,0,0,0,0,0,5.284694e-10
204,Schindler's List,4.33,History,Thomas Keneally,0.044444,0.5,150981,2390,0,0,...,0,0,0,0,0,0,0,0,0,6.50791e-10
8066,Silverthorn,4.08,Fantasy,Raymond E. Feist,-0.061111,0.665278,57497,913,0,0,...,0,0,0,0,0,0,0,0,0,3.259324e-09
174,The Joy Luck Club,3.94,Fiction,Amy Tan,0.110963,0.439483,613833,9782,0,0,...,0,0,0,0,0,0,0,0,0,3.694687e-09
6666,The Cardinal of the Kremlin,4.06,Fiction,Tom Clancy,0.29688,0.458239,49137,777,0,0,...,0,0,0,0,0,0,0,0,0,4.532045e-09
136,The Little House Collection,4.34,Classics,Laura Ingalls Wilder,0.127383,0.489182,144878,2283,0,0,...,0,0,0,0,0,0,0,0,0,4.722766e-09
2262,Burnt Offerings,4.07,Fantasy,Laurell K. Hamilton,0.15,0.25,66786,1064,0,0,...,0,0,0,0,0,0,0,0,0,5.355296e-09
4233,Dragons of Winter Night,4.09,Fantasy,Margaret Weis,0.05,0.45,49338,785,0,0,...,0,0,0,0,0,0,0,0,0,5.663283e-09
