In [102]:
# data science imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

# data processing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from scipy.cluster.hierarchy import dendrogram, linkage

# unsupervised algorithms
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering

from langdetect import detect

# metrics
from sklearn.metrics import silhouette_samples, silhouette_score

In [103]:
pd.get_option('display.max_columns')

20

In [104]:
# read in file
file = 'data/goodreads_final.csv'

In [110]:
# read in file 
df = pd.read_csv(file)
df.head()

Unnamed: 0,url,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,pageCount,Author
0,https://www.goodreads.com/book/show/2767052-th...,The Hunger Games,https://i.gr-assets.com/images/S/compressed.ph...,"Could you survive on your own in the wild, wit...",4.32,6717635,176054,Young Adult,374,Suzanne Collins
1,https://www.goodreads.com/book/show/2.Harry_Po...,Harry Potter and the Order of the Phoenix,https://i.gr-assets.com/images/S/compressed.ph...,There is a door at the end of a silent corrido...,4.5,2668409,45724,Fantasy,870,J.K. Rowling
2,https://www.goodreads.com/book/show/2657.To_Ki...,To Kill a Mockingbird,https://i.gr-assets.com/images/S/compressed.ph...,The unforgettable novel of a childhood in a sl...,4.28,4772918,95595,Classics,324,Harper Lee
3,https://www.goodreads.com/book/show/1885.Pride...,Pride and Prejudice,https://i.gr-assets.com/images/S/compressed.ph...,Alternate cover edition of ISBN 9780679783268S...,4.27,3206070,74020,Classics,279,Jane Austen
4,https://www.goodreads.com/book/show/41865.Twil...,Twilight,https://i.gr-assets.com/images/S/compressed.ph...,About three things I was absolutely positive.F...,3.61,5231000,107619,Young Adult,501,Stephenie Meyer


In [111]:
print(df.Genre.nunique())

46


In [112]:
df.Genre.value_counts()


Fiction                          1861
Fantasy                          1857
Young Adult                      1042
Romance                           650
Classics                          559
Historical-Historical Fiction     504
Nonfiction                        425
Science Fiction                   372
Mystery                           320
Horror                            227
Religion                          178
Sequential Art-Graphic Novels     176
Fantasy-Paranormal                118
Paranormal-Vampires               106
Poetry                            101
History                            98
Thriller                           97
Philosophy                         92
Other                              89
Romance-Paranormal Romance         88
Womens Fiction-Chick Lit           69
Childrens                          65
Short Stories                      56
New Adult                          46
Science                            45
Contemporary                       45
Plays       

In [113]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9772 entries, 0 to 9771
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   url          9772 non-null   object 
 1   bookTitle    9772 non-null   object 
 2   bookImage    9772 non-null   object 
 3   bookDesc     9772 non-null   object 
 4   bookRating   9772 non-null   float64
 5   ratingCount  9772 non-null   int64  
 6   reviewCount  9772 non-null   int64  
 7   Genre        9772 non-null   object 
 8   pageCount    9772 non-null   int64  
 9   Author       9772 non-null   object 
dtypes: float64(1), int64(3), object(6)
memory usage: 763.6+ KB


In [114]:
df.columns

Index(['url', 'bookTitle', 'bookImage', 'bookDesc', 'bookRating',
       'ratingCount', 'reviewCount', 'Genre', 'pageCount', 'Author'],
      dtype='object')

In [115]:
# get number unique book names
df.bookTitle.nunique()

9272

In [116]:
# get the discription language and place it in new column 'lang'
df['lang'] = df.bookDesc.apply(lambda x:detect(x))
df.head()

Unnamed: 0,url,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,pageCount,Author,lang
0,https://www.goodreads.com/book/show/2767052-th...,The Hunger Games,https://i.gr-assets.com/images/S/compressed.ph...,"Could you survive on your own in the wild, wit...",4.32,6717635,176054,Young Adult,374,Suzanne Collins,en
1,https://www.goodreads.com/book/show/2.Harry_Po...,Harry Potter and the Order of the Phoenix,https://i.gr-assets.com/images/S/compressed.ph...,There is a door at the end of a silent corrido...,4.5,2668409,45724,Fantasy,870,J.K. Rowling,en
2,https://www.goodreads.com/book/show/2657.To_Ki...,To Kill a Mockingbird,https://i.gr-assets.com/images/S/compressed.ph...,The unforgettable novel of a childhood in a sl...,4.28,4772918,95595,Classics,324,Harper Lee,en
3,https://www.goodreads.com/book/show/1885.Pride...,Pride and Prejudice,https://i.gr-assets.com/images/S/compressed.ph...,Alternate cover edition of ISBN 9780679783268S...,4.27,3206070,74020,Classics,279,Jane Austen,en
4,https://www.goodreads.com/book/show/41865.Twil...,Twilight,https://i.gr-assets.com/images/S/compressed.ph...,About three things I was absolutely positive.F...,3.61,5231000,107619,Young Adult,501,Stephenie Meyer,en


In [117]:
df.lang.value_counts()

en    9350
ar      90
es      82
fr      54
de      36
id      35
pt      21
it      17
tr      15
fa      12
pl      11
nl       8
ro       6
bg       5
ja       5
hr       5
no       3
el       3
ru       3
fi       2
cs       2
ta       1
sv       1
sk       1
uk       1
af       1
tl       1
et       1
Name: lang, dtype: int64

In [118]:
# get only the english lang books
df_sub = df.loc[df.lang == 'en']
df_sub.bookTitle.nunique()
df_sub.drop('lang', axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [119]:
# drop duplicate books, including books with same title but different author
df_drop_dups = df_sub.drop_duplicates(subset = 'bookTitle')

In [120]:
df_drop_dups.head()

Unnamed: 0,url,bookTitle,bookImage,bookDesc,bookRating,ratingCount,reviewCount,Genre,pageCount,Author
0,https://www.goodreads.com/book/show/2767052-th...,The Hunger Games,https://i.gr-assets.com/images/S/compressed.ph...,"Could you survive on your own in the wild, wit...",4.32,6717635,176054,Young Adult,374,Suzanne Collins
1,https://www.goodreads.com/book/show/2.Harry_Po...,Harry Potter and the Order of the Phoenix,https://i.gr-assets.com/images/S/compressed.ph...,There is a door at the end of a silent corrido...,4.5,2668409,45724,Fantasy,870,J.K. Rowling
2,https://www.goodreads.com/book/show/2657.To_Ki...,To Kill a Mockingbird,https://i.gr-assets.com/images/S/compressed.ph...,The unforgettable novel of a childhood in a sl...,4.28,4772918,95595,Classics,324,Harper Lee
3,https://www.goodreads.com/book/show/1885.Pride...,Pride and Prejudice,https://i.gr-assets.com/images/S/compressed.ph...,Alternate cover edition of ISBN 9780679783268S...,4.27,3206070,74020,Classics,279,Jane Austen
4,https://www.goodreads.com/book/show/41865.Twil...,Twilight,https://i.gr-assets.com/images/S/compressed.ph...,About three things I was absolutely positive.F...,3.61,5231000,107619,Young Adult,501,Stephenie Meyer


In [121]:
# drop unneeded columns
df_drop_dups.drop(['url', 'bookImage', 'bookDesc', 'pageCount'], axis = 1, inplace = True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [122]:
df_drop_dups.head()

Unnamed: 0,bookTitle,bookRating,ratingCount,reviewCount,Genre,Author
0,The Hunger Games,4.32,6717635,176054,Young Adult,Suzanne Collins
1,Harry Potter and the Order of the Phoenix,4.5,2668409,45724,Fantasy,J.K. Rowling
2,To Kill a Mockingbird,4.28,4772918,95595,Classics,Harper Lee
3,Pride and Prejudice,4.27,3206070,74020,Classics,Jane Austen
4,Twilight,3.61,5231000,107619,Young Adult,Stephenie Meyer


In [162]:
df2 = df_drop_dups.copy()

keep_col = ['bookTitle', 'bookRating', 'Genre', 'Author']
df_keep = df2.loc[:, keep_col]

# for i in range(5):
#     df2.loc[ (df2['bookRating'] >= i) & (df2['bookRating'] <= i+1), 'rating_between'] = f"between {i} and {i+1}"
    
# #making a dummy col for each rating level
# ratings = df2['rating_between'].str.get_dummies(sep=",")
# ratings.head()

In [159]:
# one-hot-encode
cols = ['Genre', 'Author']
df_oh = df2.loc[:, cols]
df_oh = pd.get_dummies(df_oh)

df_oh.head()

Unnamed: 0,Genre_Adventure,Genre_Art,Genre_Autobiography-Memoir,Genre_Biography,Genre_Business,Genre_Childrens,Genre_Childrens-Picture Books,Genre_Classics,Genre_Contemporary,Genre_Culture,...,Author_Zoe Sugg,Author_Zoltan Andrejkovics,Author_Zora Neale Hurston,Author_Zoraida Córdova,Author_Zoë Heller,Author_kkat,Author_pleasefindthis,Author_Ágota Kristóf,Author_Åsne Seierstad,Author_Émile Zola
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [160]:
# scale rating count
scaler = StandardScaler()

# Scale onlt the ASK_AMT column Prof Booth
cols = ['ratingCount', 'reviewCount']
df_scaled = df2.loc[:, cols]

# scaler.fit(df_scaled)
# df_scale = pd.DataFrame(scaler.transform(df_scaled), columns= cols )
# df_scale.head()


In [163]:
df_con = pd.concat([df_keep, df_scaled, df_oh], axis=1)

df_con.head()

Unnamed: 0,bookTitle,bookRating,Genre,Author,ratingCount,reviewCount,Genre_Adventure,Genre_Art,Genre_Autobiography-Memoir,Genre_Biography,...,Author_Zoe Sugg,Author_Zoltan Andrejkovics,Author_Zora Neale Hurston,Author_Zoraida Córdova,Author_Zoë Heller,Author_kkat,Author_pleasefindthis,Author_Ágota Kristóf,Author_Åsne Seierstad,Author_Émile Zola
0,The Hunger Games,4.32,Young Adult,Suzanne Collins,6717635,176054,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Harry Potter and the Order of the Phoenix,4.5,Fantasy,J.K. Rowling,2668409,45724,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,To Kill a Mockingbird,4.28,Classics,Harper Lee,4772918,95595,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Pride and Prejudice,4.27,Classics,Jane Austen,3206070,74020,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Twilight,3.61,Young Adult,Stephenie Meyer,5231000,107619,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [164]:
# Output File (CSV)
output_data_file = "knn_books.csv"

# Export to CSV
df_con.to_csv(output_data_file,index=False)

In [173]:
# knn
def recommender(df, book_title, n_neighbors=11):
    
    df_sub = df_con.drop(["bookTitle", "Genre", "Author"], axis=1)
    model_knn = NearestNeighbors(metric='cosine', n_neighbors=n_neighbors)
    model_knn.fit(df_sub)
    
    book = df_con.loc[df["bookTitle"] == book_title]
    book = book.drop(["bookTitle", "Genre", "Author"], axis=1)
    book = book.to_numpy()
    
    distances, indices = model_knn.kneighbors(book, n_neighbors = n_neighbors)
    
    result = df.iloc[indices.flatten()]
    result["Distance"] = distances.flatten()
    
    return result

In [174]:

# Run this cell to get recommended anime
df_result = recommender(df_con, "About a Boy")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result["Distance"] = distances.flatten()


In [175]:
df_result

Unnamed: 0,bookTitle,bookRating,Genre,Author,ratingCount,reviewCount,Genre_Adventure,Genre_Art,Genre_Autobiography-Memoir,Genre_Biography,...,Author_Zoltan Andrejkovics,Author_Zora Neale Hurston,Author_Zoraida Córdova,Author_Zoë Heller,Author_kkat,Author_pleasefindthis,Author_Ágota Kristóf,Author_Åsne Seierstad,Author_Émile Zola,Distance
1327,About a Boy,3.79,Fiction,Nick Hornby,129674,3394,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
2831,The Intelligent Investor,4.22,Business,Benjamin Graham,92716,2427,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.13961e-10
363,Eldest,3.99,Fantasy,Christopher Paolini,378662,9920,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.32849e-10
130,The Godfather,4.36,Fiction,Mario Puzo,354666,9294,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6.878211e-10
5169,Sourcery,3.88,Fantasy,Terry Pratchett,86900,2277,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7.364679e-10
39,The Princess Bride,4.25,Fantasy,William Goldman,809214,21208,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9.535714e-10
0,The Hunger Games,4.32,Young Adult,Suzanne Collins,6717635,176054,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.058851e-09
320,Fried Green Tomatoes at the Whistle Stop Cafe,4.28,Fiction,Fannie Flagg,274404,7170,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.102807e-09
1213,Guards! Guards!,4.29,Fantasy,Terry Pratchett,177312,4649,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.160081e-09
3065,Sandry's Book,3.97,Fantasy,Tamora Pierce,43876,1149,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2.552749e-09
