In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from scipy.spatial.distance import pdist, squareform
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import NMF
from sklearn.decomposition import PCA
scale = MinMaxScaler(feature_range=(0,1))


In [2]:
df = pd.read_csv("book_features_data.csv", parse_dates=[6, 17], dtype={'ebook':int})
maturity = {'NOT_MATURE':0, 'MATURE':1}
df.maturity = df.maturity.replace(maturity)
for i in ['pages','google_ratings', 'total_google_ratings', 'price']:
    df[i] = df[i].fillna(0).astype(np.int64)

#drop the duplicate of Salems Lot which is the only duplicate
#drop unnecessary columns
df = df.drop(index=[3, 4, 5]).reset_index()
df = df.drop(columns=['level_0', "Unnamed: 0"])

In [3]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9915 entries, 0 to 9914
Data columns (total 26 columns):
titles                       9915 non-null object
author                       9915 non-null object
pages                        9915 non-null int64
genres                       9632 non-null object
pub_date                     9850 non-null object
publisher                    9915 non-null object
google_ratings               9915 non-null int64
total_google_ratings         9915 non-null int64
maturity                     9876 non-null float64
price                        9915 non-null int64
ebook                        9915 non-null float64
index                        9915 non-null int64
id                           9915 non-null int64
book_id                      9915 non-null int64
books_count                  9915 non-null int64
original_publication_year    9895 non-null float64
original_title               9915 non-null object
short_title                  9915 non-null objec

In [4]:
#we see the most popular genres
#these roughly 40 genres account for 9170 books (that's 95% of all books with an identified title)
#and 8 of these genres account for 8228 books or roughly 85%

df.groupby('genres')['titles'].count()[df.groupby('genres')['titles'].count() >= 5].sort_values(ascending = False)

genres
Fiction                          5170
Juvenile Fiction                 1354
Young Adult Fiction               577
Biography & Autobiography         431
Comics & Graphic Novels           299
Business & Economics              142
History                           136
Religion                          119
Drama                              87
Humor                              63
Self-Help                          62
Philosophy                         57
Science                            55
Juvenile Nonfiction                53
Cooking                            48
Psychology                         45
Poetry                             43
Social Science                     43
Art                                39
Body, Mind & Spirit                31
Family & Relationships             30
Travel                             28
FICTION                            27
Children's stories                 22
Literary Collections               22
Literary Criticism                 22
Polit

In [5]:
#here is a list of the most prolific authors from our set of books
#we have some very recognizable names 

df.groupby('author')['titles'].count()[df.groupby('author')['titles'].count() >= 20].sort_values(ascending = False)

author
James Patterson        98
Stephen King           79
Nora Roberts           62
Dean Koontz            52
Terry Pratchett        48
Agatha Christie        42
J.D. Robb              41
Meg Cabot              37
Janet Evanovich        37
Neil Gaiman            34
John Grisham           34
David Baldacci         34
Laurell K. Hamilton    31
Michael Connelly       29
John Sandford          28
Kristen Ashley         27
Rick Riordan           27
Anne Rice              26
Tamora Pierce          26
Harlan Coben           25
Jim Butcher            25
William Shakespeare    25
Patricia Cornwell      25
Sherrilyn Kenyon       25
Lee Child              24
Jodi Picoult           24
Sue Grafton            24
Orson Scott Card       23
Douglas Preston        23
Dr. Seuss              22
Brandon Sanderson      22
C.S. Lewis             22
Charlaine Harris       22
Richelle Mead          22
Abbi Glines            22
Mary Higgins Clark     21
Haruki Murakami        21
Isaac Asimov           21
David

In [6]:
#here we are examining what are the most popular tags given by the goodbooks users
#we do see some of the genres from eariler
#however, we see some more fascinating tags as well, such as,
# sci-fi, vampiers, dystopia, paranormal
main = df.groupby('main_tag')['titles'].count()[df.groupby('main_tag')['titles'].count() >= 50].sort_values(ascending = False)
second = df.groupby('secondary_tag')['titles'].count()[df.groupby('secondary_tag')['titles'].count() >= 50].sort_values(ascending = False)
third = df.groupby('minor_tag')['titles'].count()[df.groupby('minor_tag')['titles'].count() >= 50].sort_values(ascending = False)

popular_tags = {}
for i in [main, second, third]:
    for j, k in enumerate(i.index):
        if k not in popular_tags.keys():
            popular_tags[k] = 0
        popular_tags[k] += i[j]
        
print(popular_tags)

{'fiction': 4481, 'fantasy': 1807, 'young-adult': 1504, 'mystery': 1126, 'non-fiction': 1343, 'romance': 1262, 'classics': 834, 'historical-fiction': 828, 'science-fiction': 496, 'horror': 374, 'graphic-novels': 330, 'picture-books': 165, 'childrens': 451, 'chick-lit': 325, 'history': 291, 'urban-fantasy': 122, 'vampires': 248, 'manga': 97, 'thriller': 511, 'paranormal-romance': 81, 'comics': 177, 'business': 78, 'james-patterson': 72, 'new-adult': 146, 'philosophy': 70, 'humor': 275, 'christian': 115, 'paranormal': 507, 'poetry': 59, 'biography': 269, 'christian-fiction': 56, 'memoir': 296, 'plays': 52, 'short-stories': 162, 'sci-fi': 358, 'contemporary': 568, 'children-s-books': 209, 'dystopian': 82, 'contemporary-romance': 163, 'crime': 192, 'series': 228, 'historical': 201, 'classic': 112, 'dystopia': 97, 'graphic-novel': 97, 'children': 95, 'adventure': 86, 'literature': 76, 'epic-fantasy': 68, 'memoirs': 52, 'psychology': 51}


In [7]:
#we move to create dummy variables for each genre
#since we don't want to use every genre, we select only the 40 popular genres we look at earlier
genres = df.groupby('genres')['titles'].count()[df.groupby('genres')['titles'].count() >= 5].sort_values(ascending = False).index
test = df[['book_id', 'genres']][df.genres.isin(genres)]
genre_dummies = pd.get_dummies(test, prefix='', prefix_sep='')
genre_dummies

Unnamed: 0,book_id,Adventure stories,Animals,Art,Biography & Autobiography,"Body, Mind & Spirit",Brothers,Business & Economics,Children's stories,"Comic books, strips, etc",...,Political Science,Psychology,Religion,Science,Self-Help,Social Science,Sports & Recreation,Travel,True Crime,Young Adult Fiction
0,82747,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,18667945,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,11590,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4912,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,11525,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5,18505796,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
7,11339,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,76889,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,9436746,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,4835838,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
#we also create dummies for each of the popular tags
#we want to be cautious because even though when we pulled the data, we tried to be careful about the tags we used
#these tags still are not super clean therefore could easily lead to unecessary duplicates
df1 = df[['book_id', 'main_tag']]
df1.columns = ['book_id', 'tags']
df2 = df[['book_id', 'secondary_tag']]
df2.columns = ['book_id', 'tags']
df3 = df[['book_id', 'minor_tag']]
df3.columns = ['book_id', 'tags']
concat_df = pd.concat([df1, df2, df3], ignore_index=True)
tags = concat_df[concat_df.tags.isin(popular_tags.keys())]
tag_dummies = pd.get_dummies(tags, prefix='', prefix_sep='')
print(tag_dummies)


tag_dummies = tag_dummies.groupby('book_id').sum(axis=1)

        book_id  adventure  biography  business  chick-lit  children  \
0         82747          0          0         0          0         0   
1      18667945          0          0         0          0         0   
2         11590          0          0         0          0         0   
3          4912          0          0         0          0         0   
5      18505796          0          0         0          0         0   
6         94510          0          0         0          0         0   
7         11339          0          0         0          0         0   
8         76889          0          0         0          0         0   
9       9436746          0          0         0          0         0   
10      4835838          0          0         0          0         0   
11     10644930          0          0         0          0         0   
12     11872242          0          0         0          0         0   
13     15734464          0          0         0          0      

In [9]:
#we see that the majority of the language codes
#which I have to assume is the language the book would have been translated from
#are predominatly english 
#this may just add unnecessary noise to the data
#but we should incorporate the langauge codes into the book feature space
#we create dummies for each
print(df.groupby('language_code')['titles'].count())
lang_dummies = pd.get_dummies(df[["book_id", "language_code"]], columns=["language_code"], prefix='', prefix_sep="")
print(lang_dummies)

language_code
dan         3
en          4
en-CA      58
en-GB     257
en-US    2070
eng      6339
fre        25
ger        13
ind        21
ita         2
jpn         4
mul         1
nl          1
nor         3
por         6
rum         1
spa        20
swe         1
tur         1
vie         1
Name: titles, dtype: int64
       book_id  dan  en  en-CA  en-GB  en-US  eng  fre  ger  ind ...   jpn  \
0        82747    0   0      0      0      1    0    0    0    0 ...     0   
1     18667945    0   0      0      0      0    1    0    0    0 ...     0   
2        11590    0   0      0      1      0    0    0    0    0 ...     0   
3         4912    0   0      0      0      0    1    0    0    0 ...     0   
4        11525    0   0      0      0      0    0    0    0    0 ...     0   
5     18505796    0   0      0      0      0    1    0    0    0 ...     0   
6        94510    0   0      0      0      0    1    0    0    0 ...     0   
7        11339    0   0      0      0      0    1    0 

In [10]:
#we can now create the book feature space
#we have to drop the columns that won't be useful in measuring the distance between books
#we then add all of the dummy variables we have been creating
#we then clean it up a little bit further 
#have a book feature space to compute distance (similarity)
dummies = [genre_dummies, tag_dummies, lang_dummies]
book_fs = df.copy()
book_fs = book_fs.drop(columns = ['genres', 'publisher', 'pub_date', 'index', 'id', 'short_title','language_code', 'work_text_reviews_count', 'main_tag', 'secondary_tag', 'minor_tag'])
for i in dummies:
    book_fs = pd.merge(book_fs, i, on ='book_id', how='left')
book_fs = book_fs.fillna(0)
book_fs.index = [book_fs.titles, book_fs.author, book_fs.book_id,]
book_fs = book_fs.drop(columns=['titles', 'author', 'original_title', 'book_id'])
book_fs

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,pages,google_ratings,total_google_ratings,maturity,price,ebook,books_count,original_publication_year,goodbook_rating,ratings_count,...,jpn,mul,nl,nor,por,rum,spa,swe,tur,vie
titles,author,book_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
"Angels (Walsh Family, #3)",Marian Keyes,82747,448,4,9,0.0,10,1.0,67,2002.0,3.73,25680,...,0,0,0,0,0,0,0,0,0,0
#GIRLBOSS,Sophia Amoruso,18667945,256,3,2595,0.0,11,1.0,35,2014.0,3.71,40090,...,0,0,0,0,0,0,0,0,0,0
'Salem's Lot,Stephen King,11590,560,4,153,0.0,8,1.0,211,1975.0,3.99,228680,...,0,0,0,0,0,0,0,0,0,0
"'Tis (Frank McCourt, #2)",Frank McCourt,4912,368,3,65,0.0,12,1.0,85,1999.0,3.66,40726,...,0,0,0,0,0,0,0,0,0,0
"1,000 Places to See Before You Die",Patricia Schultz,11525,1200,5,1,0.0,0,0.0,28,2003.0,3.83,33025,...,0,0,0,0,0,0,0,0,0,0
"10% Happier: How I Tamed the Voice in My Head, Reduced Stress Without Losing My Edge, and Found Self-Help That Actually Works",Dan Harris,18505796,256,4,3307,0.0,11,1.0,29,2014.0,3.93,33969,...,0,0,0,0,0,0,0,0,0,0
"100 Bullets, Vol. 1: First Shot, Last Call",Brian Azzarello,94510,416,0,0,0.0,0,0.0,13,1999.0,3.82,13026,...,0,0,0,0,0,0,0,0,0,0
100 Love Sonnets,Pablo Neruda,11339,213,4,11,0.0,0,0.0,70,1959.0,4.40,11425,...,0,0,0,0,0,0,0,0,0,0
100 Selected Poems,E.E. Cummings,76889,128,4,16,0.0,15,1.0,8,1954.0,4.32,22346,...,0,0,0,0,0,0,0,0,0,0
"10th Anniversary (Women's Murder Club, #10)",James Patterson,9436746,448,3,48,0.0,9,1.0,45,2011.0,4.05,27052,...,0,0,0,0,0,0,0,0,0,0


In [11]:
#we go and normalize the variables so that they are between 0 and 1 
#this is to not overweight a particular variable when we compute the distance between book vectors
normalize_book_fs = book_fs.copy()
for i in normalize_book_fs.columns:
    if np.mean(normalize_book_fs[i]) > 1:
        x = np.array(normalize_book_fs[i])
        x = x.reshape(-1, 1)
        x = scale.fit_transform(x)
        number = []
        for num in x:
            number.append(num[0])
        normalize_book_fs[i] = number



In [12]:
normalize_book_fs

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,pages,google_ratings,total_google_ratings,maturity,price,ebook,books_count,original_publication_year,goodbook_rating,ratings_count,...,jpn,mul,nl,nor,por,rum,spa,swe,tur,vie
titles,author,book_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
"Angels (Walsh Family, #3)",Marian Keyes,82747,0.160920,0.8,0.001051,0.0,0.204082,1.0,0.019108,0.996018,0.536170,0.004794,...,0,0,0,0,0,0,0,0,0,0
#GIRLBOSS,Sophia Amoruso,18667945,0.091954,0.6,0.302977,0.0,0.224490,1.0,0.009844,0.999204,0.527660,0.007810,...,0,0,0,0,0,0,0,0,0,0
'Salem's Lot,Stephen King,11590,0.201149,0.8,0.017863,0.0,0.163265,1.0,0.060799,0.988851,0.646809,0.047282,...,0,0,0,0,0,0,0,0,0,0
"'Tis (Frank McCourt, #2)",Frank McCourt,4912,0.132184,0.6,0.007589,0.0,0.244898,1.0,0.024320,0.995222,0.506383,0.007943,...,0,0,0,0,0,0,0,0,0,0
"1,000 Places to See Before You Die",Patricia Schultz,11525,0.431034,1.0,0.000117,0.0,0.000000,0.0,0.007817,0.996284,0.578723,0.006332,...,0,0,0,0,0,0,0,0,0,0
"10% Happier: How I Tamed the Voice in My Head, Reduced Stress Without Losing My Edge, and Found Self-Help That Actually Works",Dan Harris,18505796,0.091954,0.8,0.386106,0.0,0.224490,1.0,0.008107,0.999204,0.621277,0.006529,...,0,0,0,0,0,0,0,0,0,0
"100 Bullets, Vol. 1: First Shot, Last Call",Brian Azzarello,94510,0.149425,0.0,0.000000,0.0,0.000000,0.0,0.003474,0.995222,0.574468,0.002146,...,0,0,0,0,0,0,0,0,0,0
100 Love Sonnets,Pablo Neruda,11339,0.076509,0.8,0.001284,0.0,0.000000,0.0,0.019977,0.984603,0.821277,0.001811,...,0,0,0,0,0,0,0,0,0,0
100 Selected Poems,E.E. Cummings,76889,0.045977,0.8,0.001868,0.0,0.306122,1.0,0.002027,0.983276,0.787234,0.004097,...,0,0,0,0,0,0,0,0,0,0
"10th Anniversary (Women's Murder Club, #10)",James Patterson,9436746,0.160920,0.6,0.005604,0.0,0.183673,1.0,0.012739,0.998407,0.672340,0.005082,...,0,0,0,0,0,0,0,0,0,0


In [13]:
authors = pd.DataFrame(df.author)

In [14]:
#we now want to create a feature space for the authors
#since authors are probably the most important reason someone chooses a book
#we begin with pages, we could compute the average pages an author writes per book which is an interesting feature
#but we could also see how many books fall within particular page ranges
pages = []
for i in df.author.index:
    if 1 <= df.pages[i] < 100:
        pages.append("<100")
    elif 100 <= df.pages[i] <200:
        pages.append("100 - 199")
    elif 200 <= df.pages[i] <300:
        pages.append("200 - 299")
    elif 300 <= df.pages[i] < 400:
        pages.append("300 - 399")
    elif 400 <= df.pages[i] <500:
        pages.append("400 - 499")
    elif 500 <= df.pages[i] <600:
        pages.append("500 - 599")
    elif 600 <= df.pages[i] <700:
        pages.append("600 - 699")
    elif 700 <= df.pages[i] <800:
        pages.append("700 - 799")
    elif 800 <= df.pages[i] <900:
        pages.append("800 - 899")
    elif 900 <= df.pages[i] <1000:
        pages.append("900 - 999")
    elif df.pages[i] >= 1000:
        pages.append(">=1000")
    else:
        pages.append('No Pages')

pages_to_df = pd.DataFrame(pages, columns=['pages'])
pages_df = pd.concat([authors, pages_to_df], axis=1)
pages_dummies = pd.get_dummies(pages_df, columns=['pages'], prefix='', prefix_sep='')
grouped_pages = pages_dummies.groupby('author').sum().reset_index()
grouped_pages

Unnamed: 0,author,100 - 199,200 - 299,300 - 399,400 - 499,500 - 599,600 - 699,700 - 799,800 - 899,900 - 999,<100,>=1000,No Pages
0,A. Manette Ansay,0,1,0,0,0,0,0,0,0,0,0,0
1,A. Meredith Walters,0,0,2,0,0,0,0,0,0,0,0,0
2,A.A. Milne,3,0,0,1,0,0,0,0,0,1,0,1
3,A.C. Gaughen,0,0,1,0,0,0,0,0,0,0,0,0
4,A.G. Howard,0,1,0,1,0,0,0,0,0,0,0,0
5,A.G. Riddle,0,0,1,3,0,0,0,0,0,0,0,0
6,A.J. Banner,0,0,0,0,0,0,0,0,0,0,0,1
7,A.J. Jacobs,0,0,1,2,0,0,0,0,0,0,0,0
8,A.L. Singer,0,0,0,0,0,0,0,0,0,1,0,0
9,A.M. Homes,0,0,0,1,0,0,0,0,0,0,0,0


In [15]:
#same with years, we can compute a central tendency and we can group 
#which gives a little bit more of a profile to the author
years = []
for i in df.author.index:
    if df.original_publication_year[i] < 1700:
        years.append("<1700")
    elif 1700 <= df.original_publication_year[i] <1800:
        years.append("1700 - 1799")
    elif 1800 <= df.original_publication_year[i] <1900:
        years.append("1800 - 1899")
    elif 1900 <= df.original_publication_year[i] <1910:
        years.append("1900 - 1910")
    elif 1910 <= df.original_publication_year[i] <1920:
        years.append("1910 - 1920")
    elif 1920 <= df.original_publication_year[i] <1930:
        years.append("1920 - 1930")
    elif 1930 <= df.original_publication_year[i] <1940:
        years.append("1930 - 1940")
    elif 1940 <= df.original_publication_year[i] <1950:
        years.append("1940 - 1950")
    elif 1950 <= df.original_publication_year[i] <1960:
        years.append("1950 - 1960")
    elif 1960 <= df.original_publication_year[i] <1970:
        years.append("1960 - 1970")
    elif 1970 <= df.original_publication_year[i] <1980:
        years.append("1970 - 1980")
    elif 1980 <= df.original_publication_year[i] <1990:
        years.append("1980 - 1990")
    elif 1990 <= df.original_publication_year[i] <2000:
        years.append("1990 - 2000")
    elif 2000 <= df.original_publication_year[i] <2010:
        years.append("2000 - 2010")
    else:
        years.append("2010+")
years_to_df = pd.DataFrame(years, columns=['years'])
years_df = pd.concat([authors, years_to_df], axis=1)
years_dummies = pd.get_dummies(years_df, columns=['years'], prefix='', prefix_sep='')
grouped_years = years_dummies.groupby('author').sum().reset_index()
grouped_years

Unnamed: 0,author,1700 - 1799,1800 - 1899,1900 - 1910,1910 - 1920,1920 - 1930,1930 - 1940,1940 - 1950,1950 - 1960,1960 - 1970,1970 - 1980,1980 - 1990,1990 - 2000,2000 - 2010,2010+,<1700
0,A. Manette Ansay,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,A. Meredith Walters,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0
2,A.A. Milne,0,0,0,0,5,0,0,0,1,0,0,0,0,0,0
3,A.C. Gaughen,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,A.G. Howard,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0
5,A.G. Riddle,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0
6,A.J. Banner,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
7,A.J. Jacobs,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0
8,A.L. Singer,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
9,A.M. Homes,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [16]:
google_ratings = []
for i in df.author.index:
    if df.google_ratings[i] < 1:
        google_ratings.append('No Rating')
    elif 1 <= df.google_ratings[i] < 2:
        google_ratings.append('1 - 1.9')
    elif 2 <= df.google_ratings[i] < 3:
        google_ratings.append('2 - 2.9')
    elif 3 <= df.google_ratings[i] < 4:
        google_ratings.append('3 - 3.9')
    elif 4 <= df.google_ratings[i] < 5:
        google_ratings.append('4 - 4.9')
    else:
        google_ratings.append('5')
google_ratings = pd.DataFrame(google_ratings, columns=['google_ratings'])
google_df = pd.concat([authors, google_ratings], axis=1)
google_dummies = pd.get_dummies(google_df, columns=['google_ratings'],prefix='google', prefix_sep='_')
grouped_google_ratings = google_dummies.groupby('author').sum().reset_index()
grouped_google_ratings

Unnamed: 0,author,google_1 - 1.9,google_2 - 2.9,google_3 - 3.9,google_4 - 4.9,google_5,google_No Rating
0,A. Manette Ansay,0,0,1,0,0,0
1,A. Meredith Walters,0,0,1,0,0,1
2,A.A. Milne,0,0,0,3,1,2
3,A.C. Gaughen,0,0,1,0,0,0
4,A.G. Howard,0,0,0,0,1,1
5,A.G. Riddle,0,0,1,1,0,2
6,A.J. Banner,0,1,0,0,0,0
7,A.J. Jacobs,0,0,2,1,0,0
8,A.L. Singer,0,0,0,0,0,1
9,A.M. Homes,0,0,1,0,0,0


In [17]:
goodbook_ratings = []
for i in df.author.index:
    if df.goodbook_rating[i] < 1:
        goodbook_ratings.append('No Rating')
    elif 1 <= df.goodbook_rating[i] < 2:
        goodbook_ratings.append('1 - 1.9')
    elif 2 <= df.goodbook_rating[i] < 3:
        goodbook_ratings.append('2 - 2.9')
    elif 3 <= df.goodbook_rating[i] < 4:
        goodbook_ratings.append('3 - 3.9')
    elif 4 <= df.goodbook_rating[i] < 5:
        goodbook_ratings.append('4 - 4.9')
    else:
        goodbook_ratings.append('5')
        
goodbooks_ratings = pd.DataFrame(goodbook_ratings, columns=['goodbook_ratings'])
goodbook_df = pd.concat([authors, goodbooks_ratings], axis=1)
goodbook_dummies = pd.get_dummies(goodbook_df, columns=['goodbook_ratings'],prefix='goodbook', prefix_sep='_')
grouped_goodbook_ratings = goodbook_dummies.groupby('author').sum().reset_index()
grouped_goodbook_ratings

Unnamed: 0,author,goodbook_2 - 2.9,goodbook_3 - 3.9,goodbook_4 - 4.9
0,A. Manette Ansay,0,1,0
1,A. Meredith Walters,0,2,0
2,A.A. Milne,0,0,6
3,A.C. Gaughen,0,1,0
4,A.G. Howard,0,1,1
5,A.G. Riddle,0,4,0
6,A.J. Banner,0,1,0
7,A.J. Jacobs,0,3,0
8,A.L. Singer,0,0,1
9,A.M. Homes,0,1,0


In [18]:
#summed the number of ebooks the author has
#summed the number of mature books the author has
#counted the number of books in our list the author has
grouped_ebook = df.groupby('author')['ebook'].sum().reset_index()
grouped_maturity = df.groupby('author')['maturity'].sum().reset_index()
grouped_books = df.groupby('author')['titles'].count().reset_index()

In [19]:
#using the genre dummies object we created earlier
#which essentially created a dummy variable for each genre
#we can now group by the author and sum to see the number of books each author wrote in each genre
merged_df = pd.merge(df[['author', 'book_id']], genre_dummies, on = 'book_id', how='left')
merged_df = merged_df.drop('book_id', axis = 1).fillna(0)
grouped_genres = merged_df.groupby('author').sum().reset_index()
grouped_genres

Unnamed: 0,author,Adventure stories,Animals,Art,Biography & Autobiography,"Body, Mind & Spirit",Brothers,Business & Economics,Children's stories,"Comic books, strips, etc",...,Political Science,Psychology,Religion,Science,Self-Help,Social Science,Sports & Recreation,Travel,True Crime,Young Adult Fiction
0,A. Manette Ansay,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,A. Meredith Walters,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,A.A. Milne,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,A.C. Gaughen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,A.G. Howard,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,A.G. Riddle,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,A.J. Banner,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,A.J. Jacobs,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,A.L. Singer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,A.M. Homes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
#we now do the same as above for the tags
#we see the total number of tags associated with each author
merged_df = pd.merge(df[['author', 'book_id']],tag_dummies, on='book_id', how = 'left')
merged_df = merged_df.drop('book_id', axis = 1).fillna(0)
grouped_tag = merged_df.groupby('author').sum().reset_index()
grouped_tag



Unnamed: 0,author,adventure,biography,business,chick-lit,children,children-s-books,childrens,christian,christian-fiction,...,psychology,romance,sci-fi,science-fiction,series,short-stories,thriller,urban-fantasy,vampires,young-adult
0,A. Manette Ansay,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,A. Meredith Walters,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,A.A. Milne,0.0,0.0,0.0,0.0,5.0,1.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,A.C. Gaughen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,A.G. Howard,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
5,A.G. Riddle,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6,A.J. Banner,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7,A.J. Jacobs,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,A.L. Singer,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,A.M. Homes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
#we now combine all of the dummy variables to create the author feature space
#we have 3840 unique authors over 137 variables
variables = [grouped_years, grouped_pages, grouped_maturity, grouped_google_ratings, grouped_goodbook_ratings, grouped_genres, grouped_ebook, grouped_books, grouped_tag]
author_fs = pd.DataFrame(grouped_books.author)
for i in variables:
    author_fs = pd.merge(author_fs, i, on='author', how = 'left')
author_fs.index = author_fs.author
author_fs = author_fs.drop(columns=['author'])
author_fs


Unnamed: 0_level_0,1700 - 1799,1800 - 1899,1900 - 1910,1910 - 1920,1920 - 1930,1930 - 1940,1940 - 1950,1950 - 1960,1960 - 1970,1970 - 1980,...,psychology,romance,sci-fi,science-fiction,series,short-stories,thriller,urban-fantasy,vampires,young-adult
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A. Manette Ansay,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A. Meredith Walters,0,0,0,0,0,0,0,0,0,0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
A.A. Milne,0,0,0,0,5,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A.C. Gaughen,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
A.G. Howard,0,0,0,0,0,0,0,0,0,0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
A.G. Riddle,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
A.J. Banner,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
A.J. Jacobs,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A.L. Singer,0,0,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A.M. Homes,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
#we now want to normalize by the number of books they have written 
normalize_author_fs = author_fs.divide(author_fs.titles, axis=0)
normalize_author_fs = normalize_author_fs.drop(columns=['titles'])


In [23]:
#we now want to embed the origianl author feautre space into our book feature space
#we decompose the author feature space into 12 features
pca = PCA(n_components=12)
author_fs_pca = pca.fit_transform(normalize_author_fs)
reduce_author_fs = pd.DataFrame(author_fs_pca, index=normalize_author_fs.index, columns=['author_feature_' +str(i+1) for i in range(12) ])

In [24]:
#we want to normalize these values to be between 0 and 1 to fit the other values in the book feature space
#we then merge the book feature space with this normalized reduce author feature space
for i in reduce_author_fs.columns:
    x = np.array(reduce_author_fs[i])
    x = x.reshape(-1, 1)
    x = scale.fit_transform(x)
    number = []
    for num in x:
        number.append(num[0])
    reduce_author_fs[i] = number
normalize_book_fs = pd.merge(normalize_book_fs, reduce_author_fs, left_index=True, right_index=True)

In [25]:
#we can now compute the distance metrics for books, authors
#we need to normalize the features in both feature spaces or else we get these skewed distances based on a handful of variables
#we want to treat each variable as equal
#we also round the distances to fourth decimal place
b_distances = pdist(normalize_book_fs, metric = 'euclidean')
book_distance = pd.DataFrame(squareform(b_distances), columns=df.book_id, index=book_fs.index)
book_distance = book_distance.round(4)


a_distances = pdist(normalize_author_fs, metric = 'euclidean')
author_distance = pd.DataFrame(squareform(a_distances), columns=author_fs.index, index=author_fs.index)
author_distance = author_distance.round(4)


In [26]:
#here we create the csv files for the distances
#book_distance.to_csv('book_distance.csv')
#author_distance.to_csv('author_distance.csv')


In [27]:
#import the ratings and books data and the user-book rating matrix
ratings = pd.read_csv("new_ratings.csv")
book = pd.read_csv('new_books.csv')
book_rating = pd.merge(book, ratings, on='book_id')
book_rating = book_rating[['book_id', 'title','authors', 'user_id', 'rating']]
user_book = pd.read_csv('user_book_matrix.csv', index_col=0)

In [28]:
#now that we have all of the different data accessible and cleaned
#we can now move onto recommending books
#first let us see what a given user likes to read
def user_preference(user_id):
    user_rating = book_rating[(book_rating.user_id == user_id) & (book_rating.rating > 2)]
    return user_rating

In [29]:
#we see Harry Potter, Lord of the Rings, a counple entries in the Dune series and some other books sprinkled throughout
#generally, we migth classify this person as a fan of fantasty or science-fiction based on these preferences
user_preference(314)

Unnamed: 0,book_id,title,authors,user_id,rating
0,3,Harry Potter and the Sorcerer's Stone (Harry P...,"J.K. Rowling, Mary GrandPré",314,3
900,5,Harry Potter and the Prisoner of Azkaban (Harr...,"J.K. Rowling, Mary GrandPré, Rufus Beck",314,4
1201,6,Harry Potter and the Goblet of Fire (Harry Pot...,"J.K. Rowling, Mary GrandPré",314,5
1400,1,Harry Potter and the Half-Blood Prince (Harry ...,"J.K. Rowling, Mary GrandPré",314,5
5723,976,Deception Point,Dan Brown,314,3
11746,343,Perfume: The Story of a Murderer,"Patrick Süskind, John E. Woods",314,3
23375,30,J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...,J.R.R. Tolkien,314,4
25164,106,Dune Messiah (Dune Chronicles #2),Frank Herbert,314,4
33882,760,Memories of My Melancholy Whores,"Gabriel García Márquez, Edith Grossman",314,4
42282,105,Chapterhouse: Dune (Dune Chronicles #6),Frank Herbert,314,3


In [30]:
#we now write the first recommendation function which is the simplest in terms of procedure
#we use matrix factorization to impute the ratings for books 
#we can then select the books with the highest ratings to recommend

def impute_ratings(user_id, n_components = 25, n_books = 5,return_imputed_ratings=False, view_components = False):
        #we intitalize our nmf object with # of components, we use 25 as the default
        #25 seems to be enough to be able to interpret the features and capture the general structure of this data
        nmf = NMF(n_components=n_components, random_state=5236)
        nmf.fit(user_book.values)
        #after fitting our data, we provide the option to see if the user wants to take a look at the different compoonents 
        #typically, we see different series grouped together
        if view_components == True:
            x = pd.DataFrame(nmf.components_, columns=user_book.columns)
            for i in x.index:
                print(pd.DataFrame(x.iloc[i].sort_values(axis=0, ascending = False)[:5]))
        #we then use matrix factorization to impute fill in the values, sort the values, 
        #and print out the the titles and authors of the books we recommend
        imputed_matrix = np.dot(nmf.transform(user_book.values), nmf.components_)
        imputed_df = pd.DataFrame(imputed_matrix, index=user_book.index, columns=user_book.columns)
        new_user_ratings = imputed_df.loc[user_id].sort_values(ascending = False)[:n_books]
        return df[['titles', 'author', 'genres', 'main_tag', 'secondary_tag', 'minor_tag']][df.book_id.isin([i.split(':')[1] for i in new_user_ratings.index])]
        #we also provide the option to return the imputed ratings data frame so as to not have to fit the data
        #for each user
        if return_imputed_ratings == True:
            return imputed_df

In [31]:
#we can see what this model recommended
#it would seem that this picked up the preference for maybe young adult fiction/fantasy similiar to harry potter
impute_ratings(314)

Unnamed: 0,titles,author,genres,main_tag,secondary_tag,minor_tag
1338,"Catching Fire (The Hunger Games, #2)",Suzanne Collins,Juvenile Fiction,young-adult,dystopia,fantasy
1938,"Divergent (Divergent, #1)",Veronica Roth,Young Adult Fiction,young-adult,dystopian,fiction
6385,The Battle of the Labyrinth (Percy Jackson and...,Rick Riordan,Juvenile Fiction,fantasy,young-adult,mythology
7032,The Fault in Our Stars,John Green,Young Adult Fiction,young-adult,fiction,romance
8406,The Sea of Monsters (Percy Jackson and the Oly...,Rick Riordan,Juvenile Fiction,fantasy,young-adult,mythology


In [32]:
#the next recommendation model we build looks at the average shortest distance between the books
#from the books the user has positively rated,
#we simply average the distances and select the number of books with the lowest distance

#we select the ratings that are generally positive
#select the user we are interested in
#merge the normalize book distances with the books that the user rated positively
#what we have is the books that user rated positively
#we then average the distances and select the shortest average distance
def average_shortest_dist(user_id, n_books = 5):
    #start by selecting the ratings that are generally positive
    user_books = book_rating[(book_rating.user_id == user_id) & (book_rating.rating > 2)]
    #by merging we are simply filtering the books the user has positively rated 
    merged_distance = pd.merge(book_distance, user_books, on='book_id')
    merged_distance = merged_distance.drop(columns=['book_id', 'user_id', 'rating', 'title', 'authors'])
    merged_distance = merged_distance.drop(columns= user_books.book_id)
    #we then average the distances and select from the shortest distance
    recommendations = np.mean(merged_distance).sort_values()[:n_books]
    #print the recommendation from the model
    return df[['titles', 'author', 'genres', 'main_tag', 'secondary_tag', 'minor_tag']][df.book_id.isin([i for i in recommendations.index])]


In [33]:
#unfortunately, I am not too familiar with the books from this recommendation
#however, we do see that again our model is choosing books that have a fantasy component
#it would be very interesting to see if these recommendations would be good for this user
#especially since we seem to be moving towards authors that the user may not be familiar with/rated
average_shortest_dist(314)

Unnamed: 0,titles,author,genres,main_tag,secondary_tag,minor_tag
288,A Storm of Swords: Steel and Snow (A Song of I...,George R.R. Martin,Fantasy fiction,fantasy,fiction,favourites
389,After Dark,Haruki Murakami,Fiction,fiction,japan,japanese
5308,Rita Hayworth and Shawshank Redemption: A Stor...,Stephen King,Fiction,fiction,stephen-king,rory-gilmore-reading-challenge
5855,"Song of Susannah (The Dark Tower, #6)",Stephen King,,fantasy,stephen-king,fiction
5909,Sputnik Sweetheart,Haruki Murakami,Fiction,fiction,japan,magical-realism


In [34]:
#taking the principle from the prior model
#we begin to think that maybe the majority of books are pulling the mean distance too far
#by this we mean that there may be too much noise in simply looking at mean distance
#that the signal we are interested in are smaller values closer to our user's ratings
#we decide to choose the smallest n-distances in which to take the mean distance
def average_of_mins_distance(user_id, n_books = 5, threshold = 3):
    #start by selecting the books that are generally rated positive
    user_books = book_rating[(book_rating.user_id == user_id) & (book_rating.rating > 2)]
    merged_distance = pd.merge(book_distance, user_books, on='book_id')
    merged_distance = merged_distance.drop(columns=['book_id', 'user_id', 'rating', 'title', 'authors'])
    merged_distance = merged_distance.drop(columns= user_books.book_id)
    #after we have filtered to the books that are user has rated,
    #we then select the n number of distances that happen to be the smallest values for the particluar book and take the average
    average_mins = []
    for i in merged_distance:
        average_mins.append(np.mean(merged_distance[i].sort_values()[:threshold]))
    #from there we select the small distances from these smaller averages
    avg_min_df = pd.DataFrame(average_mins, index=merged_distance.columns, columns=['Distance'])
    #print out these recommendations and their information
    return df[['titles', 'author', 'genres', 'main_tag', 'secondary_tag', 'minor_tag']][df.book_id.isin([i for i in avg_min_df.Distance.sort_values()[:n_books].index])]
        

In [35]:
#we begin to see the best of the two previous models
#we are recommending books that fit this science fiction/fantasy genre
#and recommending books that are apart of a series or from an author our user has previously rated
average_of_mins_distance(314)

Unnamed: 0,titles,author,genres,main_tag,secondary_tag,minor_tag
1408,Children of Dune (Dune Chronicles #3),Frank Herbert,Fiction,science-fiction,sci-fi,fiction
2776,God Emperor of Dune (Dune Chronicles #4),Frank Herbert,Fiction,fiction,sci-fi,science-fiction
2951,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,Juvenile Fiction,fantasy,young-adult,fiction
2954,Harry Potter and the Deathly Hallows (Harry Po...,J.K. Rowling,Juvenile Fiction,fantasy,young-adult,fiction
7220,"The Golden Compass (His Dark Materials, #1)",Philip Pullman,Juvenile Fiction,fantasy,young-adult,fiction


In [36]:
#our fourth model takes a further in complexity as we decide to try and re-create the user's vector
#to be able to deploy into the book feature space
#we can then measure which books are closest to this new vector in this feature space 
#and recommend books with the smallest distance

def user_book_vector(user_id, n_books = 5):
    #start by selecting the books that are generally rated positive
    user_books = book_rating[(book_rating.user_id == user_id) & (book_rating.rating > 2)]
    #we then filter the normalize book feature space to only include the books our user has positively rated
    user_vector = pd.merge(normalize_book_fs, user_books, on = 'book_id')
    #we then average those features to create the users vector
    #we could perform other computations to create this vecotr but average seems to be most understandable
    user_vector_df = pd.DataFrame(np.mean(user_vector)[1:-2]).T
    #get rid of the books the user has already rated from the feature space
    #to avoid recommending something they have already read
    filter_book_dist = normalize_book_fs[~normalize_book_fs.index.get_level_values('book_id').isin(user_books.book_id)]
    #create the distance matrix, select our user, and select the smallest values
    filter_book_dist = filter_book_dist.append(user_vector_df)
    user_dist = pdist(filter_book_dist, metric='euclidean')
    user_dist_matrix = squareform(user_dist)[-1]
    recommendation = pd.Series(user_dist_matrix, index=filter_book_dist.index)
    ids = []
    for i in recommendation.sort_values().index[1:n_books + 1]:
        ids.append(i[2])
    return df[['titles', 'author', 'genres', 'main_tag', 'secondary_tag', 'minor_tag']][df.book_id.isin([i for i in ids])]



In [37]:
#once again, not entirely familiar with the books we have suggested
#we do see the Buddha of Suburbia which was in one of the previous recommendation models
#maybe not the best model
user_book_vector(314)

Unnamed: 0,titles,author,genres,main_tag,secondary_tag,minor_tag
288,A Storm of Swords: Steel and Snow (A Song of I...,George R.R. Martin,Fantasy fiction,fantasy,fiction,favourites
389,After Dark,Haruki Murakami,Fiction,fiction,japan,japanese
5308,Rita Hayworth and Shawshank Redemption: A Stor...,Stephen King,Fiction,fiction,stephen-king,rory-gilmore-reading-challenge
5855,"Song of Susannah (The Dark Tower, #6)",Stephen King,,fantasy,stephen-king,fiction
5909,Sputnik Sweetheart,Haruki Murakami,Fiction,fiction,japan,magical-realism


In [38]:
#our final takes the principles of the previous but decides to create a user vector to recommend authors
#from those authors, we then select the books which have the shortest distance to the books our user has recommended

def user_author_vector(user_id, n_books = 5,n_authors = 5, threshold = 5):
    #start by selecting the books that are generally rated positive
    user_books = book_rating[(book_rating.user_id == user_id) & (book_rating.rating > 2)]
    #from there we pull out the unique authors
    user_authors = []
    for i in user_books.authors:
        if ',' in i:
            user_authors.append(i.split(',')[0])
        else:
            user_authors.append(i)
    user_authors = list(set(user_authors))
    #we subset our author distance matrix by the authors our user has rated
    #we decided not to drop the authors the user has rated from the columns 
    #seeing as there could be some book by that author our user has not read 
    user_author_dist = author_distance.loc[user_authors]
    #we decide to further build on the average of min distances models
    #we select n authors with the shortest distance and average those distances
    average_min_dist = []
    for i in user_author_dist.columns:
        average_min_dist.append(np.mean(user_author_dist[i].sort_values()[:threshold]))
    average_min_dist_df = pd.DataFrame(average_min_dist, index=user_author_dist.columns, columns=['Distance'])
    #from those averages we select the closest number of authors and print those out
    recomend_authors = average_min_dist_df.sort_values('Distance')[:n_authors].index
    print(recomend_authors)
    #from here we now need to recommend the books with the shortest distance based on the subset of authors
    #we pull out the book ids for all books that were written by our subset of authors
    #and we filter out the columns that are not apart of this list
    book_ids = [i for i in df.book_id[df.author.isin(recomend_authors)]]
    filtered_books = book_distance[book_ids]
    #we now need to get rid of the books our user has read so that we recommend something unique
    filtered_books = filtered_books.drop(columns = [i for i in user_books.book_id if i in book_ids])
    #we select observations for which our user has rated those particular books
    #we now have a complete subset of the book distance matrix
    #in which the rows are the books the user has rated and the columns are the subset of books by our recommended authors
    subset_book_distances = filtered_books[filtered_books.index.get_level_values(2).isin(user_books.book_id)]
    #we now continue with the average of min distance models 
    #by selecting the n number of books with the smallest distances and averaging those
    subset_user_author = []
    for i in subset_book_distances:
        subset_user_author.append(np.mean(subset_book_distances[i].sort_values()[:threshold]))
    recommendations = pd.DataFrame(subset_user_author, index = subset_book_distances.columns, columns=['Distance'])
    #finally we print our recommendations
    return df[['titles', 'author', 'genres', 'main_tag', 'secondary_tag', 'minor_tag']][df.book_id.isin([i for i in recommendations.sort_values('Distance')[:threshold].index])]


In [39]:
#we see similiar results to that of the averages of the min distances model
#we see a unique recommendation that fits within the fantasy/science-fiction realm that this particular user prefers
#we also see other books that are written by authors they seem to enjoy
user_author_vector(314)

Index(['Robert A. Heinlein', 'Frank Herbert', 'Kurt Vonnegut Jr.',
       'Gabriel García Márquez', 'Margaret Atwood'],
      dtype='object', name='author')


Unnamed: 0,titles,author,genres,main_tag,secondary_tag,minor_tag
1408,Children of Dune (Dune Chronicles #3),Frank Herbert,Fiction,science-fiction,sci-fi,fiction
1428,Chronicle of a Death Foretold,Gabriel García Márquez,Fiction,fiction,classics,magical-realism
2776,God Emperor of Dune (Dune Chronicles #4),Frank Herbert,Fiction,fiction,sci-fi,science-fiction
5773,"Slapstick, or Lonesome No More!",Kurt Vonnegut Jr.,Fiction,fiction,vonnegut,science-fiction
8222,The Puppet Masters,Robert A. Heinlein,Fiction,science-fiction,sci-fi,fiction


In [40]:
#for i in author_distance.columns:
#    print(author_distance[i].sort_values()[1:7])

In [41]:
#for i in book_distance.columns:
#    print(book_distance[i].sort_values()[1:4])

In [42]:
#we can now alos add those central tendency values
#print(df.groupby('author')['original_publication_year'].mean().round(0).head())
#print(df.groupby('author')['pages'].mean().round(0).head())
#print(df.groupby('author')['google_ratings'].mean().round(2).sort_values(ascending = False).head())
#print(df.groupby('author')['goodbook_rating'].mean().round(2).sort_values(ascending = False).head())
#print(df.groupby('author')['total_google_ratings'].mean().round(2).sort_values(ascending = False).head())
#print(df.groupby('author')['ratings_count'].mean().round(2).sort_values(ascending = False).head())