# Imports

In [118]:
import numpy as np
import pandas as pd
import sqlalchemy as db

from functools import reduce

# Data

In [133]:
# Directory (relative to this notebook) used when building the CSV export path at the end of the notebook.
data_dir = '../Data'

In [2]:
# SQLAlchemy engine for the SQLite file books.db.
# NOTE(review): the '../Data' prefix is repeated here instead of reusing data_dir.
engine = db.create_engine('sqlite:///../Data/books.db')

In [3]:
# Open a connection used by the read below.
# NOTE(review): no connection.close() appears in the visible cells — confirm it is released elsewhere.
connection = engine.connect()

In [4]:
# Read the whole 'books_meta_data' table into a DataFrame.
books_meta_df = pd.read_sql_table('books_meta_data', connection)

In [5]:
# Preview the first rows (columns shown: book_id, genre, description, date_time).
books_meta_df.head()

Unnamed: 0,book_id,genre,description,date_time
0,1553,Fiction / Fantasy / Urban,First time in trade paperback: the third novel...,2020-07-18T17:29:50.879040
1,2090,Fiction / Fantasy / Contemporary,Animator and vampire hunter Anita Blake is abo...,2020-07-18T17:29:51.297302
2,2341,Fiction / Horror,In the sixth adventure in the #1 New York Time...,2020-07-18T17:29:51.793606
3,2549,Fiction / Mystery & Detective / Women Sleuths,"The Anita Blake, Vampire Hunter backlist light...",2020-07-18T17:29:52.290241
4,2550,Fiction / Fantasy / Dark Fantasy,Anita Blake is a vampire hunter. But when some...,2020-07-18T17:29:52.724854


In [6]:
# Placeholder only: all_genres is reassigned from the reduce() result further down.
all_genres = set()

In [7]:
# One set of raw (not yet stripped) genre labels per book, as an object array.
genre_label_lists = books_meta_df['genre'].str.split('/')
split_genres = genre_label_lists.map(set).to_numpy()

In [8]:
def genres_union(a, b):
    """Return the union of two genre sets without modifying either input.

    Intended as the folding function for ``reduce``: ``a`` is the set
    accumulated so far and ``b`` is the next per-book set.

    Parameters
    ----------
    a : iterable of str
        Genres accumulated so far.
    b : set of str
        Genres of the next book.

    Returns
    -------
    set
        A new set containing every element of ``a`` and ``b``.
    """
    # Build a new set rather than calling b.update(a): an in-place update
    # would rewrite the per-book sets that the caller still holds.
    return b.union(a)

# Fold every per-book genre set into one set of all (still unstripped) genre labels.
all_genres = reduce(genres_union, split_genres, set())

In [9]:
# Strip surrounding whitespace left by splitting on '/'; labels that become equal collapse.
all_genres = {label.strip() for label in all_genres}

In [66]:
# Number of distinct genre labels after stripping whitespace.
len(all_genres)

232

In [99]:
def create_genre_table(books_meta_df):
    """Build a one-hot book x genre table from the book metadata.

    Each ``genre`` value is split on ``'/'`` and every part is stripped of
    surrounding whitespace; each distinct part becomes a column.

    Parameters
    ----------
    books_meta_df : pandas.DataFrame
        Must contain a ``book_id`` column and a ``genre`` column holding
        '/'-separated genre strings.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``book_id``, one ``Int64`` column per genre label, with 1
        where the book carries the label and 0 otherwise. Columns appear in
        order of first occurrence in the input.

    Raises
    ------
    AttributeError
        If a ``genre`` value is not a string (it has no ``split`` method).
    """

    def extract_genres(s):
        # dict.fromkeys removes duplicates while keeping first-appearance
        # order, so the resulting column order is reproducible between runs.
        return list(dict.fromkeys(part.strip() for part in s.split('/')))

    # Collect one record per book and build the frame once; appending to a
    # DataFrame inside the loop copies the whole frame on every iteration.
    records = []
    for book_id, genre_string in zip(books_meta_df['book_id'], books_meta_df['genre']):
        record = dict.fromkeys(extract_genres(genre_string), 1)
        record['book_id'] = book_id
        records.append(record)

    if not records:
        # No books: return an empty table that still carries the index name.
        return pd.DataFrame(columns=['book_id']).set_index('book_id')

    books_genres_df = pd.DataFrame(records)
    # Genres missing from a record come out as NaN; they mean "not present".
    books_genres_df = books_genres_df.fillna(0).astype('Int64')
    return books_genres_df.set_index('book_id')

In [100]:
# Build the book x genre indicator table from the metadata loaded earlier.
books_genres_df = create_genre_table(books_meta_df)

In [106]:
# Alphabetical list of the genre columns, to eyeball near-duplicate labels.
sorted(books_genres_df.columns.tolist())

['19th Century Young Adult Fiction',
 'Action & Adventure',
 'Action & Adventure Fiction',
 'Adaptations Young Adult Fiction',
 'Adolescence',
 'Adoption Young Adult Fiction',
 'Alien Contact',
 'Alien Contact Fiction',
 'Alternative History',
 'Amateur Sleuth Fiction',
 'Ancient World',
 'Animals',
 'Apocalyptic & Post-Apocalyptic',
 'Apologetics Religion',
 'Arthurian Fiction',
 'Arthurian Young Adult Fiction',
 'Bears Juvenile Fiction',
 'Biography & Autobiography',
 'Black Humor Fiction',
 'Boarding School & Prep School',
 'Business, Careers, Occupations Juvenile Fiction',
 'Cats',
 'Cats Juvenile Fiction',
 'Christian',
 'Christian Life',
 'Christian Theology',
 'Christianity',
 'Class Differences Young Adult Fiction',
 'Classics Fiction',
 'Clean & Wholesome',
 'Clean & Wholesome Fiction',
 'Comics & Graphic Novels',
 'Coming of Age Fiction',
 'Contemporary',
 'Contemporary Fiction',
 'Contemporary Young Adult Fiction',
 'Cozy',
 'Crime',
 'Crime Fiction',
 'Cyberpunk Fiction',
 

In [107]:
# Display the indicator table (index: book_id, one column per genre label).
books_genres_df

Unnamed: 0_level_0,Fantasy,Fiction,Urban,Contemporary,Horror,Mystery & Detective,Women Sleuths,Dark Fantasy,Paranormal,Romance,...,Adaptations Young Adult Fiction,Black Humor Fiction,Clean & Wholesome Fiction,Cozy,Fantasy & Magic Young Adult Fiction,Bears Juvenile Fiction,Values & Virtues,Alien Contact,Cyberpunk Fiction,Technological
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1553,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2090,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2341,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2549,0,1,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2550,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1972,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2673,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2730,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2859,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


# Cosine Similarity

$$
\text{Similarity}(A, B) = \frac{A \cdot B}{\|A\| \, \|B\|} = \frac{\sum_{i=1}^{n} A_i B_i}{\sqrt{\sum_{i=1}^{n} A_i^2} \, \sqrt{\sum_{i=1}^{n} B_i^2}}
$$

In [125]:
def create_similarity_matrix(books_genres_df):
    """Compute the pairwise cosine similarity between all rows of a table.

    Parameters
    ----------
    books_genres_df : pandas.DataFrame
        One row per book and one numeric column per genre. Missing entries
        are treated as 0.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(num_books, num_books)`` where entry
        ``[i, j]`` is ``(row_i . row_j) / (||row_i|| * ||row_j||)``.
        A row that is all zeros has norm 0, so its entries are NaN (0/0).
    """
    # Cast to a plain float64 array so the matrix product below works for
    # nullable Int64 columns as well.
    genre_vectors = books_genres_df.fillna(0).to_numpy(dtype=float)

    # A single matrix product yields every pairwise dot product at once,
    # replacing a Python-level loop over all (i, j) pairs.
    dot_products = genre_vectors @ genre_vectors.T
    norms = np.sqrt((genre_vectors ** 2).sum(axis=1))

    # Zero-norm rows divide 0 by 0; silence the warning and keep the NaN.
    with np.errstate(divide='ignore', invalid='ignore'):
        similarity_matrix = dot_products / (norms[:, np.newaxis] * norms[np.newaxis, :])

    return similarity_matrix

# Pairwise cosine similarity between the genre vectors of all books (rows/columns follow books_genres_df row order).
genre_similarity_matrix = create_similarity_matrix(books_genres_df)

In [126]:
# Display the raw NumPy similarity matrix.
genre_similarity_matrix

array([[1.        , 0.66666667, 0.40824829, ..., 0.25819889, 0.25819889,
        0.25819889],
       [0.66666667, 1.        , 0.40824829, ..., 0.25819889, 0.25819889,
        0.25819889],
       [0.40824829, 0.40824829, 1.        , ..., 0.31622777, 0.31622777,
        0.31622777],
       ...,
       [0.25819889, 0.25819889, 0.31622777, ..., 1.        , 0.8       ,
        1.        ],
       [0.25819889, 0.25819889, 0.31622777, ..., 0.8       , 1.        ,
        0.8       ],
       [0.25819889, 0.25819889, 0.31622777, ..., 1.        , 0.8       ,
        1.        ]])

In [127]:
# Label both axes with book_id so similarities can be looked up by book.
book_ids = books_genres_df.index
genre_similarity_matrix_df = pd.DataFrame(
    genre_similarity_matrix,
    index=book_ids,
    columns=book_ids,
)

In [128]:
# Preview the labelled similarity matrix.
genre_similarity_matrix_df.head()

book_id,1553,2090,2341,2549,2550,3510,3874,3990,4425,4656,...,3035,3431,3940,4802,4876,1972,2673,2730,2859,5655
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1553,1.0,0.666667,0.408248,0.333333,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,...,0.235702,0.235702,0.235702,0.235702,0.235702,0.258199,0.258199,0.258199,0.258199,0.258199
2090,0.666667,1.0,0.408248,0.333333,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,...,0.235702,0.235702,0.235702,0.235702,0.235702,0.258199,0.258199,0.258199,0.258199,0.258199
2341,0.408248,0.408248,1.0,0.408248,0.408248,0.408248,0.408248,0.408248,0.408248,0.408248,...,0.288675,0.288675,0.288675,0.288675,0.288675,0.316228,0.316228,0.316228,0.316228,0.316228
2549,0.333333,0.333333,0.408248,1.0,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,...,0.471405,0.471405,0.471405,0.471405,0.471405,0.258199,0.258199,0.258199,0.258199,0.258199
2550,0.666667,0.666667,0.408248,0.333333,1.0,0.666667,0.666667,1.0,0.666667,0.666667,...,0.235702,0.235702,0.235702,0.235702,0.235702,0.258199,0.258199,0.258199,0.258199,0.258199


In [134]:
# Persist the labelled similarity matrix next to the source database.
similarity_csv_path = data_dir + '/genre_similarity_matrix_df.csv'
genre_similarity_matrix_df.to_csv(similarity_csv_path)