# Imports

In [135]:
import numpy as np
import pandas as pd
import sqlalchemy as db

from functools import reduce

# Data

In [136]:
data_dir = '../Data'

In [137]:
engine = db.create_engine('sqlite:///../Data/books.db')

In [138]:
connection = engine.connect()

In [139]:
books_meta_df = pd.read_sql_table('books_meta_data', connection)

In [140]:
books_meta_df.head()

Unnamed: 0,book_id,genre,description,date_time
0,1553,Fiction / Fantasy / Urban,First time in trade paperback: the third novel...,2020-07-18T17:29:50.879040
1,2090,Fiction / Fantasy / Contemporary,Animator and vampire hunter Anita Blake is abo...,2020-07-18T17:29:51.297302
2,2341,Fiction / Horror,In the sixth adventure in the #1 New York Time...,2020-07-18T17:29:51.793606
3,2549,Fiction / Mystery & Detective / Women Sleuths,"The Anita Blake, Vampire Hunter backlist light...",2020-07-18T17:29:52.290241
4,2550,Fiction / Fantasy / Dark Fantasy,Anita Blake is a vampire hunter. But when some...,2020-07-18T17:29:52.724854


In [141]:
all_genres = set()

In [142]:
split_genres = books_meta_df.genre.str.split('/').map(set).values

In [143]:
def genres_union(a, b):
    b.update(a)
    return b

all_genres = reduce(genres_union, split_genres, set())

In [144]:
all_genres = set(map(str.strip, all_genres))

In [145]:
len(all_genres)

364

In [146]:
def create_genre_table(books_meta_df):
    
    def extract_genres(s):
        return set(map(str.strip, set(s.split('/'))))

    books_genres_df = pd.DataFrame()

    for index in books_meta_df.index:
        genre_sub_set = extract_genres(books_meta_df.loc[index, 'genre'])
        genre_sub_dict = {key:1 for key in genre_sub_set}
        genre_sub_dict['book_id'] = books_meta_df.loc[index, 'book_id']
        books_genres_df = books_genres_df.append(genre_sub_dict, ignore_index=True)

    books_genres_df = books_genres_df.astype('Int64')
    books_genres_df.fillna(0, inplace=True)
    books_genres_df.set_index('book_id', inplace=True)
    return books_genres_df

In [147]:
books_genres_df = create_genre_table(books_meta_df)

In [149]:
sorted(books_genres_df.columns.values)[:15]

['',
 '19th Century Young Adult Fiction',
 'Action & Adventure',
 'Action & Adventure Fiction',
 'Adaptations Juvenile Fiction',
 'Adaptations Young Adult Fiction',
 'Adolescence',
 'Adoption Young Adult Fiction',
 'Advertising & Promotion Business & Economics',
 'African American',
 'Alien Contact',
 'Alien Contact Fiction',
 'Alternative History',
 'Alternative History Fiction',
 'Amateur Sleuth Fiction']

In [150]:
books_genres_df

Unnamed: 0_level_0,Fantasy,Fiction,Urban,Contemporary,Horror,Mystery & Detective,Women Sleuths,Dark Fantasy,Paranormal,Romance,...,Middle East,Middle Eastern,World,African American,Unnamed: 16_level_0,"English, Irish, Scottish, Welsh Drama",European,General Drama,Shakespeare Performing Arts,Theater
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1553,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2090,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2341,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2549,0,1,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2550,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9946,0,0,0,0,0,0,0,0,0,0,...,1,1,1,0,0,0,0,0,0,0
2508,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
7002,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
773,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


# Cosine Similarity

$
\begin{align}
Similarity(A, B) = \frac{A . B}{\|{A}\| \|{B}\|} = \frac{\sum_{i=1}^nA_iB_i}{\sqrt{\sum_{i=1}^n{A_i}^2}\sqrt{\sum_{i=1}^n{B_i}^2}}
\end{align}
$

In [172]:
def create_similarity_matrix(books_genres_df):
    
    def cosine_similarity(x, y):
        a_dot_b = x.mul(y).sum()
        x_squared = x.pow(2).sum()
        y_squared = y.pow(2).sum()
        similarity = a_dot_b/(np.sqrt(x_squared)*np.sqrt(y_squared))
        return similarity

    num_books = books_genres_df.shape[0]
    similarity_matrix = np.zeros((num_books, num_books))

    for i in range(0, num_books):
        for j in range(i, num_books): 
            x = books_genres_df.iloc[i,:]
            y = books_genres_df.iloc[j,:]
            similarity = cosine_similarity(x, y)
            similarity_matrix[i, j] = similarity
            similarity_matrix[j, i] = similarity    

    return similarity_matrix

In [173]:
%%time

genre_similarity_matrix = create_similarity_matrix(books_genres_df)

CPU times: user 24min 23s, sys: 7.27 s, total: 24min 30s
Wall time: 24min 32s


In [174]:
genre_similarity_matrix

array([[1.        , 0.66666667, 0.40824829, ..., 0.25819889, 0.        ,
        0.        ],
       [0.66666667, 1.        , 0.40824829, ..., 0.25819889, 0.        ,
        0.        ],
       [0.40824829, 0.40824829, 1.        , ..., 0.31622777, 0.        ,
        0.        ],
       ...,
       [0.25819889, 0.25819889, 0.31622777, ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [175]:
genre_similarity_matrix_df = pd.DataFrame(genre_similarity_matrix, index=books_genres_df.index, columns=books_genres_df.index.values)

In [176]:
genre_similarity_matrix_df.head()

Unnamed: 0_level_0,1553,2090,2341,2549,2550,3510,3874,3990,4425,4656,...,9573,7238,9911,969,7950,9946,2508,7002,773,804
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1553,1.0,0.666667,0.408248,0.333333,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,...,0.0,0.471405,0.774597,0.0,0.0,0.0,0.258199,0.258199,0.0,0.0
2090,0.666667,1.0,0.408248,0.333333,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,...,0.0,0.471405,0.516398,0.0,0.0,0.0,0.258199,0.258199,0.0,0.0
2341,0.408248,0.408248,1.0,0.408248,0.408248,0.408248,0.408248,0.408248,0.408248,0.408248,...,0.0,0.288675,0.316228,0.0,0.0,0.0,0.316228,0.316228,0.0,0.0
2549,0.333333,0.333333,0.408248,1.0,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,...,0.0,0.235702,0.258199,0.0,0.0,0.0,0.258199,0.258199,0.0,0.0
2550,0.666667,0.666667,0.408248,0.333333,1.0,0.666667,0.666667,1.0,0.666667,0.666667,...,0.0,0.471405,0.516398,0.0,0.0,0.0,0.258199,0.258199,0.0,0.0


In [180]:
genre_similarity_matrix_df.loc[genre_similarity_matrix_df.index== 2090, 2341]

book_id
2090    0.408248
Name: 2341, dtype: float64

In [181]:
genre_similarity_matrix_df.to_csv(data_dir+'/genre_similarity_matrix_df.csv')