In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the full (cleaned) book catalogue.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only runs on
# the original author's machine — consider a repo-relative path or a configurable data dir.
full_book_list = pd.read_csv("/Users/caranix/Documents/GitHub/RomantasyRec/cleaned_romantasy_data.csv")

# Load the user's read list and keep only the favourites (rated 4 or higher).
# .copy() makes the filtered frame independent of the frame it was sliced from, so adding
# columns to it later does not raise pandas' SettingWithCopyWarning.
fav_book_list = pd.read_csv("user_read_list_fantasy_romance_titles_authors.csv")
fav_book_list = fav_book_list[fav_book_list['Rating_x'] >= 4].copy()

In [4]:
# Build TF-IDF vectors for every book and for the user's favourites.
vectorizer = TfidfVectorizer(stop_words='english')


def _combine_text_columns(frame, title_col, author_col, description_col):
    """Join title, author and description into one space-separated string per row.

    Missing values are replaced with "" first: string concatenation with NaN yields NaN,
    and the vectorizer cannot process a NaN document.
    """
    title, author, description = (
        frame[col].fillna("") for col in (title_col, author_col, description_col)
    )
    return title + " " + author + " " + description


# Combine title, author, and description into a single text feature.
full_book_list['combined_features'] = _combine_text_columns(
    full_book_list, 'cleaned_title', 'cleaned_author', 'cleaned_description'
)
# .assign returns a new frame, so this is safe even when fav_book_list is a filtered slice.
fav_book_list = fav_book_list.assign(
    combined_features=_combine_text_columns(
        fav_book_list, 'cleaned_title_y', 'cleaned_author_y', 'cleaned_description'
    )
)

# Fit the vocabulary/IDF weights on the full catalogue only, then project the favourites
# into that same feature space so the two matrices are directly comparable.
tfidf_matrix_all = vectorizer.fit_transform(full_book_list['combined_features'])
tfidf_matrix_favs = vectorizer.transform(fav_book_list['combined_features'])


In [35]:
# Preview the first five favourite books, including the combined text feature.
fav_book_list.head(n=5)

Unnamed: 0,Title_x,Author,Rating_x,cleaned_title_x,cleaned_author_x,api,Title_y,cleaned_title_y,cleaned_author_y,Rating_y,Count of Ratings,cleaned_description,cleaned_publisher,cleaned_page_count_v1,cleaned_count_ratings_v1,published_year,mature,combined_features
1,Electric Idol,Katee Robert,5.0,Electric Idol,Katee Robert,https://www.googleapis.com/books/v1/volumes?q=...,Electric Idol,electric idol,katee robert,3.95,111306,instant new york time usa today bestseller bea...,other,<300,2.0,2022.0,1.0,electric idol katee robert instant new york ti...
3,The Ex Hex,Erin Sterling,5.0,The Ex Hex,Erin Sterling,https://www.googleapis.com/books/v1/volumes?q=...,The Ex Hex,ex hex,erin sterling,3.48,248969,new york time bestseller erin sterling cast de...,harper collins,<300,1.0,2021.0,1.0,ex hex erin sterling new york time bestseller ...
4,A Game of Fate,Scarlett St. Clair,5.0,A Game of Fate,Scarlett St Clair,https://www.googleapis.com/books/v1/volumes?q=...,A Game of Fate,game fate,scarlett st clair,4.0,84809,discover enthralling fantasy world god mortal ...,other,300-450,2.0,2021.0,1.0,game fate scarlett st clair discover enthralli...
6,The Selection,Kiera Cass,5.0,The Selection,Kiera Cass,https://www.googleapis.com/books/v1/volumes?q=...,The Selection,selection,kiera cass,4.08,1661933,fall lovefrom beginning discover first book ca...,harper collins,300-450,1.0,2012.0,0.0,selection kiera cass fall lovefrom beginning d...
7,Cinder,Marissa Meyer,4.0,Cinder,Marissa Meyer,https://www.googleapis.com/books/v1/volumes?q=...,Cinder,cinder,marissa meyer,4.13,970130,new york time bestselling series human android...,other,300-450,1.0,2012.0,0.0,cinder marissa meyer new york time bestselling...


In [5]:
# Pairwise cosine similarity between favourites and the full catalogue:
# row i corresponds to the i-th favourite, column j to the j-th book in full_book_list.
cosine_similarities = cosine_similarity(X=tfidf_matrix_favs, Y=tfidf_matrix_all)

In [9]:
# Row positions in full_book_list whose title also appears among the favourites.
fav_indices = np.flatnonzero(full_book_list['Title'].isin(fav_book_list['Title_x'])).tolist()

In [None]:
# Load every book the user has read (all ratings, not just favourites) and find the row
# positions of those books in full_book_list, so they can be excluded from recommendations.
read_book_list = pd.read_csv("user_read_list_fantasy_romance_titles_authors.csv")
# Match against read_book_list (not fav_book_list): otherwise books the user read but
# rated below 4 would still be recommended.
read_indices = np.where(full_book_list['Title'].isin(read_book_list['Title_x']))[0].tolist()

In [34]:
# Sanity check: position 216 is the column with the highest similarity to the first
# favourite (last entry of the argsort output), so this row should be that same book.
full_book_list.iloc[216]

api                         https://www.googleapis.com/books/v1/volumes?q=...
Title                                                           Electric Idol
cleaned_title                                                   electric idol
cleaned_author                                                   katee robert
Rating                                                                   3.95
Count of Ratings                                                      111,306
cleaned_description         instant new york time usa today bestseller bea...
cleaned_publisher                                                       other
cleaned_page_count_v1                                                    <300
cleaned_count_ratings_v1                                                    2
published_year                                                         2022.0
mature                                                                      1
combined_features           electric idol katee robert instant n

In [20]:
# Position of the second-most-similar book for the first favourite
# (the most similar one is expected to be the favourite itself).
np.argsort(cosine_similarities[0])[-2]

np.int64(60)

In [27]:
# Similarity score between the first favourite and the book at position 60.
cosine_similarities[0, 60]

np.float64(0.4395013372221967)

In [None]:
# Catalogue positions for the first favourite, ordered from least to most similar:
# so the smallest value is at index 683 and the largest is at 216.
np.argsort(cosine_similarities[0])

array([683, 955,  85, ..., 417,  60, 216])

In [None]:
# For each favourite book, pick its most similar catalogue entry that the user has not
# read yet, and aggregate those picks across all favourites.
#
# d maps a row position in full_book_list to a summary dict with keys:
#   "title", "author" - taken from full_book_list
#   "cosine_sim"      - similarity scores SUMMED over every favourite that picked this book
#   "ct"              - number of favourites that picked this book
#   "index"           - the same row position used as the key
d = {}
already_read = set(read_indices)  # set membership is O(1) instead of scanning a list

for i in range(len(cosine_similarities)):
    scores = cosine_similarities[i]

    # Sort once per favourite (most similar first). The single best match is skipped on
    # the assumption that it is the favourite itself.
    ranked = np.argsort(scores)[::-1][1:]

    # First candidate we have not read yet; None if every candidate has been read.
    idx_top_rec = next((idx for idx in ranked if idx not in already_read), None)
    if idx_top_rec is None:
        continue  # nothing left to recommend for this favourite

    cosine_sim_score_for_closest_match = scores[idx_top_rec]

    if idx_top_rec in d:
        # Already recommended by another favourite: bump the count and add the score.
        d[idx_top_rec]['ct'] += 1
        d[idx_top_rec]['cosine_sim'] += cosine_sim_score_for_closest_match
    else:
        closest_match_book_info = full_book_list.iloc[idx_top_rec]
        d[idx_top_rec] = {
            "title": closest_match_book_info['Title'],
            "author": closest_match_book_info['cleaned_author'],
            "cosine_sim": cosine_sim_score_for_closest_match,
            "ct": 1,
            "index": idx_top_rec,
        }

# Result: for every recommended book, its title, author and aggregated similarity score.


In [55]:
# One row per recommended book; columns follow the keys of the per-book summary dicts.
rec_df = pd.DataFrame(list(d.values()))

In [57]:
# Rank recommendations: books picked by the most favourites first,
# ties broken by the larger summed similarity score.
rec_df.sort_values(by=['ct', 'cosine_sim'], ascending=False)

Unnamed: 0,title,author,cosine_sim,ct,index
15,Crown of Midnight,sarah j maas,1.148831,5,16
27,A Court of Wings and Ruin,sarah j maas,1.127161,4,3
6,Ruin and Rising,leigh bardugo,1.448027,2,208
0,Wicked Beauty,katee robert,0.883197,2,417
7,Defy Me,tahereh mafi,0.702258,2,306
3,The Elite,kiera cass,0.702132,1,371
8,Ruthless Fae,caroline peckham,0.611878,1,89
5,Destroy Me,tahereh mafi,0.574406,1,336
2,A Game of Retribution,scarlett st clair,0.531567,1,445
14,Midnight Sun,stephenie meyer,0.513728,1,325
