In [2]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 

#from pyvis.network import Network
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re



In [3]:
#books_raw = pd.read_csv('books_raw.csv')
#bring in "raw data" -- has already been processed once
books_raw = pd.read_csv('books_processed.csv') 

In [4]:
books_raw.head()

Unnamed: 0.1,Unnamed: 0,id,title,series,author,rating,description,language,isbn,genres,bookFormat,edition,pages,publisher,publishDate,awards,numRatings,likedPercent,coverImg,price
0,2,1,To Kill a Mockingbird,To Kill a Mockingbird,Harper Lee,4.28,The unforgettable novel of a childhood in a sl...,English,9999999999999,"['Classics', 'Fiction', 'Historical Fiction', ...",Paperback,,324,Harper Perennial Modern Classics,05/23/06,"['Pulitzer Prize for Fiction (1961)', 'Audie A...",4501075,95.0,https://i.gr-assets.com/images/S/compressed.ph...,-1.0
1,3,2,Pride and Prejudice,,"Jane Austen, Anna Quindlen (Introduction)",4.26,Alternate cover edition of ISBN 9780679783268S...,English,9999999999999,"['Classics', 'Fiction', 'Romance', 'Historical...",Paperback,"Modern Library Classics, USA / CAN",279,Modern Library,10/10/00,[],2998241,94.0,https://i.gr-assets.com/images/S/compressed.ph...,-1.0
2,9,3,Gone with the Wind,,Margaret Mitchell,4.3,"Scarlett O'Hara, the beautiful, spoiled daught...",English,9780446675536,"['Classics', 'Historical Fiction', 'Fiction', ...",Mass Market Paperback,,1037,Warner Books,04/01/99,"['Pulitzer Prize for Novel (1937)', 'National ...",1074620,94.0,https://i.gr-assets.com/images/S/compressed.ph...,5.58
3,10,4,The Fault in Our Stars,,John Green (Goodreads Author),4.21,Despite the tumor-shrinking medical miracle th...,English,9999999999999,"['Young Adult', 'Romance', 'Fiction', 'Contemp...",Hardcover,,313,Dutton Books,01/10/12,"['Georgia Peach Book Award (2013)', 'Buxtehude...",3550714,93.0,https://i.gr-assets.com/images/S/compressed.ph...,-1.0
4,12,5,The Giving Tree,,Shel Silverstein,4.37,"""Once there was a tree...and she loved a littl...",English,9780060256654,"['Childrens', 'Picture Books', 'Classics', 'Fi...",Hardcover,,64,HarperCollins Publishers,10/07/64,[],905731,94.0,https://i.gr-assets.com/images/S/compressed.ph...,4.87


# Processing data

In [5]:
def gen_processing(column):
    try:
        col_mod = re.sub('[^A-Za-z0-9 ]+', '', column)
        col_mod = col_mod.lower()
        col_mod = col_mod.strip()
        col_mod = ''.join(c for c in col_mod if c.isprintable())
        #encode() method
        strencode = col_mod.encode("ascii", "ignore")

        #decode() method
        col_mod = strencode.decode()
        col_mod = re.sub(r'[\t\n]', '', col_mod)
    except:
        x = 'lol'
        col_mod = ''
    
    return col_mod




In [6]:
#copy the data just in case
books = books_raw.copy()
books = books.rename(columns={'Unnamed: 0': 'matrix_index'})
books['matrix_index'] = list(range(0, books.shape[0]))

In [7]:
books.shape

(26489, 20)

In [8]:
#books['awards'] = books['awards'].apply(gen_processing)
books['genres'] = books['genres'].apply(gen_processing)
books['bookFormat'] = books['bookFormat'].apply(gen_processing)
books['title'] = books['title'].apply(gen_processing)
books['publisher'] = books['publisher'].apply(gen_processing)
books['language'] = books['language'].apply(gen_processing)

In [9]:
books.columns

Index(['matrix_index', 'id', 'title', 'series', 'author', 'rating',
       'description', 'language', 'isbn', 'genres', 'bookFormat', 'edition',
       'pages', 'publisher', 'publishDate', 'awards', 'numRatings',
       'likedPercent', 'coverImg', 'price'],
      dtype='object')

In [10]:
def author_process(column):
    try:
        col_mod = column.lower()
        col_mod = re.sub("[\(\[].*?[\)\]]", "", col_mod)
        col_mod = col_mod.split(',')
        col_mod = [x.strip() for x in col_mod]
        col_mod = [re.sub('[^A-Za-z0-9 ]+', '', x) for x in col_mod]
        col_mod = [re.sub(' ', '_', x) for x in col_mod]
        col_mod = ' '.join(col_mod)
        col_mod = ''.join(c for c in col_mod if c.isprintable())
        #encode() method
        strencode = col_mod.encode("ascii", "ignore")

        #decode() method
        col_mod = strencode.decode()
        col_mod = re.sub(r'[\t\n]', '', col_mod)
        col_mod = col_mod.replace('\t', '')
        col_mod = col_mod.replace('\n', '')
        col_mod = col_mod.replace('\s', '')
        col_mod = col_mod.replace('\r', '')
        col_mod = col_mod.replace('\f', '')
        col_mod = col_mod.replace('\v', '')
        col_mod = col_mod.replace('\a', '')
        col_mod = col_mod.replace('\b', '')
        col_mod = col_mod.strip()
    except:
        col_mod = ''
    
    return col_mod

In [11]:
books['author'] = books['author'].apply(author_process)

In [12]:
def award_process(column):
    try:
        col_mod = column.lower()
        col_mod = re.sub(r"[\([{})\]]", "", col_mod)
        #col_mod = col_mod.replace('''''', '')
        col_mod = col_mod.split(',')
        col_mod = [x.strip() for x in col_mod]
        col_mod = [re.sub(' ', '_', x) for x in col_mod]
        col_mod = ' '.join(col_mod)
        col_mod = col_mod.replace('\'', '')
    except:
        col_mod = ''
    
    return col_mod

In [13]:
books['awards'] = books['awards'].apply(award_process)

In [14]:
def description_processing(column):
    try:
        col_mod = column.lower()
        col_mod = re.sub('[^A-Za-z ]+', ' ', col_mod)
        #define stop words
        stop_words = set(stopwords.words('english'))
        #tokenize words
        tokens = word_tokenize(col_mod)
        #remove stop words
        tokens = [x for x in tokens if x not in stop_words]
        #lemmatize words so they are more general
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(x) for x in tokens]
        #rejoin tokens as string
        col_mod = ' '.join(tokens)
    except:
        col_mod = ''
    
    return col_mod
    

In [15]:
books['description'] = books['description'].apply(description_processing)

In [16]:
books

Unnamed: 0,matrix_index,id,title,series,author,rating,description,language,isbn,genres,bookFormat,edition,pages,publisher,publishDate,awards,numRatings,likedPercent,coverImg,price
0,0,1,to kill a mockingbird,To Kill a Mockingbird,harper_lee,4.28,,english,9999999999999,classics fiction historical fiction school lit...,paperback,,324,harper perennial modern classics,05/23/06,pulitzer_prize_for_fiction_1961 audie_award_fo...,4501075,95.0,https://i.gr-assets.com/images/S/compressed.ph...,-1
1,1,2,pride and prejudice,,jane_austen anna_quindlen,4.26,alternate cover edition isbn since immediate s...,english,9999999999999,classics fiction romance historical fiction li...,paperback,"Modern Library Classics, USA / CAN",279,modern library,10/10/00,,2998241,94.0,https://i.gr-assets.com/images/S/compressed.ph...,-1
2,2,3,gone with the wind,,margaret_mitchell,4.30,scarlett hara beautiful spoiled daughter well ...,english,9780446675536,classics historical fiction fiction romance hi...,mass market paperback,,1037,warner books,04/01/99,pulitzer_prize_for_novel_1937 national_book_aw...,1074620,94.0,https://i.gr-assets.com/images/S/compressed.ph...,5.58
3,3,4,the fault in our stars,,john_green,4.21,despite tumor shrinking medical miracle bought...,english,9999999999999,young adult romance fiction contemporary reali...,hardcover,,313,dutton books,01/10/12,georgia_peach_book_award_2013 buxtehuder_bulle...,3550714,93.0,https://i.gr-assets.com/images/S/compressed.ph...,-1
4,4,5,the giving tree,,shel_silverstein,4.37,tree loved little boy begin story unforgettabl...,english,9780060256654,childrens picture books classics fiction poetr...,hardcover,,64,harpercollins publishers,10/07/64,,905731,94.0,https://i.gr-assets.com/images/S/compressed.ph...,4.87
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26484,26484,26485,attracted to fire,,diann_mills,4.14,special agent meghan connors dream one day pro...,english,9781414348643,christian fiction christian suspense romance m...,paperback,,416,tyndale house publishers,October 1st 2011,holt_medallion_by_virginia_romance_writers_nom...,2143,95.0,https://i.gr-assets.com/images/S/compressed.ph...,5.55
26485,26485,26486,unbelievable,Port Fare #2,sherry_gammon,4.16,lilah lopez dreser town take care unfinished f...,english,9781477594247,romance young adult contemporary contemporary ...,paperback,,360,wordpaintings unlimited,April 11th 2013,,1028,94.0,https://i.gr-assets.com/images/S/compressed.ph...,19.18
26486,26486,26487,anasazi,Sense of Truth #2,emma_michaels,4.19,anasazi sequel thirteenth chime emma michael o...,english,9999999999999,mystery young adult,paperback,First Edition,190,bokheim publishing,August 5th 2011,,37,95.0,https://i.gr-assets.com/images/S/compressed.ph...,-1
26487,26487,26488,wayward son,,tom_pollack john_loftus jim_alves,3.85,powerful tremor unearths ancient secretburied ...,english,9781450755634,fiction mystery historical fiction adventure c...,paperback,1st edition,507,cascada productions,September 1st 2011,,238,90.0,https://i.gr-assets.com/images/S/compressed.ph...,2.86


In [17]:
#relative importance of different features,
#repeats the text (weight) number of times 
w_title = 1
w_author = 1
w_description = 1
w_language = 1
w_genres = 1
w_bookFormat = 1
w_publisher = 1
w_awards = 1
# function for merging features

def concatenate_features(df_row):

    return ' '.join([df_row['title']]*w_title)+' '+' '.join([df_row['author']]*w_author)+' '+' '.join([df_row['description']]*w_description)+' '+' '.join([df_row['language']]*w_language)+' '+' '.join([df_row['genres']]*w_genres)+' '+' '.join([df_row['bookFormat']]*w_bookFormat)+' '+' '.join([df_row['publisher']]*w_publisher)+' '+' '.join([df_row['awards']]*w_awards)

In [18]:
books['features'] = books.apply(concatenate_features,axis=1)
books

Unnamed: 0,matrix_index,id,title,series,author,rating,description,language,isbn,genres,...,edition,pages,publisher,publishDate,awards,numRatings,likedPercent,coverImg,price,features
0,0,1,to kill a mockingbird,To Kill a Mockingbird,harper_lee,4.28,,english,9999999999999,classics fiction historical fiction school lit...,...,,324,harper perennial modern classics,05/23/06,pulitzer_prize_for_fiction_1961 audie_award_fo...,4501075,95.0,https://i.gr-assets.com/images/S/compressed.ph...,-1,to kill a mockingbird harper_lee english clas...
1,1,2,pride and prejudice,,jane_austen anna_quindlen,4.26,alternate cover edition isbn since immediate s...,english,9999999999999,classics fiction romance historical fiction li...,...,"Modern Library Classics, USA / CAN",279,modern library,10/10/00,,2998241,94.0,https://i.gr-assets.com/images/S/compressed.ph...,-1,pride and prejudice jane_austen anna_quindlen ...
2,2,3,gone with the wind,,margaret_mitchell,4.30,scarlett hara beautiful spoiled daughter well ...,english,9780446675536,classics historical fiction fiction romance hi...,...,,1037,warner books,04/01/99,pulitzer_prize_for_novel_1937 national_book_aw...,1074620,94.0,https://i.gr-assets.com/images/S/compressed.ph...,5.58,gone with the wind margaret_mitchell scarlett ...
3,3,4,the fault in our stars,,john_green,4.21,despite tumor shrinking medical miracle bought...,english,9999999999999,young adult romance fiction contemporary reali...,...,,313,dutton books,01/10/12,georgia_peach_book_award_2013 buxtehuder_bulle...,3550714,93.0,https://i.gr-assets.com/images/S/compressed.ph...,-1,the fault in our stars john_green despite tumo...
4,4,5,the giving tree,,shel_silverstein,4.37,tree loved little boy begin story unforgettabl...,english,9780060256654,childrens picture books classics fiction poetr...,...,,64,harpercollins publishers,10/07/64,,905731,94.0,https://i.gr-assets.com/images/S/compressed.ph...,4.87,the giving tree shel_silverstein tree loved li...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26484,26484,26485,attracted to fire,,diann_mills,4.14,special agent meghan connors dream one day pro...,english,9781414348643,christian fiction christian suspense romance m...,...,,416,tyndale house publishers,October 1st 2011,holt_medallion_by_virginia_romance_writers_nom...,2143,95.0,https://i.gr-assets.com/images/S/compressed.ph...,5.55,attracted to fire diann_mills special agent me...
26485,26485,26486,unbelievable,Port Fare #2,sherry_gammon,4.16,lilah lopez dreser town take care unfinished f...,english,9781477594247,romance young adult contemporary contemporary ...,...,,360,wordpaintings unlimited,April 11th 2013,,1028,94.0,https://i.gr-assets.com/images/S/compressed.ph...,19.18,unbelievable sherry_gammon lilah lopez dreser ...
26486,26486,26487,anasazi,Sense of Truth #2,emma_michaels,4.19,anasazi sequel thirteenth chime emma michael o...,english,9999999999999,mystery young adult,...,First Edition,190,bokheim publishing,August 5th 2011,,37,95.0,https://i.gr-assets.com/images/S/compressed.ph...,-1,anasazi emma_michaels anasazi sequel thirteent...
26487,26487,26488,wayward son,,tom_pollack john_loftus jim_alves,3.85,powerful tremor unearths ancient secretburied ...,english,9781450755634,fiction mystery historical fiction adventure c...,...,1st edition,507,cascada productions,September 1st 2011,,238,90.0,https://i.gr-assets.com/images/S/compressed.ph...,2.86,wayward son tom_pollack john_loftus jim_alves ...


In [19]:
books.to_hdf('books_processed_for_recommendation.h5', key = 'books')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->Index(['title', 'series', 'author', 'description', 'language', 'isbn',
       'genres', 'bookFormat', 'edition', 'pages', 'publisher', 'publishDate',
       'awards', 'coverImg', 'price', 'features'],
      dtype='object')]

  pytables.to_hdf(


In [20]:
books = pd.read_hdf('books_processed_for_recommendation.h5', key = 'books')

In [21]:
#Create vectorizer
vect = CountVectorizer(stop_words='english')
#transform the text data
vect_matrix = vect.fit_transform(books['features'])
#compute cosine similarity matrix
cosine_similarity_matrix = cosine_similarity(vect_matrix, vect_matrix)

In [22]:
cosine_similarity_matrix

array([[1.        , 0.31555792, 0.3519247 , ..., 0.09883324, 0.17287295,
        0.27594578],
       [0.31555792, 1.        , 0.32482892, ..., 0.06081584, 0.13910614,
        0.2425713 ],
       [0.3519247 , 0.32482892, 1.        , ..., 0.06358559, 0.15970044,
        0.28743458],
       ...,
       [0.09883324, 0.06081584, 0.06358559, ..., 1.        , 0.06407098,
        0.09496715],
       [0.17287295, 0.13910614, 0.15970044, ..., 0.06407098, 1.        ,
        0.18399945],
       [0.27594578, 0.2425713 , 0.28743458, ..., 0.09496715, 0.18399945,
        1.        ]])

In [23]:
#save cosine similarity matrix
with open('cosine_similarity_matrix.npy', 'wb') as f:
    np.save(f, cosine_similarity_matrix)

In [24]:
#load matrix
with open('cosine_similarity_matrix.npy', 'rb') as f:
    cosine_similarity_matrix = np.load(f)

# FUNCTIONS!!!

In [None]:
#dictionary update function
def update_dict_with_list_merge(target_dict, source_dict):
    for key, value in source_dict.items():
        if key in target_dict:
            # If the key already exists, convert the value to a list if it's not already
            if not isinstance(target_dict[key], list):
                target_dict[key] = [target_dict[key]]
            target_dict[key].append(value)
        else:
            target_dict[key] = value

In [26]:
def create_cos_matrix(processed_data):
    from sklearn.metrics.pairwise import cosine_similarity
    
    #Create vectorizer
    vect = CountVectorizer(stop_words='english')
    #transform the text data
    vect_matrix = vect.fit_transform(books['features'])
    #compute cosine similarity matrix
    cosine_similarity_matrix = cosine_similarity(vect_matrix, vect_matrix)
    return cosine_similarity_matrix

## Recommendation off a singular book

In [27]:
#if input is one single id
def individual_recommendations(book_id, recs_to_check, recs_to_return):
    
    #Set number of recommendations for each book we want to check against, probably
    #like 200?
    number_of_recommendations_to_check = recs_to_check
    number_of_recommendations_to_return = recs_to_return
    
    #get matrix position the book you're searching against
    search_titles_matrix_location = books.loc[books['id'] == book_id, 'matrix_index']
    #Get column position of of the book you're searching against
    search_titles_matrix_column_numeric = int(search_titles_matrix_location.values[0])
    
    #import the saved cosine matrix!!
    #################################
    
    #Get the cosine scores all the other books
    similarity_scores = list(enumerate(cosine_similarity_matrix[search_titles_matrix_location]))
    #sort the scores highest to lowest
    similarity_scores_sorted = sorted(similarity_scores[0][1], reverse = True)
    #Update the number of recommendations used for the score
    num_recs_for_score = number_of_recommendations_to_check + 1
    #Get the top X scores, whatever value you originally chose + 1
    top_x_cosines = similarity_scores_sorted[1:num_recs_for_score]
    
    #Get the indices of the books having the top X scores
    recommended_books_indices = [index for index, score in sorted(enumerate(cosine_similarity_matrix[search_titles_matrix_column_numeric]), key=lambda x: x[1], reverse=True) if score in top_x_cosines]
    #get ratings of indexed recommendations
    recommendation_ratings = books['rating'][recommended_books_indices]
    #get rating of original book
    original_book_rating = books['rating'][search_titles_matrix_column_numeric]
    #divide the rating of the new books by the rating of the original book to see if it is "better"
    recommendation_ratios = recommendation_ratings / original_book_rating
    #multiply ratio by the cosine score to slightly adjust them
    adjusted_scores = recommendation_ratios * top_x_cosines
    
    #Get the names of the recommended books
    recommended_titles = books['title'][recommended_books_indices]
    
    recommendation_df = pd.DataFrame(list(zip(recommended_titles, adjusted_scores)), columns = ['Recommended Books', 'Scores'])
    
    return recommendation_df

## Recommending off a singular book

In [28]:
book_id = 94
recs_to_check = 80
recs_to_return = 10
individual_recommendations(book_id, recs_to_check, recs_to_return)

Unnamed: 0,Recommended Books,Scores
0,the bully book,0.423019
1,nobody was here,0.342724
2,firegirl,0.356933
3,the school at the chalet,0.349866
4,bilgewater,0.326067
...,...,...
75,feather boy,0.248923
76,my friend flicka,0.278057
77,lizard music,0.276592
78,leven thumps and the wrath of ezra,0.264672


## Recommendations based off historical data (the borrowal list)

### This function can accept any length of input, but it must be as a list!
book_ids = list of book_ids, can change it to titles with slight modifications
recs_to_check = the number of score you want to calculate for each book
recs_to_return = the total number of recommendations you want returned

In [29]:
def historical_recommendations(book_ids, recs_to_check, recs_to_return):
    score_dictionary = {}
    for book_id in book_ids:
        #Set number of recommendations for each book we want to check against, probably
        #like 200?
        number_of_recommendations_to_check = recs_to_check
        number_of_recommendations_to_return = recs_to_return
        
        #get matrix position the book you're searching against
                                                #change variables if we don't want to use id
        #                                                vv        vv
        search_titles_matrix_location = books.loc[books['id'] == book_id, 'matrix_index']
        #Get column position of of the book you're searching against
        search_titles_matrix_column_numeric = int(search_titles_matrix_location.values[0])

        #import the saved cosine matrix!!
        #################################

        #Get the cosine scores all the other books
        similarity_scores = list(enumerate(cosine_similarity_matrix[search_titles_matrix_location]))
        #sort the scores highest to lowest
        similarity_scores_sorted = sorted(similarity_scores[0][1], reverse = True)
        #Update the number of recommendations used for the score
        num_recs_for_score = number_of_recommendations_to_check + 1
        #Get the top X scores, whatever value you originally chose + 1
        top_x_cosines = similarity_scores_sorted[1:num_recs_for_score]

        #Get the indices of the books having the top X scores
        recommended_books_indices = [index for index, score in sorted(enumerate(cosine_similarity_matrix[search_titles_matrix_column_numeric]), key=lambda x: x[1], reverse=True) if score in top_x_cosines]
        #get ratings of indexed recommendations
        recommendation_ratings = books['rating'][recommended_books_indices]
        #get rating of original book
        original_book_rating = books['rating'][search_titles_matrix_column_numeric]
        #divide the rating of the new books by the rating of the original book to see if it is "better"
        recommendation_ratios = recommendation_ratings / original_book_rating
        #multiply ratio by the cosine score to slightly adjust them
        adjusted_scores = recommendation_ratios * top_x_cosines
        adjusted_scores = [float(x) for x in adjusted_scores]
        #Create dictionary of the scores each book you are recommending off of
        individual_dictionary = {recommended_books_indices[i]: adjusted_scores[i] for i in range(len(recommended_books_indices))}
        #update the overall score dictionary
        update_dict_with_list_merge(score_dictionary, individual_dictionary)
    
    #Take the average score for those books with multiple recommendation scores
    score_dictionary = {key: sum(value) / len(value) if isinstance(value, list) else value for key, value in score_dictionary.items()}
    
    #Remove the books that are being used to make recommendations if they are in the dictionary
    score_dictionary = {key: value for key, value in score_dictionary.items() if key not in book_ids}
    
    #sort the dictionary from highest to lowest and return the top x books
    top_x_dictionary = dict(sorted(score_dictionary.items(), key=lambda x: x[1], reverse=True)[:number_of_recommendations_to_return])
    
    top_x_vals_indices = []
    top_x_vals_scores = []
    for key, val in enumerate(top_x_dictionary):
        top_x_vals_indices.append(key)
        top_x_vals_scores.append(val)
        
    
    
    
    top_x_vals_indices = top_x_dictionary.keys()
    top_x_vals_scores = list(top_x_dictionary.values())
    #Get the names of the recommended books
    recommended_books_list = books['title'][top_x_vals_indices]
    recommendation_df = pd.DataFrame(list(zip(recommended_books_list, top_x_vals_scores, top_x_vals_indices)), columns = ['Recommended Books', 'Scores', 'Index Value'])

    return recommendation_df

## Calling the function

In [30]:
 #ids of books in borrowals
book_ids = [1, 2]
#number of books each book_id will be compared to
recs_to_check = 10
#final number of recommendations that will be returned
recs_to_return = 5

recommended_books = historical_recommendations(book_ids, recs_to_check, recs_to_return)
recommended_books

Unnamed: 0,Recommended Books,Scores,Index Value
0,petey,0.527365,14597
1,the complete novels,0.50313,176
2,the school at the chalet,0.49245,12838
3,in our time,0.471815,3719
4,their eyes were watching god,0.470161,79


### Can use index value to pull information from the original table

# Running the function from start

In [31]:
def recommendation_engine(book_ids, recs_to_check, recs_to_return):
    #pull in processed data
    books = pd.read_hdf('books_processed_for_recommendation.h5', key = 'books')
    #build cosine similarity matrix
    cosine_similarity_matrix = create_cos_matrix(processed_data)
    recommended_books = historical_recommendations(book_ids, recs_to_check, recs_to_return)
    
    return recommended_books

In [9]:
def recommendation_engine(book_ids, recs_to_check, recs_to_return):
    
    #import packages
    import numpy as np
    import pandas as pd
    import nltk
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    from nltk.stem import WordNetLemmatizer 
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    import re
    
    #define cosine matrix function
    def create_cos_matrix(processed_data):
        from sklearn.metrics.pairwise import cosine_similarity

        #Create vectorizer
        vect = CountVectorizer(stop_words='english')
        #transform the text data
        vect_matrix = vect.fit_transform(books['features'])
        #compute cosine similarity matrix
        cosine_similarity_matrix = cosine_similarity(vect_matrix, vect_matrix)
        
        return cosine_similarity_matrix
    
    #define the dictionary updating function
    #dictionary update function
    def update_dict_with_list_merge(target_dict, source_dict):
        for key, value in source_dict.items():
            if key in target_dict:
                # If the key already exists, convert the value to a list if it's not already
                if not isinstance(target_dict[key], list):
                    target_dict[key] = [target_dict[key]]
                target_dict[key].append(value)
            else:
                target_dict[key] = value
    
    
    #define recommendation function:
    def historical_recommendations(book_ids, recs_to_check, recs_to_return):
        
        score_dictionary = {}
        for book_id in book_ids:
            #Set number of recommendations for each book we want to check against, probably
            #like 200?
            number_of_recommendations_to_check = recs_to_check
            number_of_recommendations_to_return = recs_to_return

            #get matrix position the book you're searching against
                                                    #change variables if we don't want to use id
            #                                                vv        vv
            search_titles_matrix_location = books.loc[books['id'] == book_id, 'matrix_index']
            #Get column position of of the book you're searching against
            search_titles_matrix_column_numeric = int(search_titles_matrix_location.values[0])

            #import the saved cosine matrix!!
            #################################

            #Get the cosine scores all the other books
            similarity_scores = list(enumerate(cosine_similarity_matrix[search_titles_matrix_location]))
            #sort the scores highest to lowest
            similarity_scores_sorted = sorted(similarity_scores[0][1], reverse = True)
            #Update the number of recommendations used for the score
            num_recs_for_score = number_of_recommendations_to_check + 1
            #Get the top X scores, whatever value you originally chose + 1
            top_x_cosines = similarity_scores_sorted[1:num_recs_for_score]

            #Get the indices of the books having the top X scores
            recommended_books_indices = [index for index, score in sorted(enumerate(cosine_similarity_matrix[search_titles_matrix_column_numeric]), key=lambda x: x[1], reverse=True) if score in top_x_cosines]
            #get ratings of indexed recommendations
            recommendation_ratings = books['rating'][recommended_books_indices]
            #get rating of original book
            original_book_rating = books['rating'][search_titles_matrix_column_numeric]
            #divide the rating of the new books by the rating of the original book to see if it is "better"
            recommendation_ratios = recommendation_ratings / original_book_rating
            #multiply ratio by the cosine score to slightly adjust them
            adjusted_scores = recommendation_ratios * top_x_cosines
            adjusted_scores = [float(x) for x in adjusted_scores]
            #Create dictionary of the scores each book you are recommending off of
            individual_dictionary = {recommended_books_indices[i]: adjusted_scores[i] for i in range(len(recommended_books_indices))}
            #update the overall score dictionary
            update_dict_with_list_merge(score_dictionary, individual_dictionary)

        #Take the average score for those books with multiple recommendation scores
        score_dictionary = {key: sum(value) / len(value) if isinstance(value, list) else value for key, value in score_dictionary.items()}

        #Remove the books that are being used to make recommendations if they are in the dictionary
        score_dictionary = {key: value for key, value in score_dictionary.items() if key not in book_ids}

        #sort the dictionary from highest to lowest and return the top x books
        top_x_dictionary = dict(sorted(score_dictionary.items(), key=lambda x: x[1], reverse=True)[:number_of_recommendations_to_return])

        top_x_vals_indices = []
        top_x_vals_scores = []
        for key, val in enumerate(top_x_dictionary):
            top_x_vals_indices.append(key)
            top_x_vals_scores.append(val)




        top_x_vals_indices = top_x_dictionary.keys()
        top_x_vals_scores = list(top_x_dictionary.values())
        #Get the names of the recommended books
        recommended_books_list = books['title'][top_x_vals_indices]
        recommendation_df = pd.DataFrame(list(zip(recommended_books_list, top_x_vals_scores, top_x_vals_indices)), columns = ['Recommended Books', 'Scores', 'Index Value'])

        return recommendation_df
    
    #pull in processed data
    books = pd.read_hdf('books_processed_for_recommendation.h5', key = 'books')
    #build cosine similarity matrix
    cosine_similarity_matrix = create_cos_matrix(books)
    recommended_books = historical_recommendations(book_ids, recs_to_check, recs_to_return)
    
    return recommended_books
    
    


In [10]:
 #ids of books in borrowals
book_ids = [8264, 502, 963]
#number of books each book_id will be compared to
recs_to_check = 10
#final number of recommendations that will be returned
recs_to_return = 5

recommended_books = recommendation_engine(book_ids, recs_to_check, recs_to_return)
recommended_books

Unnamed: 0,Recommended Books,Scores,Index Value
0,shiver trilogy boxset,0.617596,5273
1,linger,0.562791,362
2,the savage grace,0.440593,24636
3,the lyris,0.394616,11145
4,the lost saint,0.387108,4462
