# In [0]:
#dependencies: pandas, numpy, json, spacy, and en_core_web_lg (python -m spacy download en_core_web_lg)

import pandas as pd
import numpy as np
import spacy
import json
from spacy.lang.en import English

#load the trimmed book data from csv; the 'Unnamed: 0' column is dropped
#(presumably a leftover index column from an earlier export - confirm)
filename = 'trimmed_20k.csv'
df = pd.read_csv(filename).drop('Unnamed: 0', axis=1)

#create the spacy docs column of the book descriptions:
#every description is run through the en_core_web_lg pipeline once, up front, so the
#functions below only have to process the query text and compare it to df.docs
nlp = spacy.load("en_core_web_lg")
docs = list(nlp.pipe(df.description))
df['docs'] = docs

#pickle the df (including the docs column) for fast loading after server restart;
#compression='infer' lets pandas pick the compression from the file extension
df.to_pickle('df_pickle.pkl', compression='infer')

#to load the pickled df back in the future (instead of redoing the csv read and nlp.pipe steps above):
df = pd.read_pickle('df_pickle.pkl')


#Functions:

def get_recs_from_desc(input_string, from_isbn=False):
    '''Return the 10 books whose descriptions are most similar to input_string.

    The description is converted to a spacy doc with the module-level nlp
    pipeline and scored against every doc in df.docs with Doc.similarity.

    Parameters:
        input_string: text of a (real or hypothetical) book description.
        from_isbn: set to True when input_string is the description of a book
            that is itself in df. The top ranked match is then skipped, on the
            assumption that it is that same book, and the next 10 are returned.

    Returns:
        A JSON string: a list of objects with the keys title, author,
        avg_rating and ISBN, ordered from most to least similar. Fewer than
        10 entries are returned when df does not have enough rows.
    '''
    #convert input string of the book description into a spacy doc object
    test_doc = nlp(input_string)

    #similarity score against each book in the df; the Series index is the row position in df
    sims = pd.Series([test_doc.similarity(doc) for doc in df.docs])

    #sort and grab the top 10, skipping the 0th ranked book when it is expected
    #to be the book that was used to get the input_string of the description
    start = 1 if from_isbn else 0
    top10 = sims.sort_values(ascending=False).iloc[start:start + 10]

    #select the ranked rows by position, keeping the ranked order
    top_rows = df.iloc[top10.index.tolist()]

    #output key -> df column, in the order the keys appear in the json
    fields = {
        'title': 'book_title',
        'author': 'author',
        'avg_rating': 'avg_rating',
        'ISBN': 'ISBN',
    }

    #Series.tolist() yields plain Python scalars; numpy integers (e.g. an all-digit
    #ISBN column) would otherwise make json.dumps raise TypeError
    columns = [top_rows[column].tolist() for column in fields.values()]
    books = [dict(zip(fields, values)) for values in zip(*columns)]
    return json.dumps(books)


def get_books_by_author(author):
    '''Return the top 10 highest rated books by an author as a JSON string.

    Parameters:
        author: author name; must match the author column of df exactly
            (the comparison is case-sensitive).

    Returns:
        A JSON string: a list of up to 10 objects with the keys title,
        author, avg_rating and ISBN, ordered by avg_rating, highest first.
        If no book by that author is found, a plain error message string
        is returned instead of JSON.
    '''
    #limited to top 10- can return all books by author if we want (or fewer)
    books_df = df[df.author == author].sort_values('avg_rating', ascending=False).head(10)

    #if no books by that author are found, returns error message (string)
    if books_df.empty:
        return 'Author not found in database- check for correct spelling'

    #output key -> df column, in the order the keys appear in the json
    fields = {
        'title': 'book_title',
        'author': 'author',
        'avg_rating': 'avg_rating',
        'ISBN': 'ISBN',
    }

    #Series.tolist() yields plain Python scalars; numpy integers (e.g. an all-digit
    #ISBN column) would otherwise make json.dumps raise TypeError
    columns = [books_df[column].tolist() for column in fields.values()]
    books = [dict(zip(fields, values)) for values in zip(*columns)]

    #return the list of dictionaries (books) as json object:
    return json.dumps(books)

#when the user clicks on a book by an author, that should send us the book's ISBN;
#we then get the book's description from df and return recommendations based on it

def get_recs_from_isbn(ISBN):
    '''Return recommendations for the book in df with the given ISBN.

    Looks up the book's description in df and passes it to
    get_recs_from_desc with from_isbn=True, so that the top ranked match
    (expected to be the book itself) is left out of the results.

    Parameters:
        ISBN: value to match against the ISBN column of df; it has to compare
            equal to the stored value, so it must be of the same type.

    Returns:
        The JSON string produced by get_recs_from_desc. If no book with that
        ISBN is found, a plain error message string is returned instead.
    '''
    #get the book description(s) from df; this is a Series, possibly empty
    matches = df.loc[df['ISBN'] == ISBN, 'description']

    #if the ISBN is not in the df, return an error message (string)
    if matches.empty:
        return 'ISBN not found in database- check for correct ISBN'

    #nlp() needs a single string, not a Series, so take the first matching description;
    #pass it to the get recommendations function and set from_isbn=True
    return get_recs_from_desc(matches.iloc[0], from_isbn=True)