In [40]:
from langchain_community.document_loaders import TextLoader #Imports the text and converts it to a format langchain can use
from langchain_text_splitters import CharacterTextSplitter #Splits the texts into meaningful chunks 
from langchain_openai import OpenAIEmbeddings #Converts the chunks from the split text into document embeddings
from langchain_chroma import Chroma #Stores the information in a vector database



In [41]:
from dotenv import load_dotenv

load_dotenv()

True

In [42]:
import pandas as pd
books = pd.read_csv('books_cleaned.csv')

In [43]:
books.head()

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,tagged_description
0,9780002005883,2005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0,9780002005883 A NOVEL THAT READERS and critics...
1,9780002261982,2261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0,9780002261982 A new 'Christie for Christmas' -...
2,9780006163831,6163831,The One Tree,Stephen R. Donaldson,American fiction,http://books.google.com/books/content?id=OmQaw...,Volume Two of Stephen Donaldson's acclaimed se...,1982.0,3.97,479.0,172.0,9780006163831 Volume Two of Stephen Donaldson'...
3,9780006178736,6178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0,"9780006178736 A memorable, mesmerizing heroine..."
4,9780006280897,6280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0,9780006280897 Lewis' work on the nature of lov...


In [44]:
books["tagged_description"]

0       9780002005883 A NOVEL THAT READERS and critics...
1       9780002261982 A new 'Christie for Christmas' -...
2       9780006163831 Volume Two of Stephen Donaldson'...
3       9780006178736 A memorable, mesmerizing heroine...
4       9780006280897 Lewis' work on the nature of lov...
                              ...                        
6502    9788173031014 This book tells the tale of a ma...
6503    9788179921623 Wisdom to Create a Life of Passi...
6504    9788185300535 This collection of the timeless ...
6505    9789027712059 Since the three volume edition o...
6506    9789042003408 This is a jubilant and rewarding...
Name: tagged_description, Length: 6507, dtype: object

In [45]:
# Save the tagged descriptions in a separate file
books["tagged_description"].to_csv('tagged_description.txt', 
                                    sep = '\n',
                                    index=False,
                                    header = False)


In [46]:
raw_documents = TextLoader('tagged_description.txt').load()
text_splitter = CharacterTextSplitter(chunk_size= 1000, chunk_overlap=0, separator='\n')
docs = text_splitter.split_documents(raw_documents)

Created a chunk of size 1168, which is longer than the specified 1000


Created a chunk of size 1214, which is longer than the specified 1000
Created a chunk of size 1088, which is longer than the specified 1000
Created a chunk of size 1189, which is longer than the specified 1000
Created a chunk of size 1267, which is longer than the specified 1000
Created a chunk of size 2010, which is longer than the specified 1000
Created a chunk of size 1225, which is longer than the specified 1000
Created a chunk of size 1184, which is longer than the specified 1000
Created a chunk of size 1214, which is longer than the specified 1000
Created a chunk of size 1191, which is longer than the specified 1000
Created a chunk of size 1057, which is longer than the specified 1000
Created a chunk of size 1270, which is longer than the specified 1000
Created a chunk of size 1635, which is longer than the specified 1000
Created a chunk of size 1132, which is longer than the specified 1000
Created a chunk of size 1325, which is longer than the specified 1000
Created a chunk of s

In [47]:
docs[0]

Document(metadata={'source': 'tagged_description.txt'}, page_content='9780002005883 A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, Gi

In [48]:
# Creating document embedding and storing in a vector database
db_books = Chroma.from_documents(docs, embedding=OpenAIEmbeddings())

In [49]:
query = 'What are the best history books?'
query_docs = db_books.similarity_search(query, k=10)
query_docs

[Document(id='a5640385-07c1-4482-9b57-5a1acfb2435d', metadata={'source': 'tagged_description.txt'}, page_content="9780140449082 One of the masterpieces of classical literature, the Histories describes how a small and quarrelsome band of Greek city states united to repel the might of the Persian empire. But while this epic struggle forms the core of his work, Herodotus' natural curiosity frequently gives rise to colourful digressions - a description of the natural wonders of Egypt; an account of European lake-dwellers; and far-fetched accounts of dog-headed men and gold-digging ants. With its kaleidoscopic blend of fact and legend, the Histories offers a compelling Greek view of the world of the fifth century BC.\n9780140449099 This text is a revised edition of Cervantes' classic tale. Don Quixote has become so entranced by reading romances of chivalry that he determines to become a knight errant and pursue bold adventures, accompanied by his squire, the cunning Sancho Panza."),
 Docume

In [56]:
# We can use the operation below to get the book details from the vector database into a readable format
books[books['isbn13'] == int(query_docs[3].page_content.split()[0].strip())]

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,tagged_description
2351,9780393924954,393924955,A History of Modern Europe,John M. Merriman,History,http://books.google.com/books/content?id=uvyqH...,Available in both one-volume and two-volume pa...,2004.0,3.98,1040.0,62.0,9780393924954 Available in both one-volume and...


In [57]:
def retrieve_semantic_recommendations(query: str, top_k: int = 10) -> pd.DataFrame:
    recs = db_books.similarity_search(query, k=50)

    books_list = []

    for i in range(0, len(recs)):
        books_list += [int(recs[i].page_content.strip('"').split()[0])]

    return books[books['isbn13'].isin(books_list)].head(top_k)

#  This function will return a dataframe of the top k recommendations for a given query

In [58]:
retrieve_semantic_recommendations('A book to teach children about programming')

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,tagged_description
80,9780007173136,000717313X,I Wish that I Had Duck Feet,Dr. Seuss;Barney Tobey,Children's stories,http://books.google.com/books/content?id=m9cZA...,A boy imagines what it would be like if he had...,2004.0,4.16,64.0,8903.0,9780007173136 A boy imagines what it would be ...
126,9780060177249,0060177241,The thief of always,Clive Barker,Fiction,http://books.google.com/books/content?id=jKsrA...,After a mysterious stranger promises to end hi...,1992.0,4.19,225.0,22123.0,9780060177249 After a mysterious stranger prom...
610,9780130887016,0130887013,C++ Programmer's Notebook,James Edward Keogh;John Shapley Gray,Computers,http://books.google.com/books/content?id=tmgPA...,Covers all the C++ concepts a programmer needs...,2002.0,3.0,528.0,3.0,9780130887016 Covers all the C++ concepts a pr...
1294,9780240806082,0240806085,Directing the Documentary,Michael Rabiger,Performing Arts,http://books.google.com/books/content?id=uoKli...,Michael Rabiger guides the reader through the ...,2004.0,4.23,648.0,173.0,9780240806082 Michael Rabiger guides the reade...
1316,9780262510875,0262510871,Structure and Interpretation of Computer Programs,Harold Abelson;Gerald Jay Sussman;Julie Sussman,Computers,http://books.google.com/books/content?id=6QOXQ...,"""Structure and Interpretation of Computer Prog...",1996.0,4.45,657.0,3491.0,"9780262510875 ""Structure and Interpretation of..."
2151,9780380815937,0380815931,In the Beginning...was the Command Line,Neal Stephenson,Computers,http://books.google.com/books/content?id=iL9wB...,"This is ""the Word"" -- one man's word, certainl...",1999.0,3.79,151.0,6618.0,"9780380815937 This is ""the Word"" -- one man's ..."
2594,9780439440035,0439440033,Hope Springs Eternal,Kristiana Gregory,Juvenile Fiction,http://books.google.com/books/content?id=5FnzG...,"Albert, Nessa's old friend from the orphanage,...",2005.0,4.26,176.0,128.0,"9780439440035 Albert, Nessa's old friend from ..."
2604,9780439569729,0439569729,SCHOLASTIC SUCCESS WITH 4TH GRADE(WORKBOOK),Terry Cooper,Education,http://books.google.com/books/content?id=b-EBH...,"416 bright, colorful pages that give kids prac...",2003.0,4.57,416.0,7.0,"9780439569729 416 bright, colorful pages that ..."
2713,9780440495048,0440495040,War Comes to Willy Freeman,James Lincoln Collier,Juvenile Fiction,http://books.google.com/books/content?id=qcbpI...,A free thirteen-year-old black girl in Connect...,1987.0,3.6,192.0,253.0,9780440495048 A free thirteen-year-old black g...
2730,9780441012039,0441012035,Neuromancer,William Gibson,Fiction,http://books.google.com/books/content?id=2NyiP...,"Case, a burned-out computer whiz, is asked to ...",2004.0,3.89,384.0,1617.0,"9780441012039 Case, a burned-out computer whiz..."
