In [1]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_mistralai import MistralAIEmbeddings
from langchain_chroma import Chroma # opensource vector database

In [2]:
from dotenv import load_dotenv
import os

load_dotenv()

True

In [3]:
import pandas as pd

books = pd.read_csv('books_cleaned.csv')

In [27]:
books["tagged description"]

0       9780002005883 A NOVEL THAT READERS and critics...
1       9780002261982 A new 'Christie for Christmas' -...
2       9780006178736 A memorable, mesmerizing heroine...
3       9780006280897 Lewis' work on the nature of lov...
4       9780006280934 "In The Problem of Pain, C.S. Le...
                              ...                        
5193    9788172235222 On A Train Journey Home To North...
5194    9788173031014 This book tells the tale of a ma...
5195    9788179921623 Wisdom to Create a Life of Passi...
5196    9788185300535 This collection of the timeless ...
5197    9789027712059 Since the three volume edition o...
Name: tagged description, Length: 5198, dtype: object

In [4]:
books["tagged description"].to_csv("tagged_descriptions.txt", index=False, header=False)

In [5]:
raw_document = TextLoader("tagged_descriptions.txt", encoding="utf-8").load()

In [7]:
text_splitter = CharacterTextSplitter(chunk_size=1500, chunk_overlap=0, separator="\n")
documents = text_splitter.split_documents(raw_document)

Created a chunk of size 2010, which is longer than the specified 1500
Created a chunk of size 1637, which is longer than the specified 1500
Created a chunk of size 2012, which is longer than the specified 1500
Created a chunk of size 2834, which is longer than the specified 1500
Created a chunk of size 2510, which is longer than the specified 1500
Created a chunk of size 1814, which is longer than the specified 1500
Created a chunk of size 1830, which is longer than the specified 1500
Created a chunk of size 1644, which is longer than the specified 1500
Created a chunk of size 1932, which is longer than the specified 1500
Created a chunk of size 2008, which is longer than the specified 1500
Created a chunk of size 2285, which is longer than the specified 1500
Created a chunk of size 1914, which is longer than the specified 1500
Created a chunk of size 2616, which is longer than the specified 1500
Created a chunk of size 1580, which is longer than the specified 1500
Created a chunk of s

In [8]:
documents[0]

Document(metadata={'source': 'tagged_descriptions.txt'}, page_content='"9780002005883 A NOVEL THAT READERS and critics have been eagerly anticipating for over a decade, Gilead is an astonishingly imagined story of remarkable lives. John Ames is a preacher, the son of a preacher and the grandson (both maternal and paternal) of preachers. It’s 1956 in Gilead, Iowa, towards the end of the Reverend Ames’s life, and he is absorbed in recording his family’s story, a legacy for the young son he will never see grow up. Haunted by his grandfather’s presence, John tells of the rift between his grandfather and his father: the elder, an angry visionary who fought for the abolitionist cause, and his son, an ardent pacifist. He is troubled, too, by his prodigal namesake, Jack (John Ames) Boughton, his best friend’s lost son who returns to Gilead searching for forgiveness and redemption. Told in John Ames’s joyous, rambling voice that finds beauty, humour and truth in the smallest of life’s details, 

In [None]:
embedding = MistralAIEmbeddings(
    model="mistral-embed",
    api_key= os.getenv("MISTRAL_API_KEY")
)

db_books = Chroma.from_documents(documents, embedding=embedding) 

tokenizer.json: 0.00B [00:00, ?B/s]

In [22]:
query = "A book about computers and technology"
docs = db_books.similarity_search(query, k=5)

In [23]:
docs

[Document(id='cdba0969-18bf-484b-aeb2-22de8cbf1e44', metadata={'source': 'tagged_descriptions.txt'}, page_content='"9780226817415 In the early 1960s, computers haunted the American popular imagination. Bleak tools of the cold war, they embodied the rigid organization and mechanical conformity that made the military-industrial complex possible. But by the 1990s—and the dawn of the Internet—computers started to represent a very different kind of world: a collaborative and digital utopia modeled on the communal ideals of the hippies who so vehemently rebelled against the cold war establishment in the first place. From Counterculture to Cyberculture is the first book to explore this extraordinary and ironic transformation. Fred Turner here traces the previously untold story of a highly influential group of San Francisco Bay–area entrepreneurs: Stewart Brand and the Whole Earth network. Between 1968 and 1998, via such familiar venues as the National Book Award–winning Whole Earth Catalog, t

In [24]:
query = "A book about medicine and psychology"
docs = db_books.similarity_search(query, k=5)
docs

[Document(id='69f27879-ae90-47e1-aeac-71487582241d', metadata={'source': 'tagged_descriptions.txt'}, page_content='"9780520231511 ""Undertaker of the Mind is the most splendid piece of original research for many a year on the early history of British psychiatry. Brilliantly exploiting hitherto unused documentation, Andrews and Scull bring the once murky world of the eighteenth- century mad-doctor to life, and dispel many deeply embedded myths in the process. Absolutely essential reading!""--Roy Porter, author of The Creation of the Modern World ""This is a wonderfully well-written work... The authors reconstruct, in rich and convincing detail, the dilemmas faced by Monro, his patients, their families, and the broader culture when confronted with psychological distress.""--Joel Braslow, author of Mental Ills and Bodily Cures ""A telling reconstruction of the ideas and practice of probably the most famous psychiatrist in eighteenth-century Britain.... The analyses of Monro\'s more famous

In [49]:
# lets return the titles and authors using the isbn in the tagged descriptons

# extract the isbn from the returned docs and convert it to int ( remove the initial ")
isbn =  docs[0].page_content.split()[0].strip()
isbn = int(isbn[1:])

books[books["isbn13"] == isbn]

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count,title_and_subtitle,tagged description
2584,9780520231511,520231511,Undertaker of the Mind,Jonathan Andrews;Andrew T. Scull,Biography & Autobiography,http://books.google.com/books/content?id=_zYTn...,"""Undertaker of the Mind is the most splendid p...",2001.0,3.42,386.0,12.0,Undertaker of the Mind: John Monro and Mad-doc...,"9780520231511 ""Undertaker of the Mind is the m..."


In [None]:
# Recommendation System
def retrive_semantic_recommendation(query: str, top_k: int = 10) -> pd.DataFrame:
    results = []
    docs = db_books.similarity_search(query, top_k)
    for i in range(len(docs)):
        results += [int(docs[i].page_content.strip('"').split()[0])]
    
    return books[books["isbn13"].isin(results).head(top_k)] # stopped at 1:15 -- finished the theory