In [None]:
# Convert the raw text to a langchain format
from langchain_community.document_loaders import TextLoader
# Split the documents into meaningful chunks (individual description of each book)
from langchain_text_splitters import CharacterTextSplitter
# Convert those chunks into embeddings
from langchain_openai import OpenAIEmbeddings
# Store this into vector database
from langchain_chroma import Chroma

In [None]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API key used by OpenAIEmbeddings()
# further down - confirm against the project's .env file.
from dotenv import load_dotenv

load_dotenv()

In [None]:
import pandas as pd

# Read the book dataset; the path is relative to the notebook's working directory.
books = pd.read_csv("books_cleaned.csv")

In [None]:
# Show the loaded DataFrame as the cell output.
books

In [None]:
# Start building the vector search: preview the "tagged_description" column,
# which is the text that gets exported and embedded in the following cells.
# Later cells parse the first whitespace-separated token of each description as the isbn13.
books["tagged_description"]

In [None]:
# LangChain's TextLoader reads files, not pandas objects, so the tagged
# descriptions are saved to a plain-text file with one description per line.
#
# The file is written directly instead of via `to_csv(sep="\n")`: the csv writer
# wraps any description containing a double quote in extra quotes (corrupting the
# leading ISBN token that later cells parse), and newer Python releases (3.13+)
# reject "\n" as a csv delimiter with "ValueError: bad delimiter value".
with open("tagged_description.txt", "w", encoding="utf-8") as description_file:
    # Missing descriptions are skipped: an empty line cannot be mapped back to a book.
    for description in books["tagged_description"].dropna().astype(str):
        # Collapse embedded line breaks so every record stays on a single line.
        description_file.write(" ".join(description.splitlines()) + "\n")

In [None]:
# Load the exported descriptions. The explicit UTF-8 encoding keeps TextLoader
# from corrupting words that contain apostrophes.
raw_documents = TextLoader("tagged_description.txt", encoding="utf-8").load()
# Split on the newline separator so that each line becomes its own document.
# chunk_size=1 is the smallest positive size: every description is longer than
# that, so the splitter emits each line separately instead of merging lines.
# (chunk_size=0 is rejected with a ValueError by newer langchain-text-splitters
# releases, which require chunk_size > 0.)
text_splitter = CharacterTextSplitter(chunk_size=1, chunk_overlap=0, separator="\n")
documents = text_splitter.split_documents(raw_documents)


In [None]:
# Sanity check: print the text of the first split document.
print(documents[0].page_content)

In [None]:
# Create document embeddings and store them in a Chroma vector database.
# OpenAIEmbeddings() is constructed with its default settings.
db_books = Chroma.from_documents(documents, 
                                 embedding=OpenAIEmbeddings())

In [None]:
# Try the vector store with a sample query, requesting 10 results (k=10),
# and display the returned documents.
query = "A book to teach children about nature"
docs = db_books.similarity_search(query, k=10)
docs

In [None]:
# Look up the book behind the first search hit:
#   1. take the first document from the result list (docs[0])
#   2. read its text (page_content) and split it on whitespace
#   3. take the first token, which is the ISBN prefix of the description
#   4. strip any surrounding quote characters (the same clean-up that
#      retrieve_semantic_recommendation applies) and convert to int
#   5. select the rows of `books` whose isbn13 equals that number
first_isbn = int(docs[0].page_content.split()[0].strip("\"'"))
books[books["isbn13"] == first_isbn]

In [None]:
def retrieve_semantic_recommendation(query: str, top_k: int = 10) -> pd.DataFrame:
    """
    Retrieve book recommendations based on semantic similarity to a query.

    The vector store is searched for the documents closest to ``query``; the
    ISBN at the start of each hit is mapped back to rows of ``books``. Rows are
    returned in the order of the search results (best match first), not in the
    row order of ``books``.

    Args:
        query (str): Search query describing desired books
        top_k (int): Number of top recommendations to return (default: 10).
            Values <= 0 yield an empty DataFrame.

    Returns:
        pd.DataFrame: Up to top_k rows of ``books``, ordered by search rank
    """
    if top_k <= 0:
        return books.head(0)

    # Fetch at least 50 candidates so hits that cannot be mapped to a book
    # still leave enough results, and never fewer than the caller asked for.
    candidate_count = max(top_k, 50)
    similarity_results = db_books.similarity_search(query, k=candidate_count)

    # Map each ISBN to the rank of its first appearance in the search results.
    isbn_rank = {}
    for doc in similarity_results:
        tokens = doc.page_content.split()
        if not tokens:
            continue
        try:
            # The ISBN may be wrapped in quotes left over from the text export.
            isbn = int(tokens[0].strip("\"'"))
        except ValueError:
            # Chunk does not start with an ISBN, so it cannot identify a book.
            continue
        isbn_rank.setdefault(isbn, len(isbn_rank))

    matches = books[books["isbn13"].isin(list(isbn_rank))]
    # A boolean filter keeps the row order of `books`; reorder positionally by
    # search rank so that head(top_k) really returns the most similar books.
    ranks = matches["isbn13"].map(isbn_rank)
    ordered = matches.iloc[ranks.to_numpy().argsort(kind="stable")]
    return ordered.head(top_k)


In [None]:
# Example: call the helper with its default top_k (10) and display the result.
retrieve_semantic_recommendation("A book to teach children about nature")