In [1]:
#importing libraries
import pandas as pd
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.output_parsers import StrOutputParser
from sentence_transformers import SentenceTransformer, util
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore 
from langchain.schema import Document


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#importing the list with topics
topics = pd.read_csv('/workspace/topic_identifier/topic_lists.csv')
topics.head()

Unnamed: 0,chunk_id,headings
0,0,The Fall of Man
1,1,The Nature and Acquisition of Salvation
2,2,Confessing Jesus as Lord and Savior
3,3,New Creation in Christ
4,4,Dealing with Sin as a New Believer


In [3]:
#loading the pdf data
loader = PyPDFLoader('/workspace/topic_identifier/data/Copy of TOT DISCIPLESHIP TEACHING MATERIAL (1).pdf')
docs = loader.load()
print(len(docs))

70


In [4]:
#splitting the document into chunks
splitter = RecursiveCharacterTextSplitter(chunk_size = 10_000, chunk_overlap = 500)
chunks = splitter.split_documents(docs)
print(len(chunks))

70


In [5]:
#putting the docs and the topic in a list[dict]
documents = []
for _, row in topics.iterrows():
    chunk_id = row["chunk_id"]
    topic = row["headings"]

    if 0 <= chunk_id < len(chunks):  
        # Create Document with chunk text and metadata
        doc = Document(
            page_content=chunks[chunk_id].page_content,
            metadata={
                "chunk_id": chunk_id + 1, 
                "heading": topic.strip()
            }
        )
        documents.append(doc)
    else:
        print(f"Warning: chunk_id {chunk_id} out of range for chunks list")



In [6]:
print(documents)

[Document(metadata={'chunk_id': 1, 'heading': 'The Fall of Man'}, page_content='TOT  DISCIPLESHIP  TEACHING  MATERIAL  \nSALVATION  &  IDENTITY  \n \nTOPICS  COVERED:  1.  What  is  Salvation?  2.  How  do  you  get  saved?  3.  How  to  deal  with  sin  as  a  new  believer  4.  How  to  walk  like  a  Christian  5.  Habits  of  Grace   \nWhat  is  Salvation?  The  Fall  of  Man  In  Genesis  Chapter  3,  we  learn  about  the  fall  of  man  from  glory  due  to  the  sin  committed  by  \nAdam\n \nand\n \nEve.\n Adam  and  Eve  sinned  by  eating  from  the  tree  (Genesis  3:6)  that  God  had  forbidden  them  to  \neat\n \nfrom\n \n(Genesis\n \n2:16-17):\n “So  when  the  woman  saw  that  the  tree  was  good  for  food,  that  it  was  pleasant  to  \nthe\n \neyes,\n \nand\n \na\n \ntree\n \ndesirable\n \nto\n \nmake\n \none\n \nwise,\n \nshe\n \ntook\n \nof\n \nits\n \nfruit\n \nand\n \nate.\n \nShe\n \nalso\n \ngave\n \nto\n \nher\n \nhusband\n \nwith\n \nher,\n \nand\n \nhe\

In [11]:
first_10 = documents[:10]
print(first_10)

[Document(metadata={'chunk_id': 1, 'heading': 'The Fall of Man'}, page_content='TOT  DISCIPLESHIP  TEACHING  MATERIAL  \nSALVATION  &  IDENTITY  \n \nTOPICS  COVERED:  1.  What  is  Salvation?  2.  How  do  you  get  saved?  3.  How  to  deal  with  sin  as  a  new  believer  4.  How  to  walk  like  a  Christian  5.  Habits  of  Grace   \nWhat  is  Salvation?  The  Fall  of  Man  In  Genesis  Chapter  3,  we  learn  about  the  fall  of  man  from  glory  due  to  the  sin  committed  by  \nAdam\n \nand\n \nEve.\n Adam  and  Eve  sinned  by  eating  from  the  tree  (Genesis  3:6)  that  God  had  forbidden  them  to  \neat\n \nfrom\n \n(Genesis\n \n2:16-17):\n “So  when  the  woman  saw  that  the  tree  was  good  for  food,  that  it  was  pleasant  to  \nthe\n \neyes,\n \nand\n \na\n \ntree\n \ndesirable\n \nto\n \nmake\n \none\n \nwise,\n \nshe\n \ntook\n \nof\n \nits\n \nfruit\n \nand\n \nate.\n \nShe\n \nalso\n \ngave\n \nto\n \nher\n \nhusband\n \nwith\n \nher,\n \nand\n \nhe\

In [7]:
#creating the embedding model
class BGEEmbeddings:
    def __init__(self):
        self.model = SentenceTransformer("BAAI/bge-base-en")
        #embedding the content
    def embed_documents(self, text):
        return self.model.encode(text, batch_size = 8, normalize_embeddings = True).tolist()

        #embedding the query
    def embed_query(self, text):
        return self.model.encode([text], normalize_embeddings = True).tolist()[0]


In [8]:
#initializing the splitter 
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=25)

In [9]:
#initializing the vectorstore
vectorstore = Chroma(
    collection_name  = "topic_centred_chunks",
    embedding_function = BGEEmbeddings(),
    persist_directory = "./test_chroma"
)

  vectorstore = Chroma(


In [10]:
# Add all documents to vectorstore at once
vectorstore.add_documents(documents)

['334fa6dc-95ee-4690-8865-aa9f10ace957',
 '1341d582-f90c-442a-8814-5ce67f2acf3d',
 'e8186ee8-a3ed-4004-998d-4af0db7891f6',
 'c4d20015-3e50-4398-8c4a-e5ede9ac882e',
 'e26e3997-d290-4c5b-9d04-e37da8bf450e',
 '3ab0ba89-e09c-411d-bf46-08c77e155e6e',
 '8b494422-e60e-45a1-b32f-9eaf8ce73d73',
 '4691dbe7-8662-4687-9173-eeee0b085368',
 '6c8463cf-94f4-4115-a188-6f81da150169',
 '45d9b7d8-bfd5-4ad4-a635-2c1b0afa7267',
 '595e343a-d591-4e46-b274-11af4c65389a',
 'da34950a-3d95-489e-a5e8-74ded0a0e1aa',
 '2405e9bc-a82d-4f2f-a3ff-635b5c638c00',
 '1f09aefc-016f-4c52-8919-ea64c0d85e34',
 'c012869b-c478-408e-903c-2d3599d62667',
 '6192fcdf-fd02-46e6-add2-5b4553a7807c',
 '80c59b6f-f37c-493e-84f3-871d8827111e',
 'c1fe8033-0949-4844-a51a-040da0133bd5',
 '6f6ed0c6-61a7-4148-95fb-a0a5dd811529',
 'b42de67e-dcc6-4561-ba4f-7f437e1522eb',
 '3b56e151-5c1e-470e-abd1-2002c7b50719',
 '3380c9aa-9a1c-4af3-9fea-87b33342b0f5',
 'e76f89dc-e56d-42b2-b583-f368bd9d8d21',
 '2514c423-26d8-4d48-a4cd-e1db60c91c72',
 'ef67cb2f-b8d2-