In [1]:
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.output_parsers import StrOutputParser
from sentence_transformers import SentenceTransformer, util
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_ollama.llms import OllamaLLM
from langchain.prompts import ChatPromptTemplate
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore 
from dotenv import load_dotenv
from langchain_core.documents import Document
import time
import json
import re
import os


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#loading the document
loader = PyPDFLoader("/workspace/topic_identifier/data/Copy of TOT DISCIPLESHIP TEACHING MATERIAL (1).pdf")
docs = loader.load()

In [3]:
#splitting the document into chunks
splitter = RecursiveCharacterTextSplitter(chunk_size = 10_000, chunk_overlap = 500)
chunks = splitter.split_documents(docs)
print(len(chunks))

70


In [4]:
#setting up the prompt topic identifier
prompt = ChatPromptTemplate.from_template("""
    You are an expert at summarizing document sections.
Given the following page text:

{page_text}

Provide ONE short, clear topic title for this page.
Return ONLY the topic string without quotes or extra text.
    """
)
model = OllamaLLM(model="gemma3:latest")

chain = prompt | model | StrOutputParser()


In [5]:
#extracting  the topics json file from the output
def extract_json_array(output):
    # Look for the first valid JSON array in the output
    match = re.search(r'\[.*?\]', output, re.DOTALL)
    if match:
        try:
            return json.loads(match.group())
        except json.JSONDecodeError:
            print("Matched text is not valid JSON")
            return None
    else:
        print("No JSON array found")
        return None


In [6]:
#creating the outlines
sub_outlines = []

for i, chunk in enumerate(chunks):
    page_text = chunk.page_content
    try:
        print(f"\nSending chunk {i+1}/{len(chunks)} to LLM...")
        result = chain.invoke({"page_text": page_text})
        # Parse the JSON output safely
        try:
            sub_outlines.append({
                "chunk_id": i,
                "headings": result
            })
        except json.JSONDecodeError:
            print(f"Failed to parse JSON from chunk {i+1}. Output was:\n{result}")
            continue

        time.sleep(1.5)  
    except Exception as e:
        print(f"Error in chunk {i+1}: {e}")


Sending chunk 1/70 to LLM...



Sending chunk 2/70 to LLM...

Sending chunk 3/70 to LLM...

Sending chunk 4/70 to LLM...

Sending chunk 5/70 to LLM...

Sending chunk 6/70 to LLM...

Sending chunk 7/70 to LLM...

Sending chunk 8/70 to LLM...

Sending chunk 9/70 to LLM...

Sending chunk 10/70 to LLM...

Sending chunk 11/70 to LLM...

Sending chunk 12/70 to LLM...

Sending chunk 13/70 to LLM...

Sending chunk 14/70 to LLM...

Sending chunk 15/70 to LLM...

Sending chunk 16/70 to LLM...

Sending chunk 17/70 to LLM...

Sending chunk 18/70 to LLM...

Sending chunk 19/70 to LLM...

Sending chunk 20/70 to LLM...

Sending chunk 21/70 to LLM...

Sending chunk 22/70 to LLM...

Sending chunk 23/70 to LLM...

Sending chunk 24/70 to LLM...

Sending chunk 25/70 to LLM...

Sending chunk 26/70 to LLM...

Sending chunk 27/70 to LLM...

Sending chunk 28/70 to LLM...

Sending chunk 29/70 to LLM...

Sending chunk 30/70 to LLM...

Sending chunk 31/70 to LLM...

Sending chunk 32/70 to LLM...

Sending chunk 33/70 to LLM...

Sending chunk 3

In [None]:
import pandas as pd
topic_dataframe = pd.DataFrame(sub_outlines)
topic_dataframe.to_csv("topic_lists.csv", index=False)
print("Saved topic list to topic_lists.csv")

NameError: name 'topic_dataframe' is not defined

In [9]:
topic_dataframe.shape

(70, 2)

In [21]:
# 1. Use BGE embeddings (better than MiniLM)
embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

In [22]:
#adding the embeddings to a vectorstore
vectorstore = Chroma(
    collection_name = "test_documents",
    embedding_function = embedding_model,
    persist_directory="./chroma_db"

)

  vectorstore = Chroma(


In [23]:
#creating a retriever
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=25)

In [24]:
store = InMemoryStore()  
retriever = ParentDocumentRetriever(
            vectorstore=vectorstore,
            docstore=store,
            child_splitter=child_splitter,
            parent_splitter=parent_splitter
        )


In [25]:
# Store documents with metadata
texts = [c['text'] for c in tagged_chunks]
metadatas = [{'chunk_id': c['chunk_id'], 'topic': c['topic']} for c in tagged_chunks]
documents = [Document(page_content=text, metadata=meta) for text, meta in zip(texts, metadatas)]

# Add to vectorstore
vectorstore.add_documents(documents)
vectorstore.persist()

  vectorstore.persist()
