In [1]:
#importing libraires
from langchain.prompts import PromptTemplate
from langchain_ollama import OllamaLLM
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore 
from langchain_core.documents import Document
from typing import List
import chromadb
import json
import re


In [2]:
#initializing the text splitter
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=25)

In [3]:
#initializing the embedder
embedding_model = HuggingFaceEmbeddings(model_name = "BAAI/bge-base-en-v1.5")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
#defining the db
vectorstore = Chroma(
    persist_directory="/workspace/topic_identifier/chroma_db",
    embedding_function=embedding_model

)

  vectorstore = Chroma(


In [5]:
store = InMemoryStore()

In [6]:
#defining the retriever
retriever = ParentDocumentRetriever(
    vectorstore = vectorstore,
    docstore = store, 
    parent_splitter = parent_splitter,
    child_splitter = child_splitter   
)

In [7]:
def get_topics(vectorstore):
    client = chromadb.PersistentClient(path="/workspace/topic_identifier/chroma_db")
    collection_name = "test_documents"
    collection = client.get_collection(collection_name)
    results = collection.get(include=['metadatas'])
    dict_topics = results['metadatas']
    topics = list(set(dict['topic'] for dict in dict_topics ))
    return topics


In [8]:
# #getting the relevant parent documents
# def get_parent_docs(topic: str):
#     query = f"Get information about {topic}"
#     retrieved_docs = retriever.get_relevant_documents(query)
#     return retrieved_docs


In [9]:
def get_parent_docs(topic):
    return retriever.vectorstore.similarity_search(
        query="",  # no embedding query
        k=100,
        filter={"topic": topic}
    )

In [10]:
#extracting  the topics json file from the output
def extract_json_array(output):
    # Look for the first valid JSON array in the output
    match = re.search(r'\[.*?\]', output, re.DOTALL)
    if match:
        try:
            return json.loads(match.group())
        except json.JSONDecodeError:
            print("Matched text is not valid JSON")
            return None
    else:
        print("No JSON array found")
        return None


In [11]:
#generating the questions
def generate_mcqs(topic: str, chunk: str) -> str:
    prompt = f"""
You are a test-set generator. Based on the following content about "{topic}", generate 2 multiple choice questions (each with 4 options and 1 correct answer):

\"\"\"
{chunk}
\"\"\"

Format:
Q: Question?
a) Option A
b) Option B
c) Option C
d) Option D
Answer: <correct letter>
    """.strip()
    model = OllamaLLM(model="deepseek-r1:1.5b")
    chain = prompt | model | StrOutputParser()
    full_question = chain.invoke({"topic": topic, "chunk": chunk})
    return full_question

In [12]:
# #getting the topics
# topics = get_topics(vectorstore)

In [13]:
# Get all topics
topics = get_topics(vectorstore)
all_generated_questions = {}

for topic in topics:
    print(f"Processing topic: {topic}")
    parent_docs = get_parent_docs(topic)
    
    if not parent_docs:
        print(f"No documents found for topic: {topic}")
        continue

    test_questions = []

    for doc in parent_docs:
        quiz = generate_mcqs(topic, doc.page_content)
        if quiz:
            test_questions.append(quiz)
        else:
            print(f"Failed to generate question for topic: {topic}")

    all_generated_questions[topic] = test_questions

# Print final result
print(json.dumps(all_generated_questions, indent=2))

Processing topic: Redemption Through Jesus
No documents found for topic: Redemption Through Jesus
Processing topic: Understanding God Through His Word
No documents found for topic: Understanding God Through His Word
Processing topic: The Attributes of God's Love
No documents found for topic: The Attributes of God's Love
Processing topic: Seeking a Better Country for God's Plan
No documents found for topic: Seeking a Better Country for God's Plan
Processing topic: Examining the Holy Spirit's Role on Earth
No documents found for topic: Examining the Holy Spirit's Role on Earth
Processing topic: Spiritual Growth
No documents found for topic: Spiritual Growth
{}


In [14]:
print(all_generated_questions)

{}
