In [None]:
from utils_openai import setup_openai_api, create_embeddings, create_llm, load_msme_data
from langchain_experimental.text_splitter import SemanticChunker

# Status line so a top-to-bottom run shows that the import cell finished.
print("[OK] Imports done!")

In [None]:
# Source file handed to load_msme_data; path is relative to the notebook's
# working directory.
MSME_CSV_PATH = 'msme.csv'

# One API key drives both the embedding client and the chat model; the
# resulting objects are reused by every later cell.
api_key = setup_openai_api()
embeddings, llm = create_embeddings(api_key), create_llm(api_key)

# load_msme_data returns three values, unpacked here as docs / metas / ids.
# NOTE(review): presumably parallel lists (text, metadata, id) — confirm
# against utils_openai.
docs, metas, ids = load_msme_data(MSME_CSV_PATH)
print('[OK] Data loaded!')

In [None]:
# The chunker is built on the same embeddings object created in the setup
# cell; it is used below to split the combined corpus.
semantic_chunker = SemanticChunker(embeddings)
print("[OK] Semantic chunker ready!")


In [None]:
# All source documents are merged into one blank-line-separated text and
# handed to the semantic chunker as a single input.
combined_text = '\n\n'.join(docs)
semantic_chunks = semantic_chunker.create_documents([combined_text])

print(f'Original docs: {len(docs)}')
print(f'Semantic chunks: {len(semantic_chunks)}')

# Show the size of the first few chunks (at most five) as a quick sanity check.
print('\nSample chunk lengths:')
for position, chunk in enumerate(semantic_chunks[:5], start=1):
    print(f'  Chunk {position}: {len(chunk.page_content)} chars')

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Baseline splitter settings, passed below as chunk_size / chunk_overlap.
FIXED_CHUNK_SIZE = 500
FIXED_CHUNK_OVERLAP = 100

fixed_splitter = RecursiveCharacterTextSplitter(
    chunk_size=FIXED_CHUNK_SIZE,
    chunk_overlap=FIXED_CHUNK_OVERLAP,
)

# Unlike the semantic path, each source document is split on its own here
# (the list of docs is passed directly, not one combined text).
fixed_chunks = fixed_splitter.create_documents(docs)

# Side-by-side chunk counts for the two strategies.
print(f"Fixed chunks: {len(fixed_chunks)}")
print(f"Semantic chunks: {len(semantic_chunks)}")


In [None]:
from utils_openai import create_vectorstore

# Number of chunks each retriever returns per query.
TOP_K = 3

# Only the raw text of each chunk is indexed; metadata and ids are left unset.
fixed_texts = [chunk.page_content for chunk in fixed_chunks]
semantic_texts = [chunk.page_content for chunk in semantic_chunks]

# Each strategy gets its own collection and its own on-disk directory so the
# two indexes never mix.
# NOTE(review): with a persist directory and ids=None, re-running this cell may
# append the same chunks again — confirm how create_vectorstore handles an
# existing collection.
fixed_vs = create_vectorstore(
    fixed_texts,
    metas=None,
    ids=None,
    embeddings=embeddings,
    collection_name="fixed_chunks",
    persist_directory="./chroma_fixed",
)
semantic_vs = create_vectorstore(
    semantic_texts,
    metas=None,
    ids=None,
    embeddings=embeddings,
    collection_name="semantic_chunks",
    persist_directory="./chroma_semantic",
)

fixed_retriever = fixed_vs.as_retriever(search_kwargs={"k": TOP_K})
semantic_retriever = semantic_vs.as_retriever(search_kwargs={"k": TOP_K})

print("[OK] Retrievers ready!")


In [None]:
# Run one query through both retrievers and print the first 300 characters of
# every hit, so the two chunking strategies can be compared side by side.
query = "How can small businesses access government funding?"

for heading, retriever in (
    ("\n--- FIXED CHUNKS ---", fixed_retriever),
    ("\n--- SEMANTIC CHUNKS ---", semantic_retriever),
):
    print(heading)
    # Retrievers are Runnables: `invoke` replaces the deprecated
    # `get_relevant_documents` and matches how the chains are called later.
    for doc in retriever.invoke(query):
        print(doc.page_content[:300])
        print("-" * 60)


In [None]:
# Evaluation queries keyed by a display label; the comparison loop prints the
# label next to each query.
test_queries = {
    "Policy / Regulation": "What regulations affect small businesses?",
    "Finance":             "Available funding options for MSMEs",
    "Operations":          "Employee management requirements",
}


In [None]:
# For every test query, show a 150-character preview of each hit from both
# retrievers. The loop variable is named `test_query` so it does not overwrite
# the notebook-level `query` defined in an earlier cell.
for label, test_query in test_queries.items():
    print("=" * 80)
    print(f"{label} QUERY: {test_query}")

    # `invoke` is the Runnable entry point for retrievers; it replaces the
    # deprecated `get_relevant_documents`.
    print("\nFixed chunks:")
    fixed_docs = fixed_retriever.invoke(test_query)
    for d in fixed_docs:
        print("-", d.page_content[:150])

    print("\nSemantic chunks:")
    semantic_docs = semantic_retriever.invoke(test_query)
    for d in semantic_docs:
        print("-", d.page_content[:150])


In [None]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# `get_baseline_prompt` was called without being imported anywhere in the
# notebook, so this cell raised NameError on a fresh kernel.
# NOTE(review): assumed to live in utils_openai next to the other helpers —
# confirm the module actually exports it.
from utils_openai import get_baseline_prompt


def _format_docs(retrieved_docs):
    """Join the text of the retrieved documents into one context string.

    Without this step the prompt receives the raw list of Document objects
    and renders their Python repr (metadata, escaped newlines) as context.
    """
    return "\n\n".join(doc.page_content for doc in retrieved_docs)


def _build_rag_chain(retriever, prompt, llm):
    """Assemble a retrieve -> prompt -> LLM -> string pipeline.

    Args:
        retriever: Runnable retriever that supplies the context documents.
        prompt: Prompt that is fed a mapping with "context" and "question".
        llm: Chat model that answers the filled-in prompt.

    Returns:
        A runnable chain; call ``.invoke(question)`` to get the answer text.
    """
    return (
        # The incoming question is both sent to the retriever and passed
        # through unchanged as "question".
        {"context": retriever | _format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )


prompt = get_baseline_prompt()

# Same prompt and model for both chains; only the retriever differs, so any
# difference in the answers comes from the chunking strategy.
fixed_rag = _build_rag_chain(fixed_retriever, prompt, llm)
semantic_rag = _build_rag_chain(semantic_retriever, prompt, llm)


In [None]:
# Ask both RAG chains the same question and print each answer under its own
# heading.
query = "Explain funding opportunities available for MSMEs"

for heading, chain in (
    ("\n--- FIXED CHUNK ANSWER ---", fixed_rag),
    ("\n--- SEMANTIC CHUNK ANSWER ---", semantic_rag),
):
    print(heading)
    print(chain.invoke(query))
