In [2]:
from pathlib import Path

In [None]:
from docling.document_converter import DocumentConverter
from langchain_text_splitters.character import RecursiveCharacterTextSplitter

In [4]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.vectorstores import SKLearnVectorStore
from langchain_nomic.embeddings import NomicEmbeddings

In [5]:
from langchain_ollama import ChatOllama
# Ollama model tag used by both chat clients defined below.
local_llm = "llama3.2:3b-instruct-fp16"

# Arguments common to both clients.
_chat_kwargs = dict(model=local_llm, temperature=0)

# Plain-text client.
llm = ChatOllama(**_chat_kwargs)
# Same configuration, additionally requesting the "json" output format.
llm_json_mode = ChatOllama(format="json", **_chat_kwargs)


In [None]:
# Input PDF, given as a path relative to the notebook's working directory.
source = "ambuja test-rotated.pdf"  # document per local path or URL
# Default-configured Docling converter; no options are passed.
converter = DocumentConverter()
result = converter.convert(source)

# `result.document` is exported to markdown and written to disk in the next cell.

In [7]:
# Persist the converted document as markdown so the loader cell can read it back.
output_dir = Path("scratch")
# Create the directory if it is missing: on a fresh checkout / clean kernel run
# opening a file inside a non-existent directory raises FileNotFoundError.
output_dir.mkdir(parents=True, exist_ok=True)

# Single source of truth for the markdown path (same value as "scratch/markdown.md").
document_path = output_dir / "markdown.md"
document_path.write_text(result.document.export_to_markdown(), encoding="utf-8")

In [8]:
# Read the markdown file at `document_path` back in through the Unstructured loader.
loader = UnstructuredMarkdownLoader(document_path)
# presumably a list of LangChain Document objects (it is passed to
# `split_documents` in the next cell) — verify against the loader docs
loaded_documents = loader.load()

In [None]:
# Chunking configuration for the splitter below.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
# Break the loaded markdown into overlapping chunks; these are what get embedded.
docs = text_splitter.split_documents(loaded_documents)


In [12]:
# Build the embedding model first, then hand it to the vector store.
nomic_embeddings = NomicEmbeddings(
    model="nomic-embed-text-v1.5",
    inference_mode="local",
)
# Index every chunk in `docs` with those embeddings.
vectorstore = SKLearnVectorStore.from_documents(documents=docs, embedding=nomic_embeddings)


In [13]:
# Return the 3 most similar chunks per query. The result count must be passed
# through `search_kwargs`; a bare `k=3` keyword is not a retriever field and
# does not change how many documents are returned.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

In [None]:
from langchain_core.messages import HumanMessage,SystemMessage
## Generation: prompt template for the answer step.
## `{context}` and `{question}` are filled in later with `str.format`.
rag_prompt = """You are an assistant for question-answering tasks.

Here is the context to use to answer the question:

{context}

Think Carefully about the above context.

Now, review the user question:

{question}

Provide an answer to this question using only the above context.

Use three sentences maximum and keep the answer concise.

Answer:"""

## Post-processing
def format_docs(docs):
    """Return the `page_content` of every item in `docs`, joined by blank lines."""
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

question = "How many independent directors are in the board?"

# Smoke test of the full RAG path: retrieve -> build prompt -> generate.
docs_ret = retriever.invoke(question)
# Format the *retrieved* chunks. `docs` holds every chunk of the document, so
# passing it here would bypass retrieval and put the whole file in the prompt.
docs_txt = format_docs(docs_ret)
rag_prompt_formatted = rag_prompt.format(context=docs_txt, question=question)
generation = llm.invoke([HumanMessage(content=rag_prompt_formatted)])
print(generation.content)
