In [1]:
!pip install -U langchain langchain-community langchain-huggingface langgraph chromadb sentence-transformers pydantic




In [1]:
from typing import List
from typing_extensions import TypedDict
from pydantic import BaseModel, Field

from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser

from langchain_community.document_loaders import DirectoryLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

from langchain_huggingface import HuggingFaceEndpoint
from langgraph.graph import StateGraph, START, END





In [3]:
class SafeTextLoader:
    def __init__(self, file_path):
        self.file_path = file_path

    def lazy_load(self):
        try:
            with open(self.file_path, encoding="utf-8", errors="ignore") as f:
                yield Document(
                    page_content=f.read(),
                    metadata={"source": self.file_path}
                )
        except Exception:
            return


In [4]:
DATA_PATH = r"C:\Users\PMLS\Downloads\langgraph\lmkr_data"

loader = DirectoryLoader(
    DATA_PATH,
    glob="**/*.txt",
    loader_cls=SafeTextLoader
)

documents = loader.load()


In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50
)

doc_splits = splitter.split_documents(documents)


In [6]:
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

vectorstore = Chroma.from_documents(doc_splits, embeddings)
retriever = vectorstore.as_retriever(search_kwargs={"k": 8})


  embeddings = HuggingFaceEmbeddings(


In [7]:
from langchain_huggingface import HuggingFaceEndpoint

base_llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.2",
    task="conversational",        # ðŸ”‘ CRITICAL
    temperature=0,
    max_new_tokens=256
)

from langchain_huggingface import ChatHuggingFace

llm = ChatHuggingFace(llm=base_llm)


In [83]:
from langchain_core.prompts import ChatPromptTemplate

rag_prompt = ChatPromptTemplate.from_template("""
You are a helpful assistant.
Answer the question using ONLY the provided context.

Context:
{context}

Question:
{question}

Answer:
""")

rag_chain = rag_prompt | llm


In [102]:
class DocRelevance(BaseModel):
    binary_score: str

doc_relevance_parser = JsonOutputParser(
    pydantic_object=DocRelevance
)

doc_relevance_prompt = ChatPromptTemplate.from_messages([
    ("system",
     "You are a strict binary classifier.\n"
     "Respond with ONLY valid JSON.\n"
     '{{ "binary_score": "yes" | "no" }}'),
    ("human",
     "Question:\n{question}\n\nDocument:\n{document}")
])

doc_relevance_grader = doc_relevance_prompt | llm | JsonOutputParser()


In [103]:
class Groundedness(BaseModel):
    binary_score: str

grounded_parser = JsonOutputParser(pydantic_object=Groundedness)

grounded_prompt = ChatPromptTemplate.from_messages([
    ("system",
     "You are checking whether the answer is grounded in the documents.\n"
     "Respond ONLY in JSON.\n"
     '{{ "binary_score": "yes" | "no" }}'),
    ("human",
     "Documents:\n{documents}\n\nAnswer:\n{generation}")
])

grounded_grader = grounded_prompt | llm | JsonOutputParser()




In [104]:
class AnswerUsefulness(BaseModel):
    binary_score: str

usefulness_parser = JsonOutputParser(pydantic_object=AnswerUsefulness)

usefulness_prompt = ChatPromptTemplate.from_messages([
    ("system",
     "Check whether the answer addresses the question.\n"
     "Respond ONLY in JSON.\n"
     '{{ "binary_score": "yes" | "no" }}'),
    ("human",
     "Question:\n{question}\n\nAnswer:\n{generation}")
])

usefulness_grader = usefulness_prompt | llm | JsonOutputParser()


In [105]:
rewrite_prompt = ChatPromptTemplate.from_messages([
    ("system", "Rewrite the question to improve document retrieval."),
    ("human", "{question}")
])

question_rewriter = rewrite_prompt | llm | StrOutputParser()


In [106]:
from typing import List, TypedDict

class GraphState(TypedDict):
    question: str
    documents: list
    generation: str
    steps: List[str]



In [None]:
def retrieve(state: GraphState):
    steps = state.get("steps", [])
    steps.append("\n---RETRIEVE---")
    steps.append("Node 'retrieve':")

    docs = retriever.invoke(state["question"])

    return {
        "question": state["question"],
        "documents": docs,
        "steps": steps,
    }


def grade_documents(state: GraphState):
    steps = state["steps"]
    steps.append("\n---CHECK DOCUMENT RELEVANCE TO QUESTION---")

    filtered_docs = []

    for d in state["documents"]:
        score = doc_relevance_grader.invoke({
            "question": state["question"],
            "document": d.page_content
        })

        steps.append(
            f"---GRADE: DOCUMENT {'RELEVANT' if score['binary_score']=='yes' else 'NOT RELEVANT'}---"
        )

        if score["binary_score"] == "yes":
            filtered_docs.append(d)

    steps.append("---ASSESS GRADED DOCUMENTS---")

    return {
        **state,
        "documents": filtered_docs,
        "steps": steps
    }




def generate(state: GraphState):
    steps = state["steps"]
    steps.append("\n---GENERATE---")

    generation = rag_chain.invoke({
        "question": state["question"],
        "context": format_docs(state["documents"]),
    })

    steps.append("---CHECK HALLUCINATIONS---")

    grounded = grounded_grader.invoke({
        "documents": format_docs(state["documents"]),
        "generation": generation,
    })


    if grounded["binary_score"] != "yes":
        steps.append("---DECISION: GENERATION IS NOT GROUNDED---")
        return {
            **state,
            "generation": generation,
            "steps": steps,
            "__route__": "regenerate",
        }

    steps.append("---DECISION: GENERATION IS GROUNDED IN DOCUMENTS---")
    steps.append("---GRADE GENERATION VS QUESTION---")

    useful = usefulness_grader.invoke({
        "question": state["question"],
        "generation": generation,
    })

    if useful["binary_score"] == "yes":
        steps.append("---DECISION: GENERATION ADDRESSES QUESTION---")
        return {
            **state,
            "generation": generation,
            "steps": steps,
            "__route__": "end",   
        }

    steps.append("---DECISION: GENERATION DOES NOT ADDRESS QUESTION---")
    return {
        **state,
        "generation": generation,
        "steps": steps,
        "__route__": "rewrite",  
    }



def transform_query(state):
    return {
        "question": question_rewriter.invoke({"question": state["question"]}),
        "documents": []
    }


In [130]:
def decide_to_generate(state):
    # AFTER document grading
    if not state["documents"]:
        return "transform_query"
    return "generate"


def decide_after_generation(state):
    # hallucination check
    if not state["grounded"]:
        return "regenerate"

    # usefulness check
    if not state["useful"]:
        return "transform_query"

    return "end"


def format_docs(docs):
    return "\n\n".join(d.page_content for d in docs)


def grade_generation(state):
    grounded = grounded_grader.invoke({
        "documents": format_docs(state["documents"]),
        "generation": state["generation"]
    })

    if grounded["binary_score"] != "yes":
        return "generate"  

    useful = usefulness_grader.invoke({
        "question": state["question"],
        "generation": state["generation"]
    })

    if useful["binary_score"] == "yes":
        return "useful"    

    return "transform_query"  



In [None]:
from langgraph.graph import StateGraph, END

graph = StateGraph(GraphState)

graph.add_node("retrieve", retrieve)
graph.add_node("grade_documents", grade_documents)
graph.add_node("generate", generate)
graph.add_node("transform_query", transform_query)

graph.add_edge(START, "retrieve")
graph.add_edge("retrieve", "grade_documents")

graph.add_conditional_edges(
    "grade_documents",
    decide_to_generate,
    {
        "generate": "generate",
        "transform_query": "transform_query",
    }
)

graph.add_edge("transform_query", "retrieve")

graph.add_conditional_edges(
    "generate",
    decide_after_generation,
    {
        "regenerate": "generate",
        "transform_query": "transform_query",
        "end": END,
    }
)


app = graph.compile()


In [110]:
from pprint import pprint

result = app.invoke({
    "question": "What are the product and services of lmkr?"
})

# Print execution trace
for step in result["steps"]:
    print(step)

print("\n---FINAL ANSWER---\n")
print(result["generation"])




---RETRIEVE---
Node 'retrieve':

---CHECK DOCUMENT RELEVANCE TO QUESTION---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---ASSESS GRADED DOCUMENTS---
---DECISION: GENERATE---

---GENERATE---
---CHECK HALLUCINATIONS---
---DECISION: GENERATION IS GROUNDED IN DOCUMENTS---
---GRADE GENERATION VS QUESTION---
---DECISION: GENERATION ADDRESSES QUESTION---

---FINAL ANSWER---

content=' LMKR is a petroleum technology company providing reservoir-centric interpretation, modeling and analytics software, mobile technology solutions, and E&P data services. Their offerings aim to lower the risk in the exploration and production of both conventional and unconventional resource plays.' additional_kwargs={} response_metadata={'token_usage': {'completion_tokens': 59, 'prompt_tok

In [126]:
from pprint import pprint

result = app.invoke({
    "question": "Who is the ceo of lmkr?"
})

# Print execution trace
for step in result["steps"]:
    print(step)

print("\n---FINAL ANSWER---\n")
print(result["generation"])




---RETRIEVE---
Node 'retrieve':

---CHECK DOCUMENT RELEVANCE TO QUESTION---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---ASSESS GRADED DOCUMENTS---

---GENERATE---
---CHECK HALLUCINATIONS---
---DECISION: GENERATION IS GROUNDED IN DOCUMENTS---
---GRADE GENERATION VS QUESTION---
---DECISION: GENERATION ADDRESSES QUESTION---

---FINAL ANSWER---

content=' Atif Rais Khan is the CEO of LMKR.' additional_kwargs={} response_metadata={'token_usage': {'completion_tokens': 14, 'prompt_tokens': 0, 'total_tokens': 14}, 'model_name': 'mistralai/Mistral-7B-Instruct-v0.2', 'system_fingerprint': '', 'finish_reason': 'stop', 'logprobs': None} id='lc_run--019b21e9-2bad-7e00-886c-d9fe43180939-0' usage_metadata={'input_tokens': 0, 'output_tokens': 14, 'total_tokens': 14}


In [127]:
from pprint import pprint

result = app.invoke({
    "question": "When was lmkr founded?"
})

# Print execution trace
for step in result["steps"]:
    print(step)

print("\n---FINAL ANSWER---\n")
print(result["generation"])




---RETRIEVE---
Node 'retrieve':

---CHECK DOCUMENT RELEVANCE TO QUESTION---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---ASSESS GRADED DOCUMENTS---

---GENERATE---
---CHECK HALLUCINATIONS---
---DECISION: GENERATION IS GROUNDED IN DOCUMENTS---
---GRADE GENERATION VS QUESTION---
---DECISION: GENERATION ADDRESSES QUESTION---

---FINAL ANSWER---

content=' LMKR was founded in 1994.' additional_kwargs={} response_metadata={'token_usage': {'completion_tokens': 13, 'prompt_tokens': 0, 'total_tokens': 13}, 'model_name': 'mistralai/Mistral-7B-Instruct-v0.2', 'system_fingerprint': '', 'finish_reason': 'stop', 'logprobs': None} id='lc_run--019b21ea-01d8-7360-a98a-98417a125287-0' usage_metadata={'input_tokens': 0, 'output_tokens': 13, 'total_tokens': 13}


In [16]:
doc_relevance_grader.invoke({
    "question": "What products does LMKR offer?",
    "document": documents[0].page_content
})


{'binary_score': 'yes'}

In [128]:
from pprint import pprint

result = app.invoke({
    "question": "What is gverse?"
})

# Print execution trace
for step in result["steps"]:
    print(step)

print("\n---FINAL ANSWER---\n")
print(result["generation"])




---RETRIEVE---
Node 'retrieve':

---CHECK DOCUMENT RELEVANCE TO QUESTION---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---ASSESS GRADED DOCUMENTS---

---GENERATE---
---CHECK HALLUCINATIONS---
---DECISION: GENERATION IS NOT GROUNDED---

---GENERATE---
---CHECK HALLUCINATIONS---
---DECISION: GENERATION IS GROUNDED IN DOCUMENTS---
---GRADE GENERATION VS QUESTION---
---DECISION: GENERATION DOES NOT ADDRESS QUESTION---

---RETRIEVE---
Node 'retrieve':

---CHECK DOCUMENT RELEVANCE TO QUESTION---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---ASSESS GRADED DO

In [132]:
from IPython.display import HTML, display

mermaid = app.get_graph().draw_mermaid()

display(HTML(f"""
<div id="mermaid-container"></div>

<script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.1.10/require.min.js"></script>
<script>
require.config({{
    paths: {{
        mermaid: "https://cdn.jsdelivr.net/npm/mermaid@10/dist/mermaid.min"
    }}
}});

require(["mermaid"], function(mermaid) {{
    mermaid.initialize({{
        startOnLoad: false,
        theme: "default",
        flowchart: {{ curve: "linear" }}
    }});
    document.getElementById("mermaid-container").innerHTML =
        `<pre class="mermaid">{mermaid}</pre>`;
    mermaid.run();
}});
</script>
"""))
