In [1]:
import os
import glob
import re
import time
from typing import List, Optional, Annotated

import requests
from bs4 import BeautifulSoup
from typing_extensions import TypedDict

from langchain_core.documents import Document
from langchain_core.messages import HumanMessage, AIMessage, BaseMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
from langchain_huggingface import HuggingFaceEndpointEmbeddings

from langchain_community.vectorstores import FAISS
from langchain_community.tools.tavily_search import TavilySearchResults

from langchain_text_splitters import RecursiveCharacterTextSplitter

from langgraph.graph import StateGraph, START, END
from langgraph.graph.message import add_messages
from langgraph.checkpoint.memory import MemorySaver





In [None]:
# ---- API keys ----
#Add you api keys

# ---- Models ----
HF_REPO_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"
EMBED_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"

# ---- Static docs ----
STATIC_DATA_DIR = r"./lmkr_data"       # put your .txt docs here
FAISS_DIR = r"./lmkr_faiss_store"      # persisted FAISS index

# ---- Official sources ----
LMKR_SITE = "https://www.lmkr.com"
BAMBOOHR_URL = "https://lmkr.bamboohr.com/careers"


In [3]:
def trace(node_name: str, info: str = ""):
    if info:
        print(f"→ {node_name:<15} {info}")
    else:
        print(f"→ {node_name}")


In [4]:
base_llm = HuggingFaceEndpoint(
    repo_id=HF_REPO_ID,
    task="text-generation",
    max_new_tokens=600,
    temperature=0.2,
    top_p=0.9,
)

LLM = ChatHuggingFace(llm=base_llm)

embeddings = HuggingFaceEndpointEmbeddings(
    repo_id=EMBED_MODEL_NAME,
    task="feature-extraction",
    huggingfacehub_api_token=os.environ["HF_TOKEN"],
)


In [5]:
def load_static_docs(data_dir: str, chunk_size: int = 300, chunk_overlap: int = 50) -> List[Document]:
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs: List[Document] = []
    for path in glob.glob(os.path.join(data_dir, "*.txt")):
        text = open(path, "r", encoding="utf-8", errors="ignore").read()
        for i, chunk in enumerate(splitter.split_text(text)):
            docs.append(Document(page_content=chunk, metadata={"source": os.path.basename(path), "chunk_id": i}))
    return docs

static_docs = load_static_docs(STATIC_DATA_DIR)
print("Static chunks:", len(static_docs))

if not static_docs:
    raise RuntimeError(f"No .txt files found in {STATIC_DATA_DIR}")

if os.path.exists(FAISS_DIR):
    vectorstore = FAISS.load_local(FAISS_DIR, embeddings, allow_dangerous_deserialization=True)
else:
    vectorstore = FAISS.from_documents(static_docs, embeddings)
    vectorstore.save_local(FAISS_DIR)

retriever = vectorstore.as_retriever(search_kwargs={"k": 8})
print("FAISS ready.")


Static chunks: 721
FAISS ready.


In [6]:
class ChatState(TypedDict):
    user_message: str
    messages: Annotated[List[BaseMessage], add_messages]

    # evidence flowing through the pipeline
    documents: Optional[List[Document]]
    retrieved_docs: Optional[List[Document]]

    # CRAG control
    use_web_search: Optional[bool]       # only grade_documents sets this
    rewritten_query: Optional[str]
    path_taken: Optional[str]            # faiss / web_search / bamboohr


In [7]:
def is_careers_query(text: str) -> bool:
    q = text.lower()
    return any(w in q for w in ["job", "jobs", "career", "careers", "opening", "openings", "hiring", "vacancy", "vacancies"])

# --- Selenium (for JS-rendered BambooHR) ---
SELENIUM_AVAILABLE = True
try:
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.common.by import By
except Exception:
    SELENIUM_AVAILABLE = False

def fetch_bamboohr_jobs() -> List[str]:
    """
    Extract job titles from BambooHR careers page.
    Uses Selenium because BambooHR is often JS-rendered.
    No hard-coded titles.
    """
    if not SELENIUM_AVAILABLE:
        # If selenium isn't available, this will usually NOT work for JS-rendered pages.
        # We keep it graceful: return empty.
        return []

    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")

    driver = webdriver.Chrome(options=options)
    driver.get(BAMBOOHR_URL)
    time.sleep(5)  # allow JS to render

    els = driver.find_elements(By.CSS_SELECTOR, "a[href*='/careers/']")
    titles = []
    for el in els:
        t = el.text.strip()
        if t:
            titles.append(t)

    driver.quit()

    # dedupe, preserve order
    seen = set()
    out = []
    for t in titles:
        if t not in seen:
            seen.add(t)
            out.append(t)
    return out


In [17]:
from urllib.parse import urlparse

ALLOWED_HOSTS = {"lmkr.com", "www.lmkr.com"}

def is_allowed_lmkr_url(url: str) -> bool:
    try:
        host = urlparse(url).netloc.lower()
        return host in ALLOWED_HOSTS
    except Exception:
        return False

def fetch_page_text(url: str, timeout: int = 15) -> str:
    """Fetch the page and extract readable text (fallback when Tavily snippet is too short)."""
    try:
        resp = requests.get(url, timeout=timeout, headers={"User-Agent": "Mozilla/5.0"})
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        text = " ".join(soup.get_text(" ").split())
        return text
    except Exception:
        return ""


In [8]:
def retrieve_node(state: ChatState) -> ChatState:
    trace("retrieve", "(faiss)")
    query = state["user_message"]

    docs = retriever.invoke(query)

    # DISCIPLINE: retrieve does NOT decide web search.
    # Hygiene: clear stale flags without deciding.
    return {
        "documents": docs,
        "retrieved_docs": docs,
        "use_web_search": None,
        "rewritten_query": None,
        "path_taken": "faiss",
    }


In [9]:
def grade_documents_node(state: ChatState) -> ChatState:
    """
    CRAG discipline:
    - This node is the ONLY place that sets use_web_search True/False.
    - It grades ANSWERABILITY (complete + confident) from retrieved docs.
    - If NO => wipe docs and trigger rewrite+web.
    """
    docs = state.get("documents") or []
    question = state["user_message"]

    # If nothing retrieved, must correct
    if not docs:
        trace("grade_documents")
        return {**state, "use_web_search": True, "documents": [], "path_taken": "web_search"}

    context = "\n\n---\n\n".join(d.page_content for d in docs[:8])

    system_prompt = """
You are the CRAG gatekeeper.

Decide if the user's question can be answered COMPLETELY and CONFIDENTLY
using ONLY the provided retrieved context.

Rules:
- If the question asks for "current", "latest", "today", "now", or other time-sensitive info,
  answer YES only if the context clearly contains up-to-date information that supports it.
- If the context is incomplete, outdated, generic, or uncertain, answer NO.
- If the context contradicts itself or does not directly support an answer, answer NO.

Output ONLY: YES or NO
"""

    prompt = ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        ("human", "Question:\n{question}\n\nRetrieved context:\n{context}")
    ])

    verdict = (prompt | LLM | StrOutputParser()).invoke({"question": question, "context": context}).strip().lower()
    use_web = (verdict != "yes")

    trace("grade_documents")

    if use_web:
        # DISCIPLINE: once we correct, discard internal docs
        return {
            **state,
            "use_web_search": True,
            "documents": [],
            "path_taken": "web_search",
        }

    return {
        **state,
        "use_web_search": False,
        "path_taken": "faiss",
    }


In [10]:
def rewrite_query_node(state: ChatState) -> ChatState:
    trace("rewrite_query")
    q = state["user_message"]

    # Keep rewrite minimal and domain-restricted
    # (No “analysis”, just a safe query shaping)
    rewritten = f"site:lmkr.com {q}"

    # If it is careers, we still allow rewrite but the web_search_node will override to BambooHR extraction
    return {**state, "rewritten_query": rewritten}


In [18]:
web_search_tool = TavilySearchResults(k=5)

def web_search_node(state: ChatState) -> ChatState:
    trace("web_search")

    user_q = state["user_message"]
    query = state.get("rewritten_query") or f"site:lmkr.com {user_q}"

    # Careers override stays as you had it (bamboohr) — unchanged:
    if is_careers_query(user_q):
        jobs = fetch_bamboohr_jobs()
        content = (
            "Current open positions at LMKR (BambooHR):\n" + "\n".join(f"- {j}" for j in jobs)
            if jobs else
            "Unable to confirm current openings from BambooHR right now."
        )
        doc = Document(page_content=content, metadata={"source": BAMBOOHR_URL})
        return {**state, "documents": [doc], "retrieved_docs": [doc], "path_taken": "bamboohr"}

    # Normal LMKR-only web search
    results = web_search_tool.invoke({"query": query})

    docs: List[Document] = []
    for r in results:
        url = (r.get("url") or "").strip()
        snippet = (r.get("content") or "").strip()

        # ✅ LMKR-only by host (not fragile startswith)
        if not url or not is_allowed_lmkr_url(url):
            continue

        # ✅ If snippet is short, fetch the page text
        text = snippet if len(snippet) >= 80 else fetch_page_text(url)

        # Keep even shorter pages if we have *some* content
        if not text or len(text) < 50:
            continue

        docs.append(Document(page_content=text, metadata={"source": url}))

    return {
        **state,
        "documents": docs,
        "retrieved_docs": docs,
        "path_taken": "web_search",
    }



In [19]:
def generate_node(state: ChatState) -> ChatState:
    trace("generate")

    docs = state.get("documents") or []
    if not docs:
        return {**state, "messages": [AIMessage(content="I don’t have enough verified information to answer that.")]}

    # DISCIPLINE: generator uses ONLY whatever docs are currently in state
    # (Either FAISS docs or Web docs, never mixed if discipline is maintained.)
    context = "\n\n".join(d.page_content for d in docs)

    system_prompt = """
Answer using ONLY the provided context.

Rules:
- Do not invent facts.
- If the answer is not explicitly supported by the context, say you cannot confirm it.
- Keep the answer concise and factual.
"""

    prompt = ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        ("human", "Context:\n{context}\n\nQuestion:\n{question}")
    ])

    answer = (prompt | LLM | StrOutputParser()).invoke({
        "context": context,
        "question": state["user_message"]
    })

    return {**state, "messages": [AIMessage(content=answer)]}


In [20]:
def decide_next(state: ChatState) -> str:
    # Discipline: ONLY use_web_search controls this branch
    return "rewrite_query" if state.get("use_web_search") else "generate"

workflow = StateGraph(ChatState)

workflow.add_node("retrieve", retrieve_node)
workflow.add_node("grade_documents", grade_documents_node)
workflow.add_node("rewrite_query", rewrite_query_node)
workflow.add_node("web_search", web_search_node)
workflow.add_node("generate", generate_node)

workflow.add_edge(START, "retrieve")
workflow.add_edge("retrieve", "grade_documents")

workflow.add_conditional_edges(
    "grade_documents",
    decide_next,
    {
        "rewrite_query": "rewrite_query",
        "generate": "generate",
    },
)

workflow.add_edge("rewrite_query", "web_search")
workflow.add_edge("web_search", "generate")
workflow.add_edge("generate", END)

app = workflow.compile(checkpointer=MemorySaver())
print("CRAG graph compiled.")


CRAG graph compiled.


In [14]:
from IPython.display import HTML, display

mermaid_src = app.get_graph().draw_mermaid()

display(HTML(f"""
<div id="mermaid-container"></div>

<script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.1.10/require.min.js"></script>
<script>
require.config({{
    paths: {{
        mermaid: "https://cdn.jsdelivr.net/npm/mermaid@10/dist/mermaid.min"
    }}
}});

require(["mermaid"], function(mermaidLib) {{
    mermaidLib.initialize({{
        startOnLoad: false,
        theme: "default",
        flowchart: {{ curve: "linear" }}
    }});

    document.getElementById("mermaid-container").innerHTML =
        `<pre class="mermaid">{mermaid_src}</pre>`;

    mermaidLib.run();
}});
</script>
"""))


In [15]:
def run_query(q: str, thread_id: str = "lmkr_crag_session"):
    print("\n========== EXECUTION TRACE ==========\n")
    state = {"user_message": q, "messages": [HumanMessage(content=q)]}

    final_state = None
    for step in app.stream(state, config={"thread_id": thread_id}):
        # We DO NOT print node names here (to avoid messy duplication).
        # Each node prints its own clean trace line.
        for _, out in step.items():
            final_state = out

    print("\n========== FINAL ANSWER ==========\n")
    print(final_state["messages"][-1].content)
    return final_state

# Try:
run_query("when was LMKR founded?")




→ retrieve        (faiss)
→ grade_documents
→ generate


1994.


{'user_message': 'when was LMKR founded?',
 'messages': [AIMessage(content='1994.', additional_kwargs={}, response_metadata={}, id='3a2b38e7-e3a2-4d79-9132-d835c03f002d')],
 'documents': [Document(id='d5238e4c-d684-482a-b313-cc4fd367f123', metadata={'source': 'partnership.txt', 'chunk_id': 0}, page_content='Key LMKR Partnerships and Collaborations:'),
  Document(id='e76e034f-83bf-47c9-a58a-838e4f9d6e17', metadata={'source': 'info.txt', 'chunk_id': 18}, page_content='LMKR is a diversified global technology company originally focused on geo-technology and petroleum-related IT / E&P services. Over time, under the leadership of Atif Rais Khan, the group expanded and restructured to include LMKT (a general-purpose ICT / technology services company) and TRVERSE (a'),
  Document(id='edb12d6c-f3e0-4665-a811-430aa80b4d47', metadata={'source': 'info.txt', 'chunk_id': 0}, page_content='Company: LMKR  \nFounded: 1994  \nType: Private, global provider of geo-technology and information technology se

In [21]:
def run_query(q: str, thread_id: str = "lmkr_crag_session"):
    print("\n========== EXECUTION TRACE ==========\n")
    state = {"user_message": q, "messages": [HumanMessage(content=q)]}

    final_state = None
    for step in app.stream(state, config={"thread_id": thread_id}):
        # We DO NOT print node names here (to avoid messy duplication).
        # Each node prints its own clean trace line.
        for _, out in step.items():
            final_state = out

    print("\n========== FINAL ANSWER ==========\n")
    print(final_state["messages"][-1].content)
    return final_state

# Try:
run_query("what are the recent announcements?")




→ retrieve        (faiss)
→ grade_documents
→ rewrite_query
→ web_search
→ generate


Recent Announcements include:

1. LMKR Achieves ISO 9001 & ISO 27001 Certification Across USA, UAE & Pakistan
2. LMKR, Oracle Strengthen Cloud Relationship
3. LMKR Partners with LG&HTP to Modernize Karachi’s Property Tax System
4. LMKR, PULSE Partner to Transform Punjab’s Land Record Systems with Next-Gen LRMIS
5. Release of GVERSE GeoGraphix 2022.1
6. LMKR Empowers GEPCO to Revolutionize Distribution Network Management with Advanced GIS Solution
7. LMKR 2023 Townhall: A Year of Expanding Horizons
8. Release of GVERSE Geophysics and GeoGraphix 2017.2
9. Launch of GVERSE GO, a pay per use subscription program for instant access to latest software
10. Release of GVERSE Applications Suite
11. Release of GeoGraphix 2015
12. LMKR Announces new partnerships, new advanced geophysical capabilities and a new GeoGraphix update
13. Release of GVERSE GeoGraphix Release 2026.1


{'user_message': 'what are the recent announcements?',
 'messages': [AIMessage(content='Recent Announcements include:\n\n1. LMKR Achieves ISO 9001 & ISO 27001 Certification Across USA, UAE & Pakistan\n2. LMKR, Oracle Strengthen Cloud Relationship\n3. LMKR Partners with LG&HTP to Modernize Karachi’s Property Tax System\n4. LMKR, PULSE Partner to Transform Punjab’s Land Record Systems with Next-Gen LRMIS\n5. Release of GVERSE GeoGraphix 2022.1\n6. LMKR Empowers GEPCO to Revolutionize Distribution Network Management with Advanced GIS Solution\n7. LMKR 2023 Townhall: A Year of Expanding Horizons\n8. Release of GVERSE Geophysics and GeoGraphix 2017.2\n9. Launch of GVERSE GO, a pay per use subscription program for instant access to latest software\n10. Release of GVERSE Applications Suite\n11. Release of GeoGraphix 2015\n12. LMKR Announces new partnerships, new advanced geophysical capabilities and a new GeoGraphix update\n13. Release of GVERSE GeoGraphix Release 2026.1', additional_kwargs={

In [23]:
def run_query(q: str, thread_id: str = "lmkr_crag_session"):
    print("\n========== EXECUTION TRACE ==========\n")
    state = {"user_message": q, "messages": [HumanMessage(content=q)]}

    final_state = None
    for step in app.stream(state, config={"thread_id": thread_id}):
        # We DO NOT print node names here (to avoid messy duplication).
        # Each node prints its own clean trace line.
        for _, out in step.items():
            final_state = out

    print("\n========== FINAL ANSWER ==========\n")
    print(final_state["messages"][-1].content)
    return final_state

# Try:
run_query("what is gverse?")




→ retrieve        (faiss)
→ grade_documents
→ generate


GVERSE is a software designed for asset teams, providing fast, on-the-fly solutions with real-time data to maximize productivity without raising costs.


{'user_message': 'what is gverse?',
 'messages': [AIMessage(content='GVERSE is a software designed for asset teams, providing fast, on-the-fly solutions with real-time data to maximize productivity without raising costs.', additional_kwargs={}, response_metadata={}, id='f00cfa9a-b5ce-4c66-83f8-f5fca8237e2f')],
 'documents': [Document(id='43129491-bae4-47d6-bed7-f6332d2f154f', metadata={'source': 'post_275.txt', 'chunk_id': 5}, page_content='for asset teams. It has been designed specifically for users who need fast, on the fly solutions that provide data in real time enabling users to maximize productivity without raising costs. Availability GVERSE is available worldwide from November 2015. For more details visit our website.'),
  Document(id='710cb022-a30b-4da4-a50f-047f292eeedd', metadata={'source': 'post_217.txt', 'chunk_id': 3}, page_content='cost model and efficiency of the cloud. GVERSE Go offers an experience of using pay-per-use licensing model without moving the data outside of