In [385]:
import os 
import json
from langgraph.graph import StateGraph,START,END
from typing import TypedDict,Dict,List,Any
import pymongo
from langchain_openai import AzureChatOpenAI
from langchain.prompts import PromptTemplate

from dotenv import load_dotenv
from datetime import datetime
import re
from config import ConfigData

In [386]:
# Load environment variables before any os.getenv lookups are made.
# NOTE(review): presumably reads a local .env file (python-dotenv default) — confirm its location.
load_dotenv()

True

In [387]:
class MongoDBAgent:
    """Thin convenience wrapper around a single MongoDB database.

    Keeps the client, the selected database handle and the name of the
    collection to fall back on when no other collection is requested.
    """

    def __init__(self, uri: str, db_name: str, default_collection: str):
        """Create a client for ``uri``, select ``db_name`` and remember the default collection name."""
        import pymongo

        client = pymongo.MongoClient(uri)
        self.client = client
        self.db = client[db_name]
        self.default_collection_name = default_collection

    def list_collections(self):
        """Return the collection names reported by the selected database."""
        return self.db.list_collection_names()

    def get_collection(self, name: str):
        """Return the collection handle stored under ``name``."""
        return self.db[name]

    def validate_collection(self, name: str):
        """Return True when ``name`` is one of the database's collection names."""
        existing_names = self.list_collections()
        return name in existing_names

    def get_default_collection(self):
        """Return the default collection as a pymongo Collection object."""
        fallback_name = self.default_collection_name
        return self.db[fallback_name]

# Connection settings. Each value may be overridden through an environment
# variable; the literals are the fallbacks used when the variable is unset.
# NOTE(review): the URI path names "mydatabase" while MONGO_DB is "ASD". The
# database actually selected is MONGO_DB (MongoDBAgent indexes the client by
# db_name) — confirm the URI path is intentional.
MONGO_URI = os.getenv("MONGO_URI", "mongodb://localhost:27017/mydatabase")
MONGO_DB = os.getenv("MONGO_DB", "ASD")
# Collection used when the question names no date or the requested one does not exist.
DEFAULT_COLLECTION = os.getenv("MONGO_DEFAULT_COLLECTION", "2025-05-13")


# Module-level agent shared by the graph nodes, plus a handle on the default collection.
mongo_agent = MongoDBAgent(MONGO_URI, MONGO_DB, DEFAULT_COLLECTION)
collection = mongo_agent.get_default_collection()

In [388]:
# Chat model instance; deployment name and API version are read from the environment.
# NOTE(review): endpoint and API key are not passed here — presumably picked up by
# langchain_openai from AZURE_OPENAI_ENDPOINT / AZURE_OPENAI_API_KEY; confirm.
llm = AzureChatOpenAI(
    azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    temperature=0.3
)

In [389]:

class State(TypedDict, total=False):
    """State dictionary used as the graph schema; every key is optional (total=False)."""

    # Which collection to use.
    date: str
    # True if a formula/report is detected.
    formula_included: bool
    # List of formula(s) matched from the JSON file.
    formula_list_included: list[dict]
    # MongoDB query generated.
    query: str
    # Query result.
    result: Any
    # The user's question.
    user_question: str



In [390]:
# Prompt asking the model to answer with a single YYYY-MM-DD collection name.
# Placeholders: user_question, default_collection, today.
date_prompt = PromptTemplate(
    template="""
You are an intelligent assistant that selects the correct MongoDB collection name.

Rules:
- If the user mentions "today" or "todays", return today's date in format YYYY-MM-DD.
- If the user mentions a specific date (e.g., 2025-08-01, 1st August 2025, Aug 1 2025), convert it to YYYY-MM-DD format.
- If no date is mentioned, return the default collection: {default_collection}.
- Output ONLY the date string, nothing else.

Today's Date: {today}

User Question: {user_question}
""",
    input_variables=["user_question", "default_collection", "today"]
)

In [391]:
def date_agent(state: State) -> State:
    """Resolve which date-named collection the user's question refers to.

    Formats ``date_prompt`` with the question, the default collection name and
    today's date, sends it to the LLM, and stores the first ``YYYY-MM-DD``
    token found in the reply under ``state["date"]``.  When the reply contains
    no such token, ``DEFAULT_COLLECTION`` is stored instead, so free-form model
    text is never passed on as a collection name.

    Whether the collection actually exists is not checked here.

    Args:
        state: Graph state; ``user_question`` is read (defaults to ``""``).

    Returns:
        The same state dict with ``date`` set.
    """
    user_q = state.get("user_question", "")

    prompt_text = date_prompt.format(
        user_question=user_q,
        default_collection=DEFAULT_COLLECTION,
        today=datetime.now().strftime("%Y-%m-%d"),
    )

    raw = llm.invoke(prompt_text).content.strip()

    # The model is asked to output only the date, but replies may still carry
    # quotes, punctuation or a short sentence; keep just the date token.
    match = re.search(r"\d{4}-\d{2}-\d{2}", raw)
    state["date"] = match.group(0) if match else DEFAULT_COLLECTION
    return state

In [392]:
# Load the formula definitions; the node that matches formulas looks them up by key.
# The encoding is pinned because JSON text is UTF-8 and open() otherwise falls
# back to the platform's default encoding.
with open("formulas.json", "r", encoding="utf-8") as f:
    FORMULAS = json.load(f)

In [393]:
# Prompt asking the model to name the formula(s) the question refers to, or "NONE".
# Placeholders: formula_keys, user_question.
formula_prompt = PromptTemplate.from_template("""
You are an expert in parcel sorting systems and PLC-based data analysis. 
Your task is to detect if the user explicitly requests an analysis formula.

Available formulas: {formula_keys}

User Question: {user_question}

Rules:
- Carefully interpret the user request in the context of parcel sorting and reporting.
- Select one or more formula names from the list if the request is clearly referring to them,
  even if the wording is slightly different (e.g., "sortation result" = "sort_report").
- Only output formula names that semantically match the user request.
- If none match, output exactly "NONE".
- Output must contain only the formula name(s) or "NONE" (no extra text).
""")


def check_if_formula_matches(state: State) -> State:
    """Detect which predefined formulas, if any, the user's question refers to.

    The LLM is shown the keys of ``FORMULAS`` and asked to reply with matching
    names or ``NONE``.  The reply is split on commas and newlines; each piece is
    accepted if it is a key of ``FORMULAS``, either as-is or after removing
    surrounding brackets, quotes and backticks (the key list is shown to the
    model as a Python list, so it may echo that decoration).  Duplicates are
    dropped while keeping first-seen order.

    Args:
        state: Graph state; ``user_question`` is read (defaults to ``""``).

    Returns:
        The same state dict with ``formula_included`` (bool) and
        ``formula_list_included`` (list of ``{name: FORMULAS[name]}`` dicts) set.
    """
    user_q = state.get("user_question", "")

    prompt_text = formula_prompt.format(
        formula_keys=list(FORMULAS.keys()),
        user_question=user_q
    )

    raw = llm.invoke(prompt_text).content.strip()

    matched = []
    if raw.upper() != "NONE":
        for piece in re.split(r"[,\n]", raw):
            name = piece.strip()
            if name not in FORMULAS:
                # Only strip decoration when the plain token is not itself a key.
                name = name.strip("[]'\"` \t")
            if name in FORMULAS and name not in matched:
                matched.append(name)

    # An empty match list covers both "NONE" and replies naming unknown formulas.
    state["formula_included"] = bool(matched)
    state["formula_list_included"] = [{name: FORMULAS[name]} for name in matched]

    return state

In [394]:
from config import ConfigData
# Schema text taken from the project config; table_schema is interpolated into
# the query-generation prompt.
table_schema = ConfigData.TABLE_SCHEMA
# NOTE(review): schema_description is not referenced by any other visible cell — confirm it is still needed.
schema_description = ConfigData.SCHEMA_DESCRIPTION

In [None]:
def _strip_code_fences(text: str) -> str:
    """Remove Markdown code fences, including the language tag of an opening fence."""
    # The optional group only consumes a tag that is followed by a newline, so
    # code written on the same line as the fence is left intact.
    return re.sub(r"```(?:[A-Za-z0-9_+-]*[ \t]*\r?\n)?", "", text).strip()


def _ensure_pipeline_assignment(code: str) -> str:
    """Prefix ``pipeline = `` unless some line of ``code`` already assigns ``pipeline``."""
    if re.search(r"^[ \t]*pipeline\s*=(?!=)", code, flags=re.MULTILINE):
        return code
    return f"pipeline = {code}"


def generate_query_from_formula(state: State) -> State:
    """Ask the LLM for Python code that builds a MongoDB aggregation pipeline.

    The prompt contains the user's question, ``table_schema`` and the formulas
    previously matched (or ``None``).  The reply is cleaned of Markdown code
    fences and, if it does not assign a variable named ``pipeline`` anywhere,
    is prefixed with ``pipeline = `` so the result can be executed later.

    Args:
        state: Graph state; ``user_question`` (defaults to ``""``) and
            ``formula_list_included`` (defaults to ``[]``) are read.

    Returns:
        The same state dict with ``query`` set to the generated Python source.
    """
    user_q = state.get("user_question", "")
    included_formulas = state.get("formula_list_included", [])

    # Format formulas nicely for LLM
    formulas_text = "\n".join(str(f) for f in included_formulas) if included_formulas else "None"

    prompt_text = f"""
You are a MongoDB aggregation pipeline generator.

User Question:
{user_q}

MongoDB Schema:
{table_schema}

Sample formulas that might be useful:
{formulas_text}

⚠️ Rules:
1. Only return valid Python code. don t return anything else.
2. Do NOT use triple backticks (```).
3. Do NOT use print().
4. Always assign the pipeline to a variable called `pipeline`.
6. You can refere sample formulas above to construct the pipeline.and make changes as per user question.
7. don't mention any date in query
8. if there are more than one formula in sample formula you have to combine them to make query of combination of them.

9. Assume later the query will be executed as:
       result = list(collection.aggregate(pipeline))
10. Do NOT generate `collection.find()` or `count_documents()`, only aggregation pipelines.
11. Do NOT consider any date filter; collection name itself is the date.
"""

    raw_code = llm.invoke(prompt_text).content.strip()

    # Models often fence their answer despite rule 2; a bare replace of the
    # backticks would leave the "python" tag behind as a stray identifier.
    raw_code = _strip_code_fences(raw_code)

    # Ensure final output is assigned to pipeline
    state["query"] = _ensure_pipeline_assignment(raw_code)
    return state


In [396]:
def execute_agent(state: State) -> State:
    """Execute the generated pipeline code and run the aggregation.

    The collection named by ``state["date"]`` is used when it exists in the
    database; otherwise the agent's default collection is used.  The code in
    ``state["query"]`` is executed and must leave a list named ``pipeline`` in
    its namespace; that list is passed to ``collection.aggregate``.

    Any failure (execution error, missing or non-list ``pipeline``, aggregation
    error) is reported as the string ``"Error executing query: ..."`` in
    ``state["result"]`` rather than raised.

    Args:
        state: Graph state; ``query`` (defaults to ``""``) and ``date``
            (defaults to ``DEFAULT_COLLECTION``) are read.

    Returns:
        The same state dict with ``result`` set to the list of aggregation
        results or to an error string.
    """
    query_code = state.get("query", "")
    date_str = state.get("date", DEFAULT_COLLECTION)

    # Get the correct collection based on date
    if mongo_agent.validate_collection(date_str):
        target_collection = mongo_agent.get_collection(date_str)
    else:
        target_collection = mongo_agent.get_default_collection()

    # A single namespace serves as both globals and locals: with two separate
    # dicts, exec runs the code as if inside a class body, and names it defines
    # are not visible inside its own comprehensions or functions.
    namespace = {"collection": target_collection, "list": list}

    try:
        # SECURITY: this executes LLM-generated Python with full interpreter
        # privileges. Only run against trusted input and a trusted model;
        # sandboxing or parsing a literal pipeline would be safer.
        exec(query_code, namespace)
        pipeline = namespace.get("pipeline")
        if not isinstance(pipeline, list):
            # Falling back to an empty pipeline would return every document in
            # the collection, so treat a missing pipeline as an error instead.
            raise ValueError("generated code did not define a list named 'pipeline'")
        state["result"] = list(target_collection.aggregate(pipeline))
    except Exception as e:
        state["result"] = f"Error executing query: {e}"

    return state

In [397]:


# Build the workflow graph over the shared State schema.
graph = StateGraph(State)

# Node name -> callable, listed in the order the nodes run.
_NODE_SEQUENCE = [
    ("date_agent", date_agent),
    ("check_if_formula_matches", check_if_formula_matches),
    ("generate_query_from_formula", generate_query_from_formula),
    ("execute_agent", execute_agent),
]
for _node_name, _node_fn in _NODE_SEQUENCE:
    graph.add_node(_node_name, _node_fn)

# --- Entry ---
# Link each step to its successor, bracketed by START and END.
_route = [START, *(_name for _name, _ in _NODE_SEQUENCE), END]
for _src, _dst in zip(_route, _route[1:]):
    graph.add_edge(_src, _dst)

# --- Compile ---
app = graph.compile()


In [398]:
# Run the compiled graph for one sample question and print selected state keys.
result = app.invoke({"user_question": "Share the total count of of each message"})

# (label printed, state key shown) pairs, in output order.
_REPORT_FIELDS = (
    ("Final Result:", "result"),
    ("date", "date"),
    ("formula_include", "formula_list_included"),
    ("actual_used", "query"),
)
for _label, _key in _REPORT_FIELDS:
    print(_label, result[_key])

Final Result: []
date 2025-05-13
formula_include [{'get_message_count_all_messages': [{'$unwind': '$events'}, {'$group': {'_id': '$events.type', 'count': {'$sum': 1}}}, {'$group': {'_id': 0, 'counts': {'$push': {'k': '$_id', 'v': '$count'}}}}, {'$replaceRoot': {'newRoot': {'$arrayToObject': '$counts'}}}]}]
actual_used pipeline = [
    {'$unwind': '$messages'},
    {'$group': {'_id': '$messages.type', 'count': {'$sum': 1}}},
    {'$group': {'_id': 0, 'counts': {'$push': {'k': '$_id', 'v': '$count'}}}},
    {'$replaceRoot': {'newRoot': {'$arrayToObject': '$counts'}}}
]
