# 🕸️ Week 07-08 · Notebook 11 · LangGraph Introduction

Model maintenance workflows as state graphs using LangGraph to prepare for multi-tool agents.

## 🎯 Learning Objectives
- Understand LangGraph node/edge fundamentals.
- Build a base state graph for intake → diagnosis → action.
- Persist state transitions for audits.
- Simulate failure handling and human-in-the-loop interventions.

## 🧩 Scenario
Arvind Manufacturing wants a deterministic flow before promoting autonomous agents: intake the incident, retrieve context, analyze it, and then either respond to the technician or escalate to a human when confidence is low.

In [None]:
from langgraph.graph import StateGraph, END
from typing import TypedDict, List, Dict, Any
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.output_parsers import JsonOutputParser
from pydantic import BaseModel, Field
import json
from datetime import datetime

# --- 1. State Definition ---
# Define the schema for our graph's state. This is the data that flows between nodes.
class MaintenanceState(TypedDict, total=False):
    """State dictionary that flows between the nodes of the maintenance graph.

    Declared with ``total=False`` because the state is built up incrementally:
    the graph is invoked with only ``incident`` and ``operator_id``, and each
    node returns just the keys it updates. Marking every key as required would
    make those partial dictionaries fail static type checking.
    """

    incident: str  # free-text incident report supplied by the caller
    context: List[str]  # text of the knowledge-base documents retrieved for the incident
    analysis: Dict[str, Any]  # structured analysis (reasoning, confidence, recommended_action)
    response: str  # final message: either the technician response or the escalation notice
    operator_id: str  # supplied by the caller on invoke
    logs: List[Dict[str, Any]]  # audit trail; one entry appended per executed node

# --- 2. Pydantic Model for Structured Output ---
class AnalysisResult(BaseModel):
    """Schema the LLM is asked to follow when analysing an incident.

    The ``confidence`` bounds are part of the schema so that the format
    instructions derived from this model state the valid range explicitly,
    instead of relying on the free-text description alone.
    """

    reasoning: str = Field(description="Detailed reasoning for the diagnosis.")
    confidence: float = Field(
        ge=0.0,
        le=1.0,
        description="Confidence score (0.0 to 1.0) in the reasoning.",
    )
    recommended_action: str = Field(description="The next step to take.")

# --- 3. Workflow Class ---
# Encapsulate the graph logic in a class for better organization.
class MaintenanceWorkflow:
    """Deterministic maintenance-incident workflow modelled as a LangGraph state graph.

    The flow is ``intake -> retrieve -> analyze`` followed by a conditional hop
    to either ``respond`` or ``escalate``. Each node returns a *partial* state
    update (only the keys it changes) together with a fresh copy of the audit
    log extended by one entry.

    Args:
        confidence_threshold: Analyses whose confidence is strictly below this
            value are routed to ``escalate``; everything else goes to
            ``respond``. Defaults to 0.8.
    """

    DEFAULT_CONFIDENCE_THRESHOLD = 0.8

    def __init__(self, confidence_threshold: float = DEFAULT_CONFIDENCE_THRESHOLD):
        self.confidence_threshold = confidence_threshold
        self.llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
        self.embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

        # Simulate a knowledge base
        self.vectorstore = Chroma.from_texts([
            "SOP-122: Spindle bearing maintenance requires lockout/tagout.",
            "SOP-187: Vibration thresholds for punch press: warning at 7mm/s, critical at 12mm/s.",
            "HIST-341: Similar vibration on Press 14 was resolved by adjusting mounting bolts.",
            "SAFETY-ALERT: Any vibration over 10mm/s requires immediate shutdown."
        ], self.embeddings)
        self.retriever = self.vectorstore.as_retriever()

        self.graph = self._build_graph()

    # --- Helpers ---
    def _log_step(self, state: MaintenanceState, step_name: str, comment: str) -> List[Dict]:
        """Return a new log list: the state's existing entries plus one for this step.

        The incoming list is copied rather than appended to, so a node never
        mutates state it was handed (the caller's input dict included).
        """
        logs = list(state.get("logs") or [])
        logs.append({
            "timestamp": datetime.now().isoformat(),
            "step": step_name,
            "comment": comment
        })
        return logs

    @staticmethod
    def _confidence_of(analysis: Any) -> float:
        """Extract the confidence score from a parsed analysis as a float.

        LLM output is not guaranteed to follow the schema: the key may be
        missing or the value may be a string. Anything unusable yields 0.0,
        which is below any sensible threshold and therefore fails safe by
        routing the incident to a human.
        """
        if not isinstance(analysis, dict):
            return 0.0
        try:
            value = float(analysis.get("confidence"))
        except (TypeError, ValueError):
            return 0.0
        # NaN compares False against everything and would slip past the
        # "< threshold" check; treat it as unusable as well.
        if value != value:
            return 0.0
        return value

    # --- Node Definitions ---
    def intake(self, state: MaintenanceState) -> Dict[str, Any]:
        """Record the incoming incident in the audit log."""
        incident = state["incident"]
        logs = self._log_step(state, "intake", f"Received incident: '{incident}'")
        return {"incident": incident, "logs": logs}

    def retrieve(self, state: MaintenanceState) -> Dict[str, Any]:
        """Fetch knowledge-base passages relevant to the incident."""
        incident = state["incident"]
        # `invoke` is the Runnable entry point; `get_relevant_documents` is deprecated.
        documents = self.retriever.invoke(incident)
        context = [doc.page_content for doc in documents]
        logs = self._log_step(state, "retrieve", f"Retrieved {len(context)} documents.")
        return {"context": context, "logs": logs}

    def analyze(self, state: MaintenanceState) -> Dict[str, Any]:
        """Ask the LLM for a structured diagnosis (reasoning, confidence, action)."""
        parser = JsonOutputParser(pydantic_object=AnalysisResult)
        prompt = ChatPromptTemplate.from_template(
            """Analyze this maintenance incident with the provided context.
            Provide your reasoning, a confidence score, and a recommended action.
            {format_instructions}
            Incident: {incident}
            Context: {context}"""
        )
        chain = prompt | self.llm | parser
        analysis = chain.invoke({
            "incident": state["incident"],
            "context": "\n".join(state["context"]),
            "format_instructions": parser.get_format_instructions()
        })
        confidence = self._confidence_of(analysis)
        logs = self._log_step(state, "analyze", f"Analysis complete. Confidence: {confidence:.2f}")
        return {"analysis": analysis, "logs": logs}

    def respond(self, state: MaintenanceState) -> Dict[str, Any]:
        """Turn the analysis into a technician-facing response."""
        prompt = ChatPromptTemplate.from_template(
            """Based on your analysis, provide a response to the maintenance technician.
            Include specific steps referencing relevant SOPs.
            Your analysis: {analysis}"""
        )
        chain = prompt | self.llm
        response = chain.invoke({"analysis": state["analysis"]}).content
        logs = self._log_step(state, "respond", "Generated response for technician.")
        return {"response": response, "logs": logs}

    def escalate(self, state: MaintenanceState) -> Dict[str, Any]:
        """Produce the escalation notice for low-confidence analyses.

        This node only writes the notice into the state; it does not contact
        anyone itself.
        """
        analysis = state["analysis"]
        # Malformed analyses are routed here too, so do not assume the key exists.
        if isinstance(analysis, dict):
            reasoning = analysis.get("reasoning", "no reasoning provided")
        else:
            reasoning = "no reasoning provided"
        escalation_response = (
            "⚠️ This incident requires human expertise. "
            f"Reasoning: {reasoning}. "
            "I've escalated this to the maintenance supervisor."
        )
        logs = self._log_step(state, "escalate", "Confidence below threshold. Escalated to supervisor.")
        return {"response": escalation_response, "logs": logs}

    # --- Edge Logic ---
    def route_based_on_confidence(self, state: MaintenanceState) -> str:
        """Return the name of the next node: ``"escalate"`` below the threshold, else ``"respond"``."""
        if self._confidence_of(state["analysis"]) < self.confidence_threshold:
            return "escalate"
        return "respond"

    # --- Graph Construction ---
    def _build_graph(self) -> Any:
        """Wire nodes and edges together and return the compiled, invokable graph."""
        graph = StateGraph(MaintenanceState)
        graph.add_node("intake", self.intake)
        graph.add_node("retrieve", self.retrieve)
        graph.add_node("analyze", self.analyze)
        graph.add_node("respond", self.respond)
        graph.add_node("escalate", self.escalate)

        graph.set_entry_point("intake")
        graph.add_edge("intake", "retrieve")
        graph.add_edge("retrieve", "analyze")
        # The StateGraph method is `add_conditional_edges` (plural).
        graph.add_conditional_edges("analyze", self.route_based_on_confidence, {
            "respond": "respond",
            "escalate": "escalate"
        })
        graph.add_edge("respond", END)
        graph.add_edge("escalate", END)

        return graph.compile()

# --- 4. Execution ---
workflow = MaintenanceWorkflow()
maintenance_flow = workflow.graph


def _show_run(banner, outcome):
    """Print the final response of one graph run followed by its step-by-step log."""
    print(banner)
    print(f"Final Response: {outcome['response']}\n")
    print("Process Log:")
    for entry in outcome["logs"]:
        print(f"- {entry['step']}: {entry['comment']}")


# Scenario A: a measured vibration reading that the knowledge base covers directly
high_confidence_incident = dict(
    incident="Punch Press 14 has excessive vibration after bearing replacement. Measured at 8.3mm/s.",
    operator_id="TECH-789",
)
result_high = maintenance_flow.invoke(high_confidence_incident)
_show_run("--- High Confidence Run ---", result_high)

# Scenario B: a vague report that is expected to end in escalation
low_confidence_incident = dict(
    incident="There's a strange smell near the main hydraulic pump and a puddle on the floor. Unsure if it's oil or water.",
    operator_id="TECH-123",
)
result_low = maintenance_flow.invoke(low_confidence_incident)
_show_run("\n--- Low Confidence Run ---", result_low)


### 🚨 Escalation Edge
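Our implementation includes a conditional edge: if `confidence < 0.8`, the graph routes to the `escalate` node instead of `respond`. In this notebook `escalate` only writes an escalation notice into the state (no real notification is sent), but it marks the point where a maintenance supervisor takes over, so low-confidence diagnoses are not acted on without human review.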

The key elements in our LangGraph implementation:

1. **State Definition**: Using `TypedDict` to define what our maintenance workflow keeps track of
2. **Nodes**: Core functions that process the state (intake → retrieve → analyze → respond/escalate)
3. **Conditional Edges**: Logic to determine the flow based on confidence scores
4. **Persistence**: Logging every step to maintain an audit trail

Let's run the workflow once more, inspect the confidence score, and save the process log to disk:

In [None]:
# Run the graph with a sample incident and persist its audit trail
result = maintenance_flow.invoke({
    "incident": "Punch Press 14 has excessive vibration after bearing replacement. Measured at 8.3mm/s.",
    "operator_id": "TECH-789",
    "logs": []
})

print(f"Response: {result['response']}\n")
# The confidence score is stored inside the structured analysis; the state
# has no top-level "confidence" key, so indexing it directly raises KeyError.
print(f"Confidence: {result['analysis']['confidence']}")
print("\nProcess Log:")
for log_entry in result["logs"]:
    # Timestamps are ISO-8601 strings; keep only the HH:MM:SS part for display.
    print(f"- {log_entry['step']} at {log_entry['timestamp'].split('T')[1][:8]}")

# Save the process log
with open("maintenance_flow_log.json", "w", encoding="utf-8") as f:
    json.dump(result["logs"], f, indent=2)

## 🧪 Lab Assignment
1. **Persist State Transitions**: Extend the state logging mechanism to write to a structured JSON log file with these exact fields:
   ```python
   # Template for one audit record; node_name, state_snapshot_before,
   # state_snapshot_after and execution_time are placeholders you must supply.
   log_entry = {
       "timestamp": datetime.now().isoformat(),
       "step": node_name,
       "state_before": state_snapshot_before,
       "state_after": state_snapshot_after,
       "operator_id": state.get("operator_id", "system"),  # falls back to "system" when the state has no operator_id
       "duration_ms": execution_time  # presumably elapsed time in milliseconds, per the key name; confirm where you measure it
   }
   ```

2. **Add Tool Health Check Node**: Implement a `verify_tools` node that runs before intake:
   ```python
   def verify_tools(state: MaintenanceState) -> MaintenanceState:
       """Verify all required tools are available and functioning.

       LangGraph has no ``@graph.node()`` decorator. Register this function on
       the ``StateGraph`` builder with ``add_node("verify_tools", verify_tools)``,
       make it the entry point, and add an edge from ``"verify_tools"`` to
       ``"intake"``.

       Raises:
           RuntimeError: if a critical tool is not healthy.
       """
       critical_tools = {"sop_retriever"}
       tools = [
           {"name": "database_connector", "status": "healthy"},
           {"name": "sop_retriever", "status": "healthy"},
           {"name": "notification_service", "status": "degraded"}
       ]

       # Check if any critical tools are unhealthy
       unhealthy = [t for t in tools if t["status"] != "healthy"]
       for tool in unhealthy:
           if tool["name"] in critical_tools:
               raise RuntimeError(f"Critical tool unavailable: {tool['name']}")

       # Copy the log list instead of appending in place, so the caller's
       # input state is not mutated.
       logs = list(state.get("logs") or [])
       logs.append({
           "timestamp": datetime.now().isoformat(),
           "step": "verify_tools",
           "tools_checked": len(tools),
           "degraded_tools": [t["name"] for t in unhealthy]
       })

       # NOTE(review): "tools_status" is not declared in MaintenanceState; add it
       # to the schema if later nodes need it, and confirm how your LangGraph
       # version treats undeclared keys.
       return {**state, "logs": logs, "tools_status": tools}
   ```

3. **Simulate Failure Recovery**: Add exception handling to the `retrieve` node that returns fallback information when the retrieval fails:
   ```python
   def retrieve(state: MaintenanceState) -> MaintenanceState:
       """Retrieve context for the incident, falling back to a canned hint on failure.

       LangGraph has no ``@graph.node()`` decorator; register the function on the
       ``StateGraph`` builder with ``add_node("retrieve", retrieve)``, or adapt
       it as a method of your workflow class.
       """
       import random  # local import: the notebook's import cell does not bring in `random`

       try:
           # Original retrieval logic
           # ...

           # Simulate failure (25% of the time)
           if random.random() < 0.25:
               raise TimeoutError("Knowledge base connection timeout")

           # Return normal state
           return {...}

       # Deliberately broad: any retrieval failure should degrade to the fallback
       # rather than abort the workflow. The error is recorded in the log below.
       except Exception as e:
           # Copy the log list instead of appending in place, so the caller's
           # input state is not mutated.
           logs = list(state.get("logs") or [])
           logs.append({
               "timestamp": datetime.now().isoformat(),
               "step": "retrieve",
               "error": str(e),
               "using_fallback": True
           })

           # Return fallback context
           return {
               **state,
               "context": ["FALLBACK: Standard maintenance protocols apply. Refer to printed manual."],
               "logs": logs
           }
   ```

4. **Graph Visualization**: Generate a visual representation of your LangGraph:
   ```python
   # Visualization code example
   import networkx as nx
   import matplotlib.pyplot as plt
   
   def visualize_graph(graph):
       """Draw the workflow topology with networkx and save it as maintenance_graph.png.

       Pass the compiled LangGraph (``maintenance_flow`` in this notebook). When
       the object exposes ``get_graph()``, nodes and edges are read from it so
       the picture shows what was actually built. Otherwise the lab's target
       topology is drawn as a fallback.
       """
       get_graph = getattr(graph, "get_graph", None)
       if callable(get_graph):
           drawable = get_graph()
           nodes = list(drawable.nodes)
           edges = [(edge.source, edge.target) for edge in drawable.edges]
       else:
           nodes = ["intake", "verify_tools", "retrieve", "analyze", "respond", "escalate"]
           edges = [
               ("verify_tools", "intake"),
               ("intake", "retrieve"),
               ("retrieve", "analyze"),
               ("analyze", "respond"),
               ("analyze", "escalate"),
           ]

       G = nx.DiGraph()
       G.add_nodes_from(nodes)
       G.add_edges_from(edges)

       # Draw the graph (fixed seed so the layout is reproducible between runs)
       plt.figure(figsize=(10, 6))
       pos = nx.spring_layout(G, seed=42)
       nx.draw(G, pos, with_labels=True, node_color="lightblue",
               node_size=2000, arrows=True, arrowsize=20)

       plt.title("Maintenance Workflow Graph")
       plt.savefig("maintenance_graph.png")
       plt.show()

   # `maintenance_flow` is the compiled graph created in the execution cell.
   visualize_graph(maintenance_flow)
   ```

## ✅ Checklist
- [ ] Base LangGraph implemented with all required nodes (intake, retrieve, analyze, respond/escalate)
- [ ] Conditional routing properly handles confidence thresholds (< 0.8 triggers escalation)
- [ ] State logs capture all transitions with timestamp, step name, and detailed data
- [ ] Tool verification node checks connectivity before workflow starts
- [ ] Error handling provides graceful recovery with appropriate fallback responses
- [ ] Graph visualization created showing all nodes and decision paths
- [ ] Log persistence implemented with structured JSON format

## 📚 References
- LangGraph Documentation
- Week 05 Governance Logging
- Incident Response SOP