In [2]:
# LangGraph Prompt Optimization System
# This notebook creates a 4-agent system for prompt optimization using LangGraph

# Installation and Setup
%pip install --upgrade --quiet langgraph langchain-google-vertexai google-cloud-aiplatform[evaluation] pandas

import os
import sys
import json
import pandas as pd
from typing import Dict, List, Any, TypedDict, Annotated
from IPython.display import display, Markdown

# Google Cloud Setup
if "google.colab" in sys.modules:
    from google.colab import auth
    auth.authenticate_user()

PROJECT_IDS = !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_IDS[0]

if not PROJECT_ID:
    PROJECT_ID = str(os.environ.get("GOOGLE_CLOUD_PROJECT"))

LOCATION = "us-central1"
EXPERIMENT_NAME = "prompt-optimization-experiment"

os.environ["GOOGLE_CLOUD_PROJECT"] = PROJECT_ID
os.environ["GOOGLE_CLOUD_LOCATION"] = LOCATION
os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = "TRUE"

print(f"Project ID: {PROJECT_ID}")
print(f"Location: {LOCATION}")

# LangGraph and LangChain imports
from langgraph.graph import StateGraph, END
from langgraph.prebuilt import ToolNode
from langchain_google_vertexai import ChatVertexAI
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
import vertexai
from vertexai.evaluation import EvalTask, PointwiseMetric, PointwiseMetricPromptTemplate

# Point the Vertex AI SDK at the project/region resolved above.
vertexai.init(location=LOCATION, project=PROJECT_ID)

# Single chat model instance; the agent functions defined later in this
# notebook call it through the module-level name `llm`.
llm = ChatVertexAI(
    max_tokens=2048,
    temperature=0.7,
    model_name="gemini-2.0-flash-001",
)

print("✅ Setup completed successfully!")


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Project ID: my-project-0004-346516
Location: us-central1
✅ Setup completed successfully!


In [9]:

# =============================================================================
# STATE DEFINITION
# =============================================================================

# Define the state that will be passed between agents
class PromptOptimizationState(TypedDict):
    """Shared state dictionary threaded through every node of the LangGraph workflow.

    Each agent function in this notebook receives this mapping, updates the
    keys it owns, and returns it.
    """

    # Prompt supplied by the user; read by agent 1 and written to the final report.
    original_prompt: str
    # Prompt under test; executed by agent 2 and replaced by agent 4 on an approved enhancement.
    current_prompt: str
    # Test cases produced by agent 1; each item has "input" and "expected_output" keys.
    test_dataset: List[Dict[str, str]]
    # One record per test case from agent 2 (test_case_id, input, expected_output,
    # actual_output, prompt_used).
    agent2_results: List[Dict[str, Any]]
    # Written by agent 3 with keys "summary", "total_cases", "success_cases", "detailed_results".
    evaluation_results: Dict[str, Any]
    # Free-text improvement suggestions produced by agent 3.
    enhancement_recommendations: str
    # Set by agent 4; read by the conditional edge to decide whether to loop again.
    enhancement_makes_sense: bool
    # Set by agent 4 when it finalizes instead of enhancing.
    final_prompt: str
    # Current optimization round; incremented by agent 4 on each approved enhancement.
    iteration: int
    # Upper bound on optimization rounds.
    max_iterations: int

print("✅ State definition completed!")

# =============================================================================
# AGENT 1: DATASET GENERATOR
# =============================================================================

def _strip_code_fence(text: str) -> str:
    """Remove a surrounding Markdown code fence from an LLM reply, if present.

    Accepts an opening ``` or ```json marker. The trailing ``` is removed only
    when it is actually there, so an unterminated fence no longer costs the
    last three characters of the JSON payload.
    """
    cleaned = text.strip()
    if not cleaned.startswith("```"):
        return cleaned
    cleaned = cleaned[3:]
    if cleaned[:4].lower() == "json":
        cleaned = cleaned[4:]
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    return cleaned.strip()


def _save_test_dataset(test_dataset: List[Dict[str, str]], directory: str = "datasets") -> str:
    """Write the dataset as JSON to a timestamped, non-clobbering file and return its path."""
    import datetime

    os.makedirs(directory, exist_ok=True)

    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    base_name = f"{directory}/test_dataset_{timestamp}"
    filename = f"{base_name}.json"

    # Two runs within the same second share a timestamp; add a version suffix.
    counter = 1
    while os.path.exists(filename):
        filename = f"{base_name}_v{counter}.json"
        counter += 1

    with open(filename, "w") as f:
        json.dump(test_dataset, f, indent=2)
    return filename


def _shorten(value: Any, limit: int = 100) -> str:
    """Return ``value`` as text, truncated to ``limit`` characters plus "..." when longer."""
    text = str(value)
    return text if len(text) <= limit else text[:limit] + "..."


def agent1_dataset_generator(state: PromptOptimizationState) -> PromptOptimizationState:
    """
    Agent 1: Generates a dataset of input-output pairs for testing the prompt

    Asks the LLM for 12 test cases as a JSON array, validates the structure and
    stores the result under ``state["test_dataset"]``. If the reply cannot be
    parsed or validated, a placeholder dataset of 12 generic cases is stored
    instead. Saving the dataset to disk and printing the preview are
    best-effort: a failure there is reported but never replaces a valid dataset.

    Args:
        state: Workflow state; ``state["original_prompt"]`` is read.

    Returns:
        The same state object with ``test_dataset`` populated.
    """
    print("🤖 Agent 1: Generating test dataset...")

    original_prompt = state["original_prompt"]

    dataset_generation_prompt = f"""
    You are a dataset generation expert. Given the following prompt, generate 12 diverse input-output pairs that would be good for testing this prompt.
    
    Original Prompt: {original_prompt}
    
    Generate 12 test cases with varied inputs that would help evaluate how well this prompt performs. 
    Each test case should have:
    - input: A realistic input scenario
    - expected_output: What a good response should look like
    
    Make the test cases diverse to cover different scenarios, edge cases, and complexity levels.
    
    Return ONLY a valid JSON array in this format:
    [
        {{"input": "test input 1", "expected_output": "expected response 1"}},
        {{"input": "test input 2", "expected_output": "expected response 2"}},
        ...
    ]
    """

    response = llm.invoke([HumanMessage(content=dataset_generation_prompt)])

    # Step 1: parse and validate. Only failures here trigger the placeholder dataset.
    try:
        test_dataset = json.loads(_strip_code_fence(response.content))

        if not isinstance(test_dataset, list) or len(test_dataset) != 12:
            raise ValueError("Dataset should be a list of 12 items")

        for item in test_dataset:
            if not isinstance(item, dict) or "input" not in item or "expected_output" not in item:
                raise ValueError("Each item should have 'input' and 'expected_output' keys")
    except Exception as e:
        print(f"❌ Error generating dataset: {e}")
        # Fallback: create a simple dataset
        state["test_dataset"] = [
            {"input": f"Test input {i+1}", "expected_output": f"Expected output {i+1}"}
            for i in range(12)
        ]
        return state

    state["test_dataset"] = test_dataset
    print(f"✅ Generated {len(test_dataset)} test cases")

    # Step 2: persist to disk. An I/O problem must not discard the parsed dataset.
    try:
        filename = _save_test_dataset(test_dataset)
        print(f"💾 Dataset saved to: {filename}")
    except OSError as e:
        print(f"⚠️ Could not save dataset: {e}")

    # Step 3: show a short preview of the first few cases.
    print("\n📋 Sample test cases:")
    for i, case in enumerate(test_dataset[:3]):
        print(f"Case {i+1}:")
        print(f"  Input: {_shorten(case['input'])}")
        print(f"  Expected: {_shorten(case['expected_output'])}")
        print()

    return state

# =============================================================================
# AGENT 2: PROMPT EXECUTOR
# =============================================================================

def agent2_prompt_executor(state: PromptOptimizationState) -> PromptOptimizationState:
    """
    Agent 2: Executes the current prompt against all test cases and collects results

    For every item in ``state["test_dataset"]`` the current prompt is combined
    with the test input and sent to the LLM. A failed call does not abort the
    run: its ``actual_output`` is recorded as ``"ERROR: <message>"``.

    Args:
        state: Workflow state; ``current_prompt`` and ``test_dataset`` are read.

    Returns:
        The same state object with ``agent2_results`` set to one record per
        test case (test_case_id, input, expected_output, actual_output, prompt_used).
    """
    print("🤖 Agent 2: Executing prompt against test dataset...")

    current_prompt = state["current_prompt"]
    test_dataset = state["test_dataset"]
    total_cases = len(test_dataset)  # report progress against the real dataset size
    results = []

    for case_id, test_case in enumerate(test_dataset, start=1):
        print(f"Processing test case {case_id}/{total_cases}...", end=" ")

        # Apply the current prompt to the test input
        full_prompt = f"{current_prompt}\n\nInput: {test_case['input']}"

        try:
            response = llm.invoke([HumanMessage(content=full_prompt)])
            actual_output = response.content.strip()
            print("✅")
        except Exception as e:
            print(f"❌ Error: {e}")
            actual_output = f"ERROR: {str(e)}"

        results.append({
            "test_case_id": case_id,
            "input": test_case["input"],
            "expected_output": test_case["expected_output"],
            "actual_output": actual_output,
            "prompt_used": current_prompt
        })

    state["agent2_results"] = results
    print(f"✅ Completed execution on {len(results)} test cases")

    return state

# =============================================================================
# AGENT 3: EVALUATION ANALYZER
# =============================================================================

def agent3_evaluation_analyzer(state: PromptOptimizationState) -> PromptOptimizationState:
    """
    Agent 3: Analyzes the results using Vertex AI evaluation and provides enhancement recommendations

    Scores the executor results with a custom pointwise metric, appends a
    success rate, and asks the LLM for recommendations based on that summary
    and the first three results. If the Vertex AI evaluation raises, only the
    success rate is reported.

    Args:
        state: Workflow state; ``agent2_results`` and ``current_prompt`` are read.

    Returns:
        The same state object with ``evaluation_results`` (keys ``summary``,
        ``total_cases``, ``success_cases``, ``detailed_results``) and
        ``enhancement_recommendations`` set.
    """
    print("🤖 Agent 3: Analyzing results and generating recommendations...")

    results = state["agent2_results"]

    # Failed executions are recorded by agent 2 with an "ERROR: " prefix. Match
    # the prefix, not the substring, so a legitimate answer that merely
    # mentions "ERROR" is not counted as a failure.
    total_cases = len(results)
    success_cases = sum(1 for r in results if not r["actual_output"].startswith("ERROR:"))
    success_rate = (success_cases / total_cases) * 100 if total_cases > 0 else 0

    # Create evaluation dataset ("response" is the column name the evaluation service expects)
    eval_df = pd.DataFrame(
        [
            {
                "input": r["input"],
                "expected_output": r["expected_output"],
                "response": r["actual_output"],
            }
            for r in results
        ]
    )

    # Define custom evaluation metric for prompt quality
    prompt_quality_metric = PointwiseMetric(
        metric="prompt_quality",
        metric_prompt_template=PointwiseMetricPromptTemplate(
            criteria={
                "accuracy": "The actual output matches the expected output in terms of correctness and completeness",
                "relevance": "The actual output is relevant to the input and addresses the main points",
                "clarity": "The actual output is clear, well-structured, and easy to understand",
                "consistency": "The output style and format are consistent with expectations"
            },
            rating_rubric={
                "5": "Excellent: Meets all criteria exceptionally well",
                "4": "Good: Meets most criteria well with minor issues",
                "3": "Average: Meets some criteria but has notable gaps",
                "2": "Poor: Falls short on most criteria",
                "1": "Very Poor: Fails to meet criteria"
            },
            # The criteria compare the response with the input and the expected
            # output, so those columns must be passed to the judge as well;
            # only "response" is included by default.
            input_variables=["input", "expected_output"],
        ),
    )

    try:
        # Run evaluation
        eval_task = EvalTask(
            dataset=eval_df,
            metrics=[prompt_quality_metric],
            experiment=EXPERIMENT_NAME
        )

        eval_result = eval_task.evaluate()

        # Extract evaluation scores
        scores = []
        if hasattr(eval_result, 'summary_metrics'):
            for metric_name, metric_value in eval_result.summary_metrics.items():
                scores.append(f"{metric_name}: {metric_value}")

        evaluation_summary = "\n".join(scores) if scores else "Evaluation completed"
        evaluation_summary += f"\nSuccess Rate: {success_rate:.1f}% ({success_cases}/{total_cases})"

    except Exception as e:
        print(f"⚠️ Evaluation error: {e}")
        # Fallback evaluation: success rate only
        evaluation_summary = f"Basic Evaluation - Success Rate: {success_rate:.1f}% ({success_cases}/{total_cases})"

    # Generate enhancement recommendations using LLM
    analysis_prompt = f"""
    You are a prompt engineering expert. Analyze the following test results and provide specific recommendations to improve the prompt.
    
    Original Prompt: {state["current_prompt"]}
    
    Evaluation Summary: {evaluation_summary}
    
    Sample Results:
    {json.dumps(results[:3], indent=2)}
    
    Based on this analysis, provide specific, actionable recommendations to enhance the prompt. 
    Focus on:
    1. What patterns of errors or suboptimal responses do you see?
    2. How can the prompt be made clearer or more specific?
    3. What instructions or examples should be added?
    4. What formatting or structure improvements are needed?
    
    Provide your recommendations in a clear, structured format.
    """

    try:
        analysis_response = llm.invoke([HumanMessage(content=analysis_prompt)])
        enhancement_recommendations = analysis_response.content.strip()
    except Exception as e:
        enhancement_recommendations = f"Error generating recommendations: {e}"

    state["evaluation_results"] = {
        "summary": evaluation_summary,
        "total_cases": total_cases,
        "success_cases": success_cases,
        "detailed_results": results
    }
    state["enhancement_recommendations"] = enhancement_recommendations

    print("✅ Evaluation completed")
    print(f"📊 {evaluation_summary}")
    print(f"📝 Recommendations generated")

    return state

# =============================================================================
# AGENT 4: ENHANCEMENT VALIDATOR
# =============================================================================

def _match_section_header(line: str):
    """Return ``(section, remainder)`` if ``line`` opens a DECISION/REASONING/PROMPT section, else ``(None, "")``.

    Leading whitespace and Markdown bold markers around the header are
    tolerated (e.g. ``**DECISION:** ENHANCE``). Only the header prefix is
    removed; later occurrences of the same word in the line are kept.
    """
    stripped = line.strip()
    emphasized = stripped.startswith("*")
    candidate = stripped.lstrip("*").lstrip() if emphasized else stripped
    for section in ("DECISION", "REASONING", "PROMPT"):
        header = section + ":"
        if candidate.startswith(header):
            remainder = candidate[len(header):]
            if emphasized:
                # Drop the closing "**" of a bolded header, nothing else.
                remainder = remainder.lstrip("*")
            return section.lower(), remainder.strip()
    return None, ""


def _parse_validation_response(response_text: str, default_prompt: str):
    """Split the validator reply into ``(decision, reasoning, prompt)``.

    ``decision`` is ``None`` and ``prompt`` is ``default_prompt`` when the
    corresponding section is missing from the reply.
    """
    decision = None
    reasoning = ""
    new_prompt = default_prompt

    current_section = None
    for line in response_text.split('\n'):
        section, remainder = _match_section_header(line)
        if section == "decision":
            decision = remainder
            current_section = "decision"
        elif section == "reasoning":
            reasoning = remainder
            current_section = "reasoning"
        elif section == "prompt":
            new_prompt = remainder
            current_section = "prompt"
        elif current_section == "reasoning":
            reasoning += " " + line.strip()
        elif current_section == "prompt":
            # Keep prompt lines verbatim so their formatting survives.
            new_prompt += "\n" + line
    return decision, reasoning, new_prompt


def agent4_enhancement_validator(state: PromptOptimizationState) -> PromptOptimizationState:
    """
    Agent 4: Validates if enhancement recommendations make sense and creates final prompt

    Asks the LLM to choose between ENHANCE and FINALIZE. An enhancement is
    applied only when the LLM chose ENHANCE, the iteration budget is not
    exhausted, and fewer than 80% of the test cases succeeded. Otherwise the
    returned prompt is stored as ``final_prompt`` and a report is written to
    ``readme_prompt.md``.

    Args:
        state: Workflow state; reads ``current_prompt``,
            ``enhancement_recommendations``, ``evaluation_results``,
            ``iteration``, ``max_iterations`` and ``original_prompt``.

    Returns:
        The same state object with ``enhancement_makes_sense`` set, plus either
        an updated ``current_prompt``/``iteration`` or a ``final_prompt``.
    """
    print("🤖 Agent 4: Validating enhancements and finalizing prompt...")

    current_prompt = state["current_prompt"]
    recommendations = state["enhancement_recommendations"]
    evaluation_results = state["evaluation_results"]
    iteration = state.get("iteration", 1)
    max_iterations = state.get("max_iterations", 3)

    # Validate recommendations
    validation_prompt = f"""
    You are a prompt engineering validator. Review the current prompt and the proposed enhancement recommendations.
    
    Current Prompt: {current_prompt}
    
    Enhancement Recommendations: {recommendations}
    
    Evaluation Results: {evaluation_results.get("summary", "No summary available")}
    
    Current Iteration: {iteration}/{max_iterations}
    
    Determine if these recommendations make sense and would improve the prompt. Consider:
    1. Are the recommendations specific and actionable?
    2. Do they address real issues identified in the evaluation?
    3. Would implementing them likely improve performance?
    4. Are they reasonable and not over-complicated?
    
    Respond with:
    - "ENHANCE" if the recommendations should be implemented, followed by an improved version of the prompt
    - "FINALIZE" if the current prompt is good enough or if we've reached max iterations
    
    Format your response as:
    DECISION: [ENHANCE/FINALIZE]
    REASONING: [Your reasoning]
    PROMPT: [The final or enhanced prompt]
    """

    try:
        validation_response = llm.invoke([HumanMessage(content=validation_prompt)])
        response_text = validation_response.content.strip()

        decision, reasoning, new_prompt = _parse_validation_response(response_text, current_prompt)

        # Models often decorate the keyword ("[ENHANCE]", "**ENHANCE**",
        # "Enhance."), so normalise before comparing.
        decision_token = (decision or "").strip(" []*`\"'.").upper()
        wants_enhance = decision_token.startswith("ENHANCE")

        # Check if we should enhance or finalize
        should_enhance = (wants_enhance and
                          iteration < max_iterations and
                          evaluation_results.get("success_cases", 0) < evaluation_results.get("total_cases", 12) * 0.8)

        state["enhancement_makes_sense"] = should_enhance

        if should_enhance:
            state["current_prompt"] = new_prompt.strip()
            state["iteration"] = iteration + 1
            print(f"✅ Enhancement approved - Moving to iteration {state['iteration']}")
            print(f"💡 Reasoning: {reasoning}")
            return state

        state["final_prompt"] = new_prompt.strip()
        print("✅ Finalization approved")
        print(f"💡 Reasoning: {reasoning}")

    except Exception as e:
        print(f"❌ Error in validation: {e}")
        state["enhancement_makes_sense"] = False
        state["final_prompt"] = current_prompt
        return state

    # Save to file. This is kept outside the block above so an I/O failure
    # cannot discard the final prompt that was just parsed.
    try:
        # Explicit UTF-8: prompts may contain non-ASCII text.
        with open("readme_prompt.md", "w", encoding="utf-8") as f:
            f.write("# Final Optimized Prompt\n\n")
            f.write(f"## Original Prompt\n{state['original_prompt']}\n\n")
            f.write(f"## Final Prompt\n{state['final_prompt']}\n\n")
            f.write(f"## Optimization Summary\n")
            f.write(f"- Iterations: {iteration}\n")
            f.write(f"- Final Success Rate: {evaluation_results.get('success_cases', 0)}/{evaluation_results.get('total_cases', 12)}\n")
            f.write(f"- Last Enhancement Reasoning: {reasoning}\n\n")
            f.write(f"## Evaluation Results\n{evaluation_results.get('summary', 'No summary')}\n")

        print("💾 Final prompt saved to readme_prompt.md")
    except OSError as e:
        print(f"⚠️ Could not save readme_prompt.md: {e}")

    return state

# =============================================================================
# WORKFLOW DEFINITION
# =============================================================================

# Create the workflow graph; PromptOptimizationState is the schema shared by all nodes
workflow = StateGraph(PromptOptimizationState)

# Add nodes (agents): each node name maps to one of the agent functions defined above
workflow.add_node("dataset_generator", agent1_dataset_generator)
workflow.add_node("prompt_executor", agent2_prompt_executor)
workflow.add_node("evaluation_analyzer", agent3_evaluation_analyzer)
workflow.add_node("enhancement_validator", agent4_enhancement_validator)

# Define the flow: the dataset is generated once, at the start of the run
workflow.set_entry_point("dataset_generator")

# Fixed edges: generator -> executor -> analyzer -> validator.
# The executor -> analyzer -> validator part is also followed on every later
# iteration, because the conditional edge below re-enters at "prompt_executor".
workflow.add_edge("dataset_generator", "prompt_executor")
workflow.add_edge("prompt_executor", "evaluation_analyzer") 
workflow.add_edge("evaluation_analyzer", "enhancement_validator")

# Conditional flow after validation
def should_continue(state: PromptOptimizationState) -> str:
    """Decide whether to continue optimization or end.

    The validator node only sets ``enhancement_makes_sense`` to True after
    checking ``iteration < max_iterations``, and it increments ``iteration``
    before this function runs. Re-checking the budget here against the
    incremented counter ended the run one round early: the last approved
    prompt was never executed, ``final_prompt`` stayed empty and no report was
    written. The flag alone is therefore the routing signal.

    Args:
        state: Workflow state as returned by the validator node.

    Returns:
        ``"prompt_executor"`` to run another iteration, otherwise ``END``.
    """
    if state.get("enhancement_makes_sense", False):
        return "prompt_executor"  # Continue with another iteration
    return END  # Finalize

# After the validator runs, should_continue picks the next step. The mapping
# translates its return value into the destination node (or END).
workflow.add_conditional_edges(
    "enhancement_validator",
    should_continue,
    {
        "prompt_executor": "prompt_executor",
        END: END
    }
)

# Compile the graph into the runnable `app` that is invoked later in the notebook
app = workflow.compile()

print("✅ LangGraph workflow compiled successfully!")



✅ State definition completed!
✅ LangGraph workflow compiled successfully!


In [10]:
# Example usage
# NOTE(review): this first assignment is overwritten by the second
# `user_prompt` assignment below, so it has no effect when the cell runs.
user_prompt = """
You are a helpful assistant that writes creative product descriptions for e-commerce. 
Given a product name and basic features, write an engaging product description that highlights key benefits and appeals to customers.
"""

# Medical-care relevance prompt. This is a plain (non-f) string, so the
# {treatments}, {icd_surgery_description} and {exclusions} placeholders are
# kept as literal text rather than being interpolated here.
user_prompt = '''## Context:
    Evaluate the relevance and necessity of all the provided medical care in relation to each item in the ICD and surgery descriptions list using your professional medical knowledge.
   
    ## Instruction:
    For each item in the ICD and surgery descriptions list, determine:
    If the medical care is necessary for diagnostic or examination purposes.
    If the medical care is effective for the disease.
    If the benefits of medical care outweigh any risks.
    If the medical care is a standard practice for the diagnosis.
    If the medical care is essential for the disease and not for cosmetic/lifestyle purposes.
    If the medical care indirectly treats the disease.
    Medical care that is follow-up visits, repeat visits, repeat consultation, total amount, GST information, subsidies or discounts all conclude as 'yes'. Especially for GST, their descriptions may come in forms such as 'GST - ADD GST', 'GST - LESS GST'; for any descriptions that resemble these, conclude as 'yes'.
    Considering the interactions between drugs, some drugs and treatments may not be designed for the patient's diagnoses and surgeries, but rather to counteract the side effects caused by other drugs. this scenario also needs to conclude as 'yes'
    Analyse the exclusion details and verify if the treatment is relevant to the provided exclusion. if the treatment is relevant to the exclusion, conclude as 'no'. 'explanation' should be provided as the treatment is relevant to the exclusion list.
    Conversely, consider the treatment medically unnecessary if it is ineffective, has safer alternatives, is discouraged by guidelines, or if risks outweigh benefits.
   
    ## Input: Given Medical Care: {treatments}.
    ## ICD and Surgery Description list: {icd_surgery_description}.
    ## Exclusion details: {exclusions}.
   
    ### Question: are all the given medical care at least a relevant testing or treatment for one of the items in the ICD and surgery descriptions list.
   
    ### Conclusion:
    1. Explain the given result as a doctor.
    2. Given different patient profiles and medical histories, provide a probability score without an explanation based on your analysis for the Yes/No binary output you provided to indicate the likelihood of this claim getting approved.
   
    ### Response: MUST Provide the output in json format with a key "conclusion" stating yes/no of the relevance, a key "probability score" providing a probability score (0-1 scale) to indicate the likelihood of this claim getting approved with 2 decimal places, and another key "explanation" stating the explanation as a doctor. Ensure that the conclusion is "yes" only if the probability score is at least 0.5. Ensure that the explanation is given in complete sentences without any truncations.
    '''

In [11]:
# =============================================================================
# EXECUTION EXAMPLE
# =============================================================================

# Example usage
# NOTE: this assignment replaces any `user_prompt` defined in an earlier cell.
user_prompt = """
You are a helpful assistant that writes creative product descriptions for e-commerce. 
Given a product name and basic features, write an engaging product description that highlights key benefits and appeals to customers.
"""

print("🚀 Starting Prompt Optimization Process...")
print(f"📝 Original Prompt: {user_prompt}")
print("="*80)

# Initialize state: every key of PromptOptimizationState gets a starting value
initial_state = {
    "original_prompt": user_prompt,
    "current_prompt": user_prompt,
    "test_dataset": [],
    "agent2_results": [],
    "evaluation_results": {},
    "enhancement_recommendations": "",
    "enhancement_makes_sense": False,
    "final_prompt": "",
    "iteration": 1,
    "max_iterations": 3
}

# Run the workflow
try:
    final_state = app.invoke(initial_state)

    print("\n" + "="*80)
    print("🎉 OPTIMIZATION COMPLETE!")
    print("="*80)

    evaluation_results = final_state.get('evaluation_results', {})

    print(f"📊 Final Results:")
    print(f"   - Total Iterations: {final_state.get('iteration', 1)}")
    print(f"   - Test Cases: {len(final_state.get('test_dataset', []))}")
    print(f"   - Success Rate: {evaluation_results.get('success_cases', 0)}/{evaluation_results.get('total_cases', 0)}")

    print(f"\n📝 Original Prompt:")
    print(f"   {final_state['original_prompt']}")

    # "final_prompt" is always present (initialised to ""), so a .get() default
    # would never apply. Use `or` to fall back to the current prompt when the
    # run ended without finalizing.
    final_prompt = final_state.get('final_prompt') or final_state['current_prompt']
    print(f"\n🎯 Final Prompt:")
    print(f"   {final_prompt}")

    if os.path.exists("readme_prompt.md"):
        print(f"\n💾 Detailed results saved to: readme_prompt.md")

except Exception as e:
    print(f"❌ Error during execution: {e}")
    import traceback
    traceback.print_exc()

print("\n✅ Process completed!")

🚀 Starting Prompt Optimization Process...
📝 Original Prompt: ## Context:
    Evaluate the relevance and necessity of all the provided medical care in relation to each item in the ICD and surgery descriptions list using your professional medical knowledge.
   
    ## Instruction:
    For each item in the ICD and surgery descriptions list, determine:
    If the medical care is necessary for diagnostic or examination purposes.
    If the medical care is effective for the disease.
    If the benefits of medical care outweigh any risks.
    If the medical care is a standard practice for the diagnosis.
    If the medical care is essential for the disease and not for cosmetic/lifestyle purposes.
    If the medical care indirectly treats the disease.
    Medical care that is follow-up visits, repeat visits, repeat consultation, total amount, GST information, subsidies or discounts all conclude as 'yes'. Especially for GST, their descriptions may come in forms such as 'GST - ADD GST', 'GST - LE

Associating projects/255766800726/locations/us-central1/metadataStores/default/contexts/prompt-optimization-experiment-346d7abe-63cd-44b2-8526-8e7d03a920dd to Experiment: prompt-optimization-experiment


Computing metrics with a total of 12 Vertex Gen AI Evaluation Service API requests.


100%|██████████| 12/12 [00:01<00:00,  6.02it/s]

All 12 metric requests are successfully computed.
Evaluation Took:2.000043304869905 seconds





✅ Evaluation completed
📊 row_count: 12
prompt_quality/mean: 4.916666666666667
prompt_quality/std: 0.2886751345948129
Success Rate: 100.0% (12/12)
📝 Recommendations generated
🤖 Agent 4: Validating enhancements and finalizing prompt...
✅ Finalization approved
💡 Reasoning: The recommendations are excellent, specific, and actionable. They address the identified issues of minor score discrepancies, reliance on implicit knowledge, and ambiguity in defining relevance. The proposed enhancements, particularly the inclusion of examples and more explicit instructions on handling partial relevance and conflicting information, are likely to improve the prompt's robustness and consistency. The formatting and structural improvements will also enhance readability and clarity. The recommendations are reasonable and not overly complicated, making them a valuable addition to the prompt engineering process. 
💾 Final prompt saved to readme_prompt.md

🎉 OPTIMIZATION COMPLETE!
📊 Final Results:
   - Total Ite