In [26]:
from pathlib import Path
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from langgraph.graph import MessagesState, StateGraph
from langchain_core.tools import tool
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage, ToolMessage
from langgraph.prebuilt import ToolNode
from langgraph.graph import END
from langgraph.prebuilt import tools_condition
from langchain_core.documents import Document
from langchain.chat_models import init_chat_model
import time
import io
from contextlib import redirect_stdout
from datetime import datetime
from testing.response_eval_tools import set_api_key_from_path, initialize_evaluation_llm, evaluate_model_outputs_from_paths
from pathlib import Path
import json
from typing import List, Dict, Any
import uuid


set_api_key_from_path(Path('./testing/gemini-key'))

################# [ Validate GPU Available ] #################
import torch

# assert(torch.cuda.is_available())

##############################################################

chroma_path = "./test_db"
collection_name = 'multi_proceedings_test'
embedding_model_name = 'all-MiniLM-L6-v2'

client = chromadb.PersistentClient(path=chroma_path)
embedding_function = SentenceTransformerEmbeddingFunction(model_name=embedding_model_name)
collection = client.get_or_create_collection(name=collection_name, embedding_function=embedding_function)

llm = initialize_evaluation_llm()
# llm = init_chat_model("llama3.2:3b-instruct-q8_0", model_provider="ollama")

class ChatHistoryManager:
    def __init__(self, max_history_length=10):
        self.sessions = {}
        self.max_history_length = max_history_length

    def get_or_create_session(self, session_id: str) -> List:
        """Get or create a new chat session"""
        if session_id not in self.sessions:
            self.sessions[session_id] = []
        return self.sessions[session_id]

    def add_message(self, session_id: str, message) -> None:
        """Add a message to the chat history"""
        history = self.get_or_create_session(session_id)
        history.append(message)

        if len(history) > self.max_history_length * 2:  # Keep pairs of messages
            self.sessions[session_id] = history[-self.max_history_length*2:]

    def get_history(self, session_id: str) -> List:
        """Get the chat history for a session"""
        return self.get_or_create_session(session_id)

    def clear_history(self, session_id: str) -> None:
        """Clear the chat history for a session"""
        self.sessions[session_id] = []

    def save_history(self): # TODO: STORE HISTORY TO FILE
        pass

    def load_history(self): # TODO: LOAD HISTORY FROM FILE
        pass

chat_manager = ChatHistoryManager(max_history_length=15)

# Set up retrieval tool
@tool(response_format="content_and_artifact")
def retrieve(query: str, k: int = 8):
    """Retrieve information related to a query."""
    # Query ChromaDB directly
    results = collection.query(
        query_texts=[query],
        n_results=k,
    )

    # Format results for LangChain compatibility
    retrieved_docs = []
    for i in range(len(results['ids'][0])):
        doc_id = results['ids'][0][i]
        content = results['documents'][0][i]
        metadata = results['metadatas'][0][i] if results['metadatas'][0] else {}

        doc = Document(page_content=content, metadata=metadata)
        retrieved_docs.append(doc)

    serialized = "\n\n".join(
        (f"Source: {doc.metadata}\n" f"Content: {doc.page_content}")
        for doc in retrieved_docs
    )
    return serialized, retrieved_docs

# Build the graph
def build_graph():
    graph_builder = StateGraph(MessagesState)

    def force_retrieval(state: MessagesState):
        """Always call the retrieve tool first."""
        # Get the latest human message
        latest_human_message = None
        for message in reversed(state["messages"]):
            if message.type == "human":
                latest_human_message = message
                break

        if latest_human_message is None:
            return {"messages": state["messages"]}

        # Create a tool call for retrieval with a properly formatted tool_calls attribute
        tool_call_id = str(uuid.uuid4())
        retrieval_message = AIMessage(
            content="I'll search for relevant information to answer your question.",
            tool_calls=[{
                "name": "retrieve",
                "id": tool_call_id,
                "args": {"query": latest_human_message.content, "k": 8}
            }]
        )

        return {
            "messages": state["messages"] + [retrieval_message]
        }

    # Execute the retrieval
    tools = ToolNode([retrieve])

    # Generate a response using the retrieved content
    def generate(state: MessagesState):
        """Generate answer."""
        # Get generated ToolMessages
        recent_tool_messages = []
        for message in reversed(state["messages"]):
            if message.type == "tool":
                recent_tool_messages.append(message)
            else:
                break
        tool_messages = recent_tool_messages[::-1]

        # Format into prompt
        docs_content = "\n\n".join(doc.content for doc in tool_messages)
        system_message_content = (
            """<SYSTEM>
                You are "GRC Regulatory Analysis Expert," an AI assistant specialized in California GRC proceedings.
                </SYSTEM>

                <INFORMATION SOURCES>
                Base your responses EXCLUSIVELY on:
                1. Retrieved documents (HIGHEST PRIORITY)
                2. User-provided context in the current session

                The retrieval system has already provided you with the most relevant information.
                Always cite your sources with specific references (e.g., "PG&E 2023 GRC, Exhibit 4, p.15").
                </INFORMATION SOURCES>

                <IDENTITY AND EXPERTISE>
                You are a regulatory specialist focused exclusively on California General Rate Case (GRC) proceedings and related CPUC filings with expertise in:
                - Rate case applications and testimony
                - Revenue requirement analysis
                - Procedural requirements and timelines
                - CPUC decisions and precedents
                </IDENTITY AND EXPERTISE>

                <RESPONSE FORMAT>
                Structure your responses with:
                1. Concise summary of key findings
                2. Detailed analysis with multiple supporting citations
                3. Relevant regulatory background and historical context
                4. Discussion of practical implications
                5. Complete citations formatted as markdown links

                Use markdown formatting (headers, tables, bullets) to enhance readability.
                </RESPONSE FORMAT>

                <PROFESSIONAL TONE>
                Maintain a voice that is:
                - Authoritative yet accessible
                - Technically precise
                - Thorough and explanatory
                - Objective in regulatory interpretation
                </PROFESSIONAL TONE>

                <ACCURACY REQUIREMENTS>
                - Never invent citations, docket numbers, or proceedings
                - Clearly indicate when information is missing or insufficient
                - Present multiple interpretations when guidance is ambiguous
                - Quote directly from sources for critical regulatory language
                </ACCURACY REQUIREMENTS>

                <SCOPE LIMITATIONS>
                Address only topics related to California GRC proceedings and CPUC regulatory matters.
                For other topics, politely explain they fall outside your expertise.
                </SCOPE LIMITATIONS>

                Always end responses with: "Would you like me to explore any aspect of this response in greater depth or address related regulatory considerations?"
                """
            f"Document Context: {docs_content}"
        )
        # Get conversation messages excluding tool messages and tool-calling AI messages
        conversation_messages = []
        for message in state["messages"]:
            if message.type == "human":
                conversation_messages.append(message)
            elif message.type == "ai" and not getattr(message, "tool_calls", None):
                conversation_messages.append(message)

        prompt = [SystemMessage(content=system_message_content)] + conversation_messages

        # Run llm
        response = llm.invoke(prompt)
        return {"messages": [response]}

    # Set up the graph connections
    graph_builder.add_node("force_retrieval", force_retrieval)
    graph_builder.add_node("tools", tools)
    graph_builder.add_node("generate", generate)

    graph_builder.set_entry_point("force_retrieval")
    graph_builder.add_edge("force_retrieval", "tools")
    graph_builder.add_edge("tools", "generate")
    graph_builder.add_edge("generate", END)

    return graph_builder.compile()

# Initialize the graph
graph = build_graph()

def process_query(query: str, session_id: str, retrieval_k: int = 8) -> Dict[str, Any]:
    # Set the K value for this query
    retrieve.bind(k=retrieval_k)

    start_time = time.time()

    # Get chat history
    history = chat_manager.get_history(session_id)

    # Format messages
    messages = []
    for msg in history:
        if msg["role"] == "user":
            messages.append(HumanMessage(content=msg["content"]))
        elif msg["role"] == "assistant":
            messages.append(AIMessage(content=msg["content"]))
        elif msg["role"] == "tool":
            # Create a tool message with appropriate attributes and a tool_call_id
            messages.append(ToolMessage(
                content=msg["content"],
                name=msg.get("tool_name", "retrieve"),
                tool_call_id=msg.get("tool_call_id", str(uuid.uuid4()))
            ))

    # Add the current query
    messages.append(HumanMessage(content=query))

    # Run the graph
    result = None
    tool_outputs = []

    # Process the query through the graph
    with io.StringIO() as buf, redirect_stdout(buf):
        for step in graph.stream(
            {"messages": messages},
            stream_mode="values",
        ):
            last_message = step["messages"][-1]
            if last_message.type == "tool":
                tool_outputs.append({
                    "tool_name": getattr(last_message, "name", "retrieve"),
                    "content": last_message.content
                })
            if last_message.type == "ai" and not getattr(last_message, "tool_calls", None):
                result = last_message.content

        debug_output = buf.getvalue()

    end_time = time.time()
    elapsed_time = end_time - start_time

    # Add messages to chat history
    chat_manager.add_message(session_id, {"role": "user", "content": query})

    # Add tool messages to history
    for tool_output in tool_outputs:
        chat_manager.add_message(session_id, {
            "role": "tool",
            "content": tool_output["content"],
            "tool_name": tool_output.get("tool_name", "retrieve")
        })

    if result:
        chat_manager.add_message(session_id, {"role": "assistant", "content": result})

    response = {
        "result": result,
        "processing_time": elapsed_time,
        "tool_outputs": tool_outputs,
        "session_id": session_id,
        "timestamp": datetime.now().isoformat(),
        "debug_output": debug_output if debug_output else None
    }

    return response

def get_chat_history(session_id: str):
    return chat_manager.get_history(session_id)

def clear_chat_history(session_id: str):
    chat_manager.clear_history(session_id)
    return {"status": "success", "message": f"Chat history cleared for session {session_id}"}

def generate_session_id():
    return str(uuid.uuid4())

In [27]:
class LLMChatSession():
    def __init__(self, console_mode = False) -> None:
        self.session_id = generate_session_id()
        self.console_mode = console_mode
        self.timestamp_queue = [] # New ones get inserted at end (using append); remove by deleting element 0
        self.max_queries = 15
        # When query, update message queue; if still at limit then stop, otherwise
    def query(self, user_input: str, k = None) -> Dict[str, Any]:
        # if self.is_under_limit():
        if k is None:
            response = process_query(user_input, self.session_id)
        else:
            response = process_query(user_input, self.session_id, k)


        if self.console_mode:
            print(f"\nGRC Assistant: {response['result']}")
            print(f"\nProcessing time: {response['processing_time']:.2f} seconds")
        self.timestamp_queue.append(datetime.fromisoformat(response['timestamp']))
        return {'result': response['result'],
                'messages_remaining': self.max_queries - sum([(datetime.now() - timestamp).seconds < 60 for timestamp in self.timestamp_queue]),
                'sec_remaining': [60 - (datetime.now() - timestamp).seconds for timestamp in self.timestamp_queue],
                'tool_outputs': response['tool_outputs'],
                }
        # else:
        #     raise ConnectionRefusedError(f"Rate Limit Exceeded | Try again in {60 - (datetime.now() - self.timestamp_queue[0]).seconds} seconds")

    def is_under_limit(self) -> bool:
        while len(self.timestamp_queue) > 0:
            if (datetime.now() - self.timestamp_queue[0]).seconds > 60: # Is over 60 sec time limit
                del self.timestamp_queue[0]
                continue
            else:
                break
        return len(self.timestamp_queue) < 15

    def hist_free_query(self, user_input: str) -> Dict[str, Any]:
        clear_chat_history(self.session_id)
        out = self.query(user_input)
        clear_chat_history(self.session_id)
        return out

In [23]:
results = collection.query(
        query_texts=["AAJKNKFEJ"],
        n_results=3,
    )

In [4]:
results

{'ids': [['405059555_898', '405059555_1331', '405059555_393']],
 'embeddings': None,
 'documents': [['AABBCCDDEEFF(cid:8)(cid:8)BBGGEEDDEEGGEEHHII(cid:8)(cid:8)FFJJKKKKEEIIIIEEJJLLMM(cid:8)(cid:8)IIGGNNGGHH(cid:8)(cid:8)JJOO(cid:8)(cid:8)FFNNDDEEOOJJPPLLEENN\nIINNLL(cid:8)(cid:8)OOPPNNLLFFEEIIFFJJMM(cid:8)(cid:8)FFNNDDEEOOJJPPLLEENN\n00112233224411556677(cid:9)(cid:9)889966::22112266;;22\n<<==77==>>??(cid:9)(cid:9)@@AABB(cid:9)(cid:9)CCAACCDD ZC\n(cid:23)(cid:30)(cid:9)(cid:9)(cid:9)(cid:0)(cid:2)(cid:0)(cid:3)(cid:4)(cid:5)(cid:6)(cid:7)(cid:8)(cid:0)(cid:9)(cid:5)(cid:10)(cid:10)(cid:3)(cid:8)(cid:6)(cid:11)(cid:12)(cid:13)(cid:9)(cid:11)(cid:14)(cid:2)(cid:9)(cid:11)(cid:15)(cid:3)(cid:9)(cid:15)(cid:16)(cid:17)(cid:7)(cid:18)(cid:16)(cid:4)(cid:2)\n(cid:23)’(cid:9)(cid:9)(cid:9)(cid:0)(cid:2)(cid:0)(cid:3)(cid:4)(cid:5)(cid:6)(cid:7)(cid:8)(cid:0)(cid:9)(cid:5)(cid:10)(cid:10)(cid:3)(cid:8)(cid:6)(cid:11)(cid:12)(cid:13)(cid:9)(cid:11)(cid:14)(cid:5)(cid:11)(cid:9)(cid:19)(cid:19)(

In [6]:
results['metadatas'][0]
Document(page_content=content, metadata=metadata)

[{'chunk_index': 898,
  'document_id': '405059555',
  'proceeding_id': 'A2106021',
  'source_url': 'https://docs.cpuc.ca.gov/PublishedDocs/Efile/G000/M405/K059/405059555.PDF',
  'text': 'AABBCCDDEEFF(cid:8)(cid:8)BBGGEEDDEEGGEEHHII(cid:8)(cid:8)FFJJKKKKEEIIIIEEJJLLMM(cid:8)(cid:8)IIGGNNGGHH(cid:8)(cid:8)JJOO(cid:8)(cid:8)FFNNDDEEOOJJPPLLEENN\nIINNLL(cid:8)(cid:8)OOPPNNLLFFEEIIFFJJMM(cid:8)(cid:8)FFNNDDEEOOJJPPLLEENN\n00112233224411556677(cid:9)(cid:9)889966::22112266;;22\n<<==77==>>??(cid:9)(cid:9)@@AABB(cid:9)(cid:9)CCAACCDD ZC\n(cid:23)(cid:30)(cid:9)(cid:9)(cid:9)(cid:0)(cid:2)(cid:0)(cid:3)(cid:4)(cid:5)(cid:6)(cid:7)(cid:8)(cid:0)(cid:9)(cid:5)(cid:10)(cid:10)(cid:3)(cid:8)(cid:6)(cid:11)(cid:12)(cid:13)(cid:9)(cid:11)(cid:14)(cid:2)(cid:9)(cid:11)(cid:15)(cid:3)(cid:9)(cid:15)(cid:16)(cid:17)(cid:7)(cid:18)(cid:16)(cid:4)(cid:2)\n(cid:23)’(cid:9)(cid:9)(cid:9)(cid:0)(cid:2)(cid:0)(cid:3)(cid:4)(cid:5)(cid:6)(cid:7)(cid:8)(cid:0)(cid:9)(cid:5)(cid:10)(cid:10)(cid:3)(cid:8)(cid:6)(

In [7]:
retrieved_docs = []
for i in range(len(results['ids'][0])):
    doc_id = results['ids'][0][i]
    content = results['documents'][0][i]
    metadata = results['metadatas'][0][i] if results['metadatas'][0] else {}

    doc = Document(page_content=content, metadata=metadata)
    retrieved_docs.append(doc)

serialized = "\n\n".join(
    (f"Source: {doc.metadata}\n" f"Content: {doc.page_content}")
    for doc in retrieved_docs
)

In [9]:
serialized

"Source: {'chunk_index': 898, 'document_id': '405059555', 'proceeding_id': 'A2106021', 'source_url': 'https://docs.cpuc.ca.gov/PublishedDocs/Efile/G000/M405/K059/405059555.PDF', 'text': 'AABBCCDDEEFF(cid:8)(cid:8)BBGGEEDDEEGGEEHHII(cid:8)(cid:8)FFJJKKKKEEIIIIEEJJLLMM(cid:8)(cid:8)IIGGNNGGHH(cid:8)(cid:8)JJOO(cid:8)(cid:8)FFNNDDEEOOJJPPLLEENN\\nIINNLL(cid:8)(cid:8)OOPPNNLLFFEEIIFFJJMM(cid:8)(cid:8)FFNNDDEEOOJJPPLLEENN\\n00112233224411556677(cid:9)(cid:9)889966::22112266;;22\\n<<==77==>>??(cid:9)(cid:9)@@AABB(cid:9)(cid:9)CCAACCDD ZC\\n(cid:23)(cid:30)(cid:9)(cid:9)(cid:9)(cid:0)(cid:2)(cid:0)(cid:3)(cid:4)(cid:5)(cid:6)(cid:7)(cid:8)(cid:0)(cid:9)(cid:5)(cid:10)(cid:10)(cid:3)(cid:8)(cid:6)(cid:11)(cid:12)(cid:13)(cid:9)(cid:11)(cid:14)(cid:2)(cid:9)(cid:11)(cid:15)(cid:3)(cid:9)(cid:15)(cid:16)(cid:17)(cid:7)(cid:18)(cid:16)(cid:4)(cid:2)\\n(cid:23)’(cid:9)(cid:9)(cid:9)(cid:0)(cid:2)(cid:0)(cid:3)(cid:4)(cid:5)(cid:6)(cid:7)(cid:8)(cid:0)(cid:9)(cid:5)(cid:10)(cid:10)(cid:3)(cid:8)(ci

In [3]:
results['distances']

[[1.3253289461135864, 1.3262158632278442, 1.3285788297653198]]

In [12]:
llm_session = LLMChatSession(console_mode=True)

In [18]:
llm_session.query("Hey, how's it going?")


GRC Assistant: I am doing well. How can I help you today?

Processing time: 0.38 seconds


{'result': 'I am doing well. How can I help you today?',
 'messages_remaining': 14,
 'sec_remaining': [-45, -39, -22, 60],
 'tool_outputs': []}

In [19]:
result = llm_session.query("What information do you vae on PG&E from your retrival")


GRC Assistant: I can provide information on PG&E from the documents I have access to. To give you the most relevant information, could you please specify what you're interested in? For example, are you interested in:

*   PG&E's 2023 General Rate Case (GRC) application?
*   PG&E's Short-Term Incentive Plan (STIP)?
*   PG&E's wildfire risk mitigation work?
*   PG&E's Non-Tariffed Products and Services?
*   PG&E's plans for electrifying vehicles?
*   Other topics related to PG&E?

Once you let me know the specific area you're interested in, I can provide a detailed summary with supporting citations.

Processing time: 2.12 seconds


In [20]:
llm_session.query("I am interested learning more about the Non-Tariffed Products and Services")


GRC Assistant: Okay, I can help you with information regarding PG&E's Non-Tariffed Products and Services (NTP&S) based on the documents I have.

**Overview of PG&E's Non-Tariffed Products and Services**

PG&E offers Non-Tariffed Products and Services (NTP&S) primarily using underutilized PG&E assets or capacity to generate incremental revenues by marketing products and services to third parties (PG&E GRC A.21-06-021, [Ex-06 at 2-17](https://docs.cpuc.ca.gov/PublishedDocs/Efile/G000/M520/K418/520418881.PDF)). These services are consistent with the Affiliate Transaction Rules, Rule VII, adopted by the Commission in D.06-12-029 (PG&E GRC A.21-06-021, [Ex-06 at 2-17](https://docs.cpuc.ca.gov/PublishedDocs/Efile/G000/M520/K418/520418881.PDF)).

**Examples of Non-Tariffed Products and Services**

Typical transactions under this program include (PG&E GRC A.21-06-021, [Ex-06](https://docs.cpuc.ca.gov/PublishedDocs/Efile/G000/M520/K418/520418881.PDF)):

*   Joint use pole attachment arrangemen

{'result': "Okay, I can help you with information regarding PG&E's Non-Tariffed Products and Services (NTP&S) based on the documents I have.\n\n**Overview of PG&E's Non-Tariffed Products and Services**\n\nPG&E offers Non-Tariffed Products and Services (NTP&S) primarily using underutilized PG&E assets or capacity to generate incremental revenues by marketing products and services to third parties (PG&E GRC A.21-06-021, [Ex-06 at 2-17](https://docs.cpuc.ca.gov/PublishedDocs/Efile/G000/M520/K418/520418881.PDF)). These services are consistent with the Affiliate Transaction Rules, Rule VII, adopted by the Commission in D.06-12-029 (PG&E GRC A.21-06-021, [Ex-06 at 2-17](https://docs.cpuc.ca.gov/PublishedDocs/Efile/G000/M520/K418/520418881.PDF)).\n\n**Examples of Non-Tariffed Products and Services**\n\nTypical transactions under this program include (PG&E GRC A.21-06-021, [Ex-06](https://docs.cpuc.ca.gov/PublishedDocs/Efile/G000/M520/K418/520418881.PDF)):\n\n*   Joint use pole attachment arra

In [8]:
llm_session.query("What was my first question?")


GRC Assistant: Your first question was: "What information do you vae on PG&E from your retrival"

Processing time: 0.52 seconds


{'result': 'Your first question was: "What information do you vae on PG&E from your retrival"',
 'messages_remaining': 12,
 'sec_remaining': [53, 60, 60],
 'tool_outputs': []}

# Prompt Testing

In [28]:
import io
import time
import random
from datetime import datetime
from pathlib import Path
from contextlib import redirect_stdout
import traceback
import re

from langchain.schema import HumanMessage
from google.api_core import exceptions as google_exceptions

def test_prompts_and_save(prompts):
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = Path(
        f"./llm_output/test/llm_responses_with_context_qa_gemini_{timestamp}.txt"
    )
    llm_session = LLMChatSession(console_mode=True)
    clear_chat_history(llm_session.session_id)
    with output_path.open("w", encoding="utf-8") as f:
        for idx, prompt_text in enumerate(prompts):
            print(f"\n--- Running Prompt {idx+1}/{len(prompts)} ---")
            f.write(f"================= Prompt {idx+1} =================\n")
            f.write(f"PROMPT:\n{prompt_text}\n\n")

            success = False
            attempt = 0

            while not success:
                try:
                    response = llm_session.query(prompt_text)
                    response_result = response['result']
                    try:
                        response_context = response['tool_outputs'][0]['content']
                        f.write(f'SOURCES:\n{response_context}\n\n')
                    except:
                        f.write(f'SOURCES:\nNONE\n\n')
                    f.write(f'LLM RESPONSE:\n{response_result}\n\n')
                    clear_chat_history(llm_session.session_id)
                    success = True
                # except ConnectionRefusedError as e:
                #     error_message = str(e)
                #     match = re.search(r"Try again in (\d+) seconds", error_message)
                #     if match:
                #         wait_time = int(match.group(1))
                #         print(f"Rate limited. Waiting for {wait_time} seconds...")
                #         time.sleep(wait_time)
                #     clear_chat_history(llm_session.session_id)
                except google_exceptions.ResourceExhausted as e:
                    error_message = str(e)
                    print(f"Rate limit hit")

                    # Extract the retry_delay using regex
                    retry_match = re.search(r"retry_delay\s*\{\s*seconds:\s*(\d+)", error_message)

                    if retry_match:
                        retry_seconds = int(retry_match.group(1))
                        print(f"API suggests waiting for {retry_seconds} seconds")

                        # Wait for the suggested time
                        time.sleep(retry_seconds)
            if success:
                print(f"Prompt {idx+1} complete.")
            else:
                print(f"Prompt {idx+1} failed after {attempt} attempts.")

            # Delay between prompts
            if idx < len(prompts) - 1:
                time.sleep(random.uniform(1, 3))

    print(f"\nAll responses and contexts saved to: {output_path.resolve()}")


In [29]:
import json

with open('./qa_pairs/test_qa.json', 'r', encoding='utf-8') as file:
    test_data = json.load(file)

In [30]:
test_list = list(pair['question'] for pair in test_data['qa_pairs'])

In [31]:
test_prompts_and_save(test_list)


--- Running Prompt 1/60 ---

GRC Assistant: I am unable to determine the witness for the CSO Closure and Transformation Proposal from the provided documents. Would you like me to explore any aspect of this response in greater depth or address related regulatory considerations?

Processing time: 1.02 seconds
Prompt 1 complete.

--- Running Prompt 2/60 ---

GRC Assistant: PG&E filed Application A.22-04-016 in April 2022, proposing to permanently close all 65 of its Customer Service Offices (CSOs) (PG&E 2023 GRC, [520418881](https://docs.cpuc.ca.gov/PublishedDocs/Efile/G000/M520/K418/520418881.PDF), p.531; PG&E 2023 GRC, [520896345](https://docs.cpuc.ca.gov/PublishedDocs/Published/G000/M520/K896/520896345.pdf), p.534; PG&E 2023 GRC, [520114360](https://docs.cpuc.ca.gov/PublishedDocs/Efile/G000/M520/K114/520114360.PDF), p.531). The Commission authorized the permanent closure of PG&E’s 65 Customer Service Offices effective January 1, 2023, in D.22-12-033 (PG&E 2023 GRC, [520896345](https:/

---

In [32]:
from pathlib import Path
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from langgraph.graph import MessagesState, StateGraph
from langchain_core.tools import tool
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage, ToolMessage
from langgraph.prebuilt import ToolNode
from langgraph.graph import END
from langgraph.prebuilt import tools_condition
from langchain_core.documents import Document
from langchain.chat_models import init_chat_model
import time
import io
from contextlib import redirect_stdout
from datetime import datetime
from testing.response_eval_tools import set_api_key_from_path, initialize_evaluation_llm, evaluate_model_outputs_from_paths
from pathlib import Path
import json
from typing import List, Dict, Any, Optional, Union
import uuid
import pandas as pd
import re

from qa_pairs.qa_data_loader import load_static_demo_pairs
set_api_key_from_path(Path('./testing/gemini-key'))
llm = initialize_evaluation_llm()
llm_session = LLMChatSession()

In [33]:
import re
from pathlib import Path
from typing import List, Dict, Any

def parse_prompt_blocks(file_path: str) -> List[Dict[str, Any]]:
    path = Path(file_path)

    if not path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    with open(path, 'r', encoding='utf-8') as f:
        file_content = f.read()

    delimiter_pattern = r'=================\s*Prompt\s+(\d+)\s*=================\n'

    # Split the file content by prompt secitons
    blocks = re.split(delimiter_pattern, file_content)

    # First element will be empty or any text before the first delimiter
    blocks = blocks[1:] if blocks[0].strip() == '' else blocks

    # Process the blocks (pairs of [prompt_number, content])
    results = []

    # Handle paired elements (every odd index is a prompt number, every even index is content)
    for i in range(0, len(blocks)-1, 2):
        prompt_num = int(blocks[i])
        content = blocks[i+1]

        # Extract the prompt text
        prompt_match = re.search(r'PROMPT:\n(.*?)\n\n', content, re.DOTALL)
        prompt_text = prompt_match.group(1) if prompt_match else "Prompt text not found"

        # Get the response content
        response = content[prompt_match.end():] if prompt_match else content

        results.append({
            "prompt_num": prompt_num,
            "prompt_text": prompt_text,
            "response": response,
            "full_block": f"================= Prompt {prompt_num} =================\n{content}"
        })

    return results

In [34]:
import json
import time
import random
import traceback
from typing import List, Dict, Any, Optional
from pathlib import Path
from google.api_core import exceptions as google_exceptions

def build_evaluation_prompt(model_output_text: str) -> str:
    prompt = f"""
    You are evaluating a retrieval-augmented generation (RAG) system. For each Q/A pair in the data below, you will:
    1. Assign a relevance score (0–5) based on how well the retrieved context matches the question.
       - 0: No relevance, the context is completely off.
       - 1: Very low relevance, context is weakly related.
       - 2: Some relevance, context contains some useful details, but is not fully aligned.
       - 3: Good relevance, context provides useful information that mostly answers the question.
       - 4: Very good relevance, context almost entirely answers the question, with minimal extra information.
       - 5: Perfect relevance, context directly and completely answers the question.

    2. Assign a usage score (0–5) based on whether the context was properly used in generating the answer:
       - 0: The context was not used at all.
       - 1: The context was partially used but not enough to affect the response.
       - 2: The context was mentioned but not effectively incorporated.
       - 3: The context was used, but there were gaps in how it was integrated.
       - 4: The context was effectively used, though some information might be missing.
       - 5: The context was fully integrated and used to generate the response.

    3. Optional: Identify and flag any hallucinations or unsupported claims. If the AI refers to information not found in the context, it should be noted.

    Please format the output as a JSON array with the following fields: "question", "relevance_score", "usage_score", "hallucination_notes".

    Here is the data to evaluate:
    {model_output_text}
    """
    return prompt


def build_accuracy_evaluation_prompt(question: str, original_answer: str, model_answer: str) -> str:
    prompt = f"""
    You are evaluating the accuracy of an AI assistant's answer compared to a reference answer.

    Question: {question}

    Reference Answer: {original_answer}

    AI Assistant's Answer: {model_answer}

    Please evaluate the accuracy of the AI Assistant's answer on a scale from 0 to 5:

    - 0: Completely incorrect or contradicts the reference answer.
    - 1: Mostly incorrect with minimal accurate elements.
    - 2: Partially correct but contains significant inaccuracies or missing critical information.
    - 3: Moderately accurate with some minor inaccuracies or omissions.
    - 4: Highly accurate with very minor omissions or imprecisions.
    - 5: Perfectly accurate, capturing all key information from the reference answer.

    Provide your assessment as a JSON object with the following fields:
    - "accuracy_score": The numeric score (0-5)
    - "reasoning": Brief explanation of your scoring decision
    - "key_differences": Notable differences between the reference and assistant's answer

    Evaluate based on semantic correctness, not exact wording. The AI Assistant's answer can be phrased differently and still be accurate if it conveys the same meaning.
    """
    return prompt


def evaluate_with_retry(prompt: str, llm) -> Optional[str]:
    success = False

    while not success:
        try:
            response = llm.invoke(prompt)
            success = True
            return response.content

        except google_exceptions.ResourceExhausted as e:
            error_string = str(e)

            retry_match = re.search(r'retry_delay\s*{\s*seconds:\s*(\d+)', error_string)
            if retry_match:
                retry_seconds = int(retry_match.group(1))

            print(f"Waiting for {retry_seconds:.1f} seconds before retrying...")

            time.sleep(retry_seconds)

def parse_json_response(response_text: str) -> List[Dict[str, Any]]:
    if not response_text:
        return []

    try:
        cleaned = str(response_text).strip()

        # Remove json code blocks
        cleaned = cleaned.replace("```json", "").replace("```", "").strip()

        json_match = re.search(r'\[\s*{.+}\s*\]', cleaned, re.DOTALL)
        if json_match:
            cleaned = json_match.group(0)

        # Load json
        metrics = json.loads(cleaned)

        return metrics

    except json.JSONDecodeError as e:
        print(f"Failed to parse response as JSON: {e}")
        print("Raw content:")
        print(response_text)
        return []


def extract_model_answer(block_response: str) -> str:
    # Try various patterns to extract the answer
    answer_patterns = [
        # Pattern: "Answer: [answer text]"
        r'(?:answer|response):\s*(.*?)(?:\n\n|\Z)',
        # Pattern: Q: [question] A: [answer]
        r'Q:.*?A:\s*(.*?)(?:\n\n|\Z)',
        # Pattern: "The answer is [answer text]"
        r'(?:the answer is|the response is)\s*(.*?)(?:\n\n|\Z)',
    ]

    for pattern in answer_patterns:
        match = re.search(pattern, block_response, re.IGNORECASE | re.DOTALL)
        if match:
            return match.group(1).strip()



def evaluate_answer_accuracy(question: str, original_answer: str, model_answer: str, llm) -> dict:
    prompt = build_accuracy_evaluation_prompt(question, original_answer, model_answer)

    response_content = evaluate_with_retry(prompt, llm)

    if not response_content:
        return {
            "accuracy_score": 0,
            "reasoning": "Failed to evaluate due to technical issues",
            "key_differences": "N/A"
        }

    try:
        # Clean up common JSON formatting issues
        result = parse_json_response(response_content)

        result["accuracy_score"] = result.get("accuracy_score", 0)
        result["reasoning"] = result.get("reasoning", "No reasoning provided")
        result["key_differences"] = result.get("key_differences", "No differences noted")

        return result

    except Exception as e:
        print(f"Failed to parse accuracy evaluation response: {e}")
        print("Raw content:")
        print(response_content)

        # Return a default result
        return {
            "accuracy_score": 0,
            "reasoning": "Failed to parse evaluation",
            "key_differences": "Error processing evaluation response"
        }


def save_evaluation_results(results: List[Dict[str, Any]], output_path: str) -> None:
    path = Path(output_path)

    path.parent.mkdir(parents=True, exist_ok=True)

    # Save the complete results
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    print(f"\nEvaluation results saved to: {path.resolve()}")

    csv_path = path.with_suffix('.csv')

    # Extract metrics into a flat structure
    flat_metrics = []

    for result in results:
        block_num = result['block_info']['prompt_num']

        for metric in result['evaluation_metrics']:
            flat_metric = {
                'prompt_num': block_num,
                'question': metric.get('question', 'N/A'),
                'relevance_score': metric.get('relevance_score', 'N/A'),
                'usage_score': metric.get('usage_score', 'N/A'),
                'answer_accuracy': metric.get('answer_accuracy', 'N/A'),
                'accuracy_reasoning': metric.get('accuracy_reasoning', 'N/A')[:200] + '...' if metric.get('accuracy_reasoning') and len(metric.get('accuracy_reasoning')) > 200 else metric.get('accuracy_reasoning', 'N/A'),
                'difficulty': metric.get('difficulty', 'N/A'),
                'document_id': metric.get('document_id', 'N/A'),
                'hallucination_notes': metric.get('hallucination_notes', '')
            }
            flat_metrics.append(flat_metric)

    df = pd.DataFrame(flat_metrics)
    df.to_csv(csv_path, index=False)
    print(f"Summary CSV saved to: {csv_path.resolve()}")


def evaluate_parsed_blocks(parsed_blocks: List[Dict[str, Any]], llm, original_qa_data: Optional[List[Dict[str, Any]]] = None, output_path: Optional[str] = None) -> List[Dict[str, Any]]:
    all_metrics = []
    results_with_block_info = []

    # Map original Q&A data by question for quick lookup if provided
    qa_lookup = {}
    if original_qa_data:
        # Create a lookup by normalizing questions (lowercase, remove punctuation)
        for idx, qa_item in enumerate(original_qa_data):
            # Create normalized question for matching
            norm_q = re.sub(r'[^\w\s]', '', qa_item['question'].lower())
            norm_q = re.sub(r'\s+', ' ', norm_q).strip()
            qa_lookup[norm_q] = (idx, qa_item)

    for i, block in enumerate(parsed_blocks):
        print(f"\nEvaluating block {i+1}/{len(parsed_blocks)} (Prompt #{block['prompt_num']})...")

        # Build evaluation prompt using the full_block content
        prompt = build_evaluation_prompt(block['full_block'])

        # Evaluate with retry
        response_content = evaluate_with_retry(prompt, llm)

        if response_content:
            metrics = parse_json_response(response_content)

            if metrics:
                for metric in metrics:
                    metric['prompt_num'] = block['prompt_num']

                    # If we have original Q&A data, try to match and calculate accuracy
                    if qa_lookup:
                        norm_q = re.sub(r'[^\w\s]', '', metric['question'].lower())
                        norm_q = re.sub(r'\s+', ' ', norm_q).strip()


                        idx, qa_item = qa_lookup[norm_q]
                        matched_qa = (idx, qa_item)


                        idx, qa_item = matched_qa
                        metric['original_answer'] = qa_item['answer']
                        metric['original_qa_index'] = idx
                        metric['difficulty'] = qa_item.get('difficulty', 'unknown')
                        metric['document_id'] = qa_item.get('document_identification', 'unknown')

                        model_answer = extract_model_answer(block['response'])

                        # LLM accuracy evaluation
                        print(f"  Evaluating answer accuracy for question: '{metric['question'][:50]}...'")
                        accuracy_result = evaluate_answer_accuracy(
                            metric['question'],
                            qa_item['answer'],
                            model_answer,
                            llm
                        )

                        metric['answer_accuracy'] = accuracy_result['accuracy_score']
                        metric['accuracy_reasoning'] = accuracy_result['reasoning']
                        metric['accuracy_key_differences'] = accuracy_result['key_differences']

                all_metrics.extend(metrics)

                results_with_block_info.append({
                    'block_info': block,
                    'evaluation_metrics': metrics,
                    'raw_response': response_content
                })

                print(f"Successfully evaluated block {i+1}. Found {len(metrics)} metric entries.")
            else:
                print(f"Failed to parse metrics for block {i+1}.")
        else:
            print(f"Failed to evaluate block {i+1} after multiple retries.")

    if output_path and results_with_block_info:
        save_evaluation_results(results_with_block_info, output_path)

    return all_metrics

In [35]:
# Evaluate the parsed blocks
input_file = "./llm_output/test/llm_responses_with_context_qa_gemini_20250521_143250.txt"
output_file = "./llm_output/evaluations/FIXED_RETRIEVAL.json"
parsed_blocks = parse_prompt_blocks(input_file)
print(f"Found {len(parsed_blocks)} prompt blocks to evaluate.")

original_qa_data = test_data['qa_pairs']

evaluation_metrics = evaluate_parsed_blocks(parsed_blocks, llm, original_qa_data, output_file)

# Calculate overall statistics
if evaluation_metrics:
    relevance_scores = [m.get('relevance_score', 0) for m in evaluation_metrics if isinstance(m.get('relevance_score'), (int, float))]
    usage_scores = [m.get('usage_score', 0) for m in evaluation_metrics if isinstance(m.get('usage_score'), (int, float))]
    accuracy_scores = [m.get('answer_accuracy', 0) for m in evaluation_metrics if isinstance(m.get('answer_accuracy'), (int, float))]

    if relevance_scores:
        avg_relevance = sum(relevance_scores) / len(relevance_scores)
        print(f"\nAverage relevance score: {avg_relevance:.2f}/5")

    if usage_scores:
        avg_usage = sum(usage_scores) / len(usage_scores)
        print(f"Average usage score: {avg_usage:.2f}/5")

    if accuracy_scores:
        avg_accuracy = sum(accuracy_scores) / len(accuracy_scores)
        print(f"Average answer accuracy: {avg_accuracy:.2f}/5")

    hallucinations = sum(1 for m in evaluation_metrics if m.get('hallucination_notes') and m.get('hallucination_notes').strip())
    print(f"Number of entries with hallucinations: {hallucinations}/{len(evaluation_metrics)}")

    # Group metrics by difficulty if available
    difficulty_groups = {}
    for metric in evaluation_metrics:
        difficulty = metric.get('difficulty', 'unknown')
        if difficulty not in difficulty_groups:
            difficulty_groups[difficulty] = []
        difficulty_groups[difficulty].append(metric)

    # Print stats by difficulty
    print("\nMetrics by difficulty:")
    for difficulty, metrics in difficulty_groups.items():
        if metrics:
            avg_rel = sum(m.get('relevance_score', 0) for m in metrics if isinstance(m.get('relevance_score'), (int, float))) / len(metrics)
            avg_use = sum(m.get('usage_score', 0) for m in metrics if isinstance(m.get('usage_score'), (int, float))) / len(metrics)
            avg_acc = sum(m.get('answer_accuracy', 0) for m in metrics if isinstance(m.get('answer_accuracy'), (int, float))) / len(metrics)
            print(f"  {difficulty.capitalize()} ({len(metrics)} questions):")
            print(f"    Relevance: {avg_rel:.2f}/5")
            print(f"    Usage: {avg_use:.2f}/5")
            print(f"    Accuracy: {avg_acc:.2f}/5")

Found 60 prompt blocks to evaluate.

Evaluating block 1/60 (Prompt #1)...
  Evaluating answer accuracy for question: 'Who is the witness for the CSO Closure and Transfo...'
Successfully evaluated block 1. Found 1 metric entries.

Evaluating block 2/60 (Prompt #2)...
  Evaluating answer accuracy for question: 'When did PG&E file its application to close its 65...'
Successfully evaluated block 2. Found 1 metric entries.

Evaluating block 3/60 (Prompt #3)...
  Evaluating answer accuracy for question: 'Why did PG&E propose transitioning its CSO workfor...'
Successfully evaluated block 3. Found 1 metric entries.

Evaluating block 4/60 (Prompt #4)...
  Evaluating answer accuracy for question: 'How did PG&E reassign its CSO workforce in respons...'
Successfully evaluated block 4. Found 1 metric entries.

Evaluating block 5/60 (Prompt #5)...
  Evaluating answer accuracy for question: 'Yes/No: By the end of 2021, did 98% of exclusive C...'
Successfully evaluated block 5. Found 1 metric entries.

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 19
}
].


  Evaluating answer accuracy for question: 'Why did PG&E include partnerships with community-b...'
Waiting for 17.0 seconds before retrying...
Successfully evaluated block 8. Found 1 metric entries.

Evaluating block 9/60 (Prompt #9)...
  Evaluating answer accuracy for question: 'Yes/No: Will PG&E post signage at the closed CSO l...'
Successfully evaluated block 9. Found 1 metric entries.

Evaluating block 10/60 (Prompt #10)...
  Evaluating answer accuracy for question: 'How will PG&E measure and report the performance o...'
Successfully evaluated block 10. Found 1 metric entries.

Evaluating block 11/60 (Prompt #11)...
  Evaluating answer accuracy for question: 'Which parties entered into the Memorandum of Under...'
Successfully evaluated block 11. Found 1 metric entries.

Evaluating block 12/60 (Prompt #12)...
  Evaluating answer accuracy for question: 'How does PG&E’s CSO Closure and Transformation Pro...'
Successfully evaluated block 12. Found 1 metric entries.

Evaluating block 13

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 40
}
].


Successfully evaluated block 15. Found 1 metric entries.

Evaluating block 16/60 (Prompt #16)...
Waiting for 38.0 seconds before retrying...
  Evaluating answer accuracy for question: 'How did PG&E reassign its CSO workforce in respons...'
Successfully evaluated block 16. Found 1 metric entries.

Evaluating block 17/60 (Prompt #17)...
  Evaluating answer accuracy for question: 'Yes/No: By the end of 2021, did 98% of exclusive C...'
Successfully evaluated block 17. Found 1 metric entries.

Evaluating block 18/60 (Prompt #18)...
  Evaluating answer accuracy for question: 'Yes/No: Will PG&E submit its last CSO Annual Repor...'
Successfully evaluated block 18. Found 1 metric entries.

Evaluating block 19/60 (Prompt #19)...
  Evaluating answer accuracy for question: 'What communication methods will PG&E use to inform...'
Successfully evaluated block 19. Found 1 metric entries.

Evaluating block 20/60 (Prompt #20)...
  Evaluating answer accuracy for question: 'Why did PG&E include partnershi

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 41
}
].


  Evaluating answer accuracy for question: 'Which parties entered into the Memorandum of Under...'
Successfully evaluated block 23. Found 1 metric entries.

Evaluating block 24/60 (Prompt #24)...


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 36
}
].


  Evaluating answer accuracy for question: 'How does PG&E's CSO Closure and Transformation Pro...'
Waiting for 34.0 seconds before retrying...
Successfully evaluated block 24. Found 1 metric entries.

Evaluating block 25/60 (Prompt #25)...
  Evaluating answer accuracy for question: 'Under which rule did PG&E submit its motion reques...'
Successfully evaluated block 25. Found 1 metric entries.

Evaluating block 26/60 (Prompt #26)...
  Evaluating answer accuracy for question: 'As of the filing of this motion, for how long have...'
Successfully evaluated block 26. Found 1 metric entries.

Evaluating block 27/60 (Prompt #27)...
  Evaluating answer accuracy for question: 'Why does PG&E argue that reopening the CSOs prior ...'
Successfully evaluated block 27. Found 1 metric entries.

Evaluating block 28/60 (Prompt #28)...
  Evaluating answer accuracy for question: 'What pre-pandemic CSO usage trends did PG&E cite t...'
Successfully evaluated block 28. Found 1 metric entries.

Evaluating bloc

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 39
}
].


  Evaluating answer accuracy for question: 'As of the filing of this motion, for how long have...'
Waiting for 37.0 seconds before retrying...
Successfully evaluated block 32. Found 1 metric entries.

Evaluating block 33/60 (Prompt #33)...
  Evaluating answer accuracy for question: 'Why does PG&E argue that reopening the CSOs prior ...'
Successfully evaluated block 33. Found 1 metric entries.

Evaluating block 34/60 (Prompt #34)...
  Evaluating answer accuracy for question: 'What pre-pandemic CSO usage trends did PG&E cite t...'
Successfully evaluated block 34. Found 1 metric entries.

Evaluating block 35/60 (Prompt #35)...
  Evaluating answer accuracy for question: 'Yes/No: Does the motion state that 98% of exclusiv...'
Successfully evaluated block 35. Found 1 metric entries.

Evaluating block 36/60 (Prompt #36)...
  Evaluating answer accuracy for question: 'Which statutory provision does PG&E cite to confir...'
Successfully evaluated block 36. Found 1 metric entries.

Evaluating bloc

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 41
}
].


  Evaluating answer accuracy for question: 'Why did the Settling Parties request a decision by...'
Waiting for 39.0 seconds before retrying...
Successfully evaluated block 40. Found 1 metric entries.

Evaluating block 41/60 (Prompt #41)...
  Evaluating answer accuracy for question: 'How does the Decision justify that the self-insura...'
Successfully evaluated block 41. Found 1 metric entries.

Evaluating block 42/60 (Prompt #42)...
  Evaluating answer accuracy for question: 'Yes/No: Were any responses filed opposing the Join...'
Successfully evaluated block 42. Found 1 metric entries.

Evaluating block 43/60 (Prompt #43)...
  Evaluating answer accuracy for question: 'Which application number does Decision 25-03-008 a...'
Successfully evaluated block 43. Found 1 metric entries.

Evaluating block 44/60 (Prompt #44)...
  Evaluating answer accuracy for question: 'On what date was Decision 25-03-008 issued?...'
Successfully evaluated block 44. Found 1 metric entries.

Evaluating block 45/60

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 40
}
].


  Evaluating answer accuracy for question: 'Yes/No: Were any responses filed opposing the Join...'


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 37
}
].


Successfully evaluated block 48. Found 1 metric entries.

Evaluating block 49/60 (Prompt #49)...
Waiting for 35.0 seconds before retrying...
  Evaluating answer accuracy for question: 'What is the effective date for the proposed rate i...'
Successfully evaluated block 49. Found 1 metric entries.

Evaluating block 50/60 (Prompt #50)...
  Evaluating answer accuracy for question: 'Approximately what percentage of the requested rev...'
Successfully evaluated block 50. Found 1 metric entries.

Evaluating block 51/60 (Prompt #51)...
  Evaluating answer accuracy for question: 'Why did PG&E include an analysis of climate change...'
Successfully evaluated block 51. Found 1 metric entries.

Evaluating block 52/60 (Prompt #52)...
  Evaluating answer accuracy for question: 'How does PG&E plan to manage the expected decline ...'
Successfully evaluated block 52. Found 1 metric entries.

Evaluating block 53/60 (Prompt #53)...
  Evaluating answer accuracy for question: 'Yes/No: Does PG&E propose to re

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 42
}
].


Successfully evaluated block 56. Found 1 metric entries.

Evaluating block 57/60 (Prompt #57)...
Waiting for 40.0 seconds before retrying...
  Evaluating answer accuracy for question: 'Why did PG&E include an analysis of climate change...'
Successfully evaluated block 57. Found 1 metric entries.

Evaluating block 58/60 (Prompt #58)...
  Evaluating answer accuracy for question: 'How does PG&E plan to manage the expected decline ...'
Successfully evaluated block 58. Found 1 metric entries.

Evaluating block 59/60 (Prompt #59)...
  Evaluating answer accuracy for question: 'Yes/No: Does PG&E propose to retain and operate th...'
Successfully evaluated block 59. Found 1 metric entries.

Evaluating block 60/60 (Prompt #60)...
  Evaluating answer accuracy for question: 'Yes/No: Does the 2023 GRC forecast include funding...'
Successfully evaluated block 60. Found 1 metric entries.

Evaluation results saved to: C:\Users\Elijah\PycharmProjects\GRCResponder_tools\llm_output\evaluations\FIXED_RETRI

In [11]:
test_list

['Who is the witness for the CSO Closure and Transformation Proposal?',
 'When did PG&E file its application to close its 65 Customer Service Offices?',
 'Why did PG&E propose transitioning its CSO workforce to proactive customer outreach?',
 'How did PG&E reassign its CSO workforce in response to the COVID-19 pandemic?',
 'Yes/No: By the end of 2021, did 98% of exclusive CSO customers make payments through other channels?',
 'Yes/No: Will PG&E submit its last CSO Annual Report on March 1, 2027 with data for 2026?',
 'What communication methods will PG&E use to inform its customers within 90 days of the Commission’s approval of the CSO Closure and Transformation Proposal?',
 'Why did PG&E include partnerships with community-based organizations in its CSO communications plan?',
 'Yes/No: Will PG&E post signage at the closed CSO locations for at least one year?',
 'How will PG&E measure and report the performance of its CSO Closure and Transformation Program?',
 'Which parties entered in

# Simple Context Length Testing

In [3]:
llm_session = LLMChatSession(console_mode=True)

In [17]:
result = llm_session.query("Why did PG&E include partnerships with community-based organizations in its CSO communications plan?")


GRC Assistant: PG&E included partnerships with Community Based Organizations (CBOs) in its communications plan for several reasons, primarily to improve outreach to underserved communities and ensure vulnerable customers have access to important information. Here's a breakdown of the key motivations:

*   **Improved Outreach to Underserved Communities:** PG&E aimed to enhance its outreach to underserved communities by collaborating with CBOs. These organizations can disseminate information in multiple languages about financial assistance and bill savings programs, rate options, wildfire safety-related topics, and emergency preparedness, thus helping to reach vulnerable customers (A2106021, 397495992.PDF, p. 3).
*   **Engagement with Vulnerable Communities:** PG&E engaged with organizations like TURN (The Utility Reform Network), CforAT (Center for Accessible Technology), Cal Advocates, and NDC (National Diversity Coalition) to ensure that the voices of their constituents were heard in

In [15]:
clear_chat_history(llm_session.session_id)

{'status': 'success',
 'message': 'Chat history cleared for session a0c070db-dc46-46b9-8c20-2ce6d0b7da1b'}

In [16]:
result = llm_session.query("Why did PG&E include partnerships with community-based organizations in its CSO communications plan?", k = 20)


GRC Assistant: I do not have access to the specifics of PG&E's CSO communications plan. However, I can use the retrieve tool to search for information regarding why PG&E might include partnerships with community-based organizations in its communications plan. What would you like me to search for?

Processing time: 0.80 seconds


In [14]:
clear_chat_history(llm_session.session_id)

{'status': 'success',
 'message': 'Chat history cleared for session a0c070db-dc46-46b9-8c20-2ce6d0b7da1b'}

In [12]:
result = llm_session.query("Why did PG&E include partnerships with community-based organizations in its CSO communications plan?", k = 50)


GRC Assistant: I do not have access to the specifics of PG&E's CSO communications plan. However, I can use a search query to retrieve information about why PG&E might include partnerships with community-based organizations in its communications plan. What query would you like me to use?

Processing time: 0.82 seconds
