# Evaluating the fine-tuned model

### Needed packages and imports

In [1]:
!pip install -r requirements.txt

Collecting torch~=2.5.1 (from -r requirements.txt (line 16))
  Obtaining dependency information for torch~=2.5.1 from https://files.pythonhosted.org/packages/d1/35/e8b2daf02ce933e4518e6f5682c72fd0ed66c15910ea1fb4168f442b71c4/torch-2.5.1-cp311-cp311-manylinux1_x86_64.whl.metadata
  Downloading torch-2.5.1-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Downloading torch-2.5.1-cp311-cp311-manylinux1_x86_64.whl (906.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m906.5/906.5 MB[0m [31m283.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[0mInstalling collected packages: torch
[0mSuccessfully installed torch
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


### Model inference parameters

Inference parameters used for the fine-tuned model.

In [2]:
import requests
import os
import yaml
import json
import re
import time
import pandas as pd
import torch

from typing import Iterator
from pathlib import Path
from openai import OpenAI

from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate

from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document as LCDocument

from langchain_openai import ChatOpenAI
from langchain_community.llms import VLLMOpenAI
from langchain_milvus import Milvus
from langchain_text_splitters import RecursiveCharacterTextSplitter

from docling.document_converter import DocumentConverter


In [3]:
import os

from dotenv import load_dotenv

# Load settings from a .env file into the process environment before the
# os.getenv() lookups further down. The value displayed below the cell is
# load_dotenv()'s return value.
load_dotenv()

False

In [4]:
# Inference parameters: maximum number of tokens to generate and the
# sampling temperature.
MAX_TOKENS = 2048
TEMPERATURE = 0.0

### Milvus connection info

Defaults to a local database file (`./milvus_local.db`) unless the `MILVUS_*` environment variables are set.

In [5]:
# Milvus connection settings. Each value can be overridden through the
# environment variable of the same name; otherwise the default shown is used.
MILVUS_URI = os.environ.get("MILVUS_URI", "./milvus_local.db")
MILVUS_USERNAME = os.environ.get("MILVUS_USERNAME", "")
MILVUS_PASSWORD = os.environ.get("MILVUS_PASSWORD", "")
MILVUS_COLLECTION = os.environ.get("MILVUS_COLLECTION", "my_org_documents")

## Sanity check model

In [6]:
def create_llm(testing_config):
    """Build the LangChain LLM client described by one ``testing_config`` entry.

    Args:
        testing_config: Mapping with the settings of one model under test.
            Keys read here: ``model_type`` (optional; the value ``"openai"``
            selects ``ChatOpenAI``), ``api_key``, ``model_name`` and, for the
            vLLM path, ``endpoint_url``.

    Returns:
        A ``ChatOpenAI`` instance when ``model_type`` is ``"openai"``,
        otherwise a ``VLLMOpenAI`` instance pointed at ``endpoint_url``.

    Raises:
        KeyError: If a required key is missing from ``testing_config``.
    """
    # Remove every whitespace character from the key (for example a trailing
    # newline picked up from the YAML file).
    api_key = re.sub(r"\s+", "", testing_config["api_key"])

    if testing_config.get("model_type") == "openai":
        print("Creating OpenAI model")
        # NOTE(review): no temperature/max_tokens are passed on this branch, so
        # TEMPERATURE and MAX_TOKENS do not apply here - confirm that is intended.
        return ChatOpenAI(
            openai_api_key=api_key,
            model=testing_config["model_name"],
            streaming=False,
        )

    print("Creating VLLM model")
    return VLLMOpenAI(
        openai_api_key=api_key,
        openai_api_base=testing_config["endpoint_url"],  # https://model...com/v1
        model_name=testing_config["model_name"],
        # Use the notebook-level inference parameters instead of repeating the
        # literal values, so there is a single place to tune them.
        temperature=TEMPERATURE,
        max_tokens=MAX_TOKENS,
        streaming=False,
    )

def qna_request(llm, template_str, question, num_retries=1):
    """Ask ``llm`` a question directly (no retrieval) and return its answer.

    Args:
        llm: LangChain runnable used to generate the answer.
        template_str: Prompt template text; it is filled with a ``question``
            variable.
        question: The question to send.
        num_retries: Total number of attempts. Failed attempts are followed by
            a 5 second pause before the next one. Defaults to 1.

    Returns:
        The answer with surrounding whitespace stripped, or ``""`` when every
        attempt failed (or ``num_retries`` is not positive).
    """
    for attempt in range(num_retries):
        try:
            prompt = PromptTemplate.from_template(template_str)
            chain = prompt | llm | StrOutputParser()
            answer = chain.invoke({"question": question})
            print(answer)
            return answer.strip()
        except Exception as e:
            # Best effort: report the failure and fall back to an empty answer
            # so a single bad request does not abort a whole evaluation run.
            print(f"Request failed: {e}")
            if attempt + 1 < num_retries:
                print("Retrying in 5 seconds...")
                time.sleep(5)
    return ""


In [7]:
# Load the evaluation configuration; the first testing_config entry is the
# model used for this sanity check.
with open("llm_config.yaml", "r") as f:
    llm_config = yaml.safe_load(f)

llm = create_llm(llm_config["testing_config"][0])
template_str = llm_config["testing_config"][0]["qna_template"]

# Ask a single question without retrieval; the returned answer is displayed as
# the cell output (qna_request also prints the raw answer).
question = " What is the State Share for the Gateway Tunnel Project?"
qna_request(llm, template_str, question)

Creating VLLM model
The State Share for the Gateway Tunnel Project is 1.3 billion dollars. This information is accurate as of the time of the response. The Gateway Tunnel Project is a significant infrastructure initiative in the United States, and the State Share represents the financial commitment of one of the project's partners, which is New York State.


"The State Share for the Gateway Tunnel Project is 1.3 billion dollars. This information is accurate as of the time of the response. The Gateway Tunnel Project is a significant infrastructure initiative in the United States, and the State Share represents the financial commitment of one of the project's partners, which is New York State."

## Creating a Milvus DB with documents

## Initial index creation and document ingestion

#### Load pdfs

In [8]:
class DoclingPDFLoader(BaseLoader):
    """LangChain loader that converts PDFs to markdown with Docling.

    One ``LCDocument`` is produced per input file; its ``page_content`` is the
    markdown export of the converted document.
    """

    def __init__(self, file_path: str | list[str]) -> None:
        """Store the path(s) to convert and create the Docling converter.

        Args:
            file_path: A single path or a list of paths.
        """
        if isinstance(file_path, list):
            self._file_paths = file_path
        else:
            self._file_paths = [file_path]
        self._converter = DocumentConverter()

    def lazy_load(self) -> Iterator[LCDocument]:
        """Convert the stored files one at a time, yielding a document for each."""
        for pdf_path in self._file_paths:
            conversion = self._converter.convert(pdf_path)
            markdown = conversion.document.export_to_markdown()
            yield LCDocument(page_content=markdown)

In [9]:
# Recursively collect every PDF below the source folder. The order in which
# rglob() yields entries depends on the filesystem, so the paths are sorted to
# keep the ingestion order reproducible between runs.
pdf_folder_path = "./source_docs"
file_paths = sorted(str(path) for path in Path(pdf_folder_path).rglob('*.pdf'))
file_paths

['source_docs/ny_budget/fy25cp-en.pdf',
 'source_docs/policies/telecommuting/its-p10-003-its-telecommuting-policy.pdf']

In [10]:
# One loader for all discovered PDFs; the files are only converted when the
# loader is iterated (see lazy_load).
loader = DoclingPDFLoader(file_path=file_paths)

#### Split documents into chunks with some overlap

In [11]:
# Chunking parameters: chunks of size 1024 with an overlap of 100 between
# neighbouring chunks (presumably measured in characters - TODO confirm).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=100,
)

# Convert the PDFs into documents, then cut them into chunks.
docs = loader.load()
all_splits = text_splitter.split_documents(docs)
# Display the first chunk as a quick sanity check.
all_splits[0]

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

[91m*ERR* --- *ERR*[0m
[91m*ERR* Table is not square! *ERR*[0m
[93m*Padding to square...*[0m
[91m*ERR* --- *ERR*[0m
[91m*ERR* Table is not square! *ERR*[0m
[93m*Padding to square...*[0m
[91m*ERR* --- *ERR*[0m
[91m*ERR* Table is not square! *ERR*[0m
[93m*Padding to square...*[0m
ERR#: COULD NOT CONVERT TO RS THIS TABLE TO COMPUTE SPANS
[91m*ERR* --- *ERR*[0m
[91m*ERR* Table is not square! *ERR*[0m
[93m*Padding to square...*[0m


Document(metadata={}, page_content='<!-- image -->\n\n<!-- image -->')

#### Create the index and ingest the documents

In [12]:
# Pick the device for the embedding model: CUDA if available, otherwise
# Apple MPS if available, otherwise the CPU.
device = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

# NOTE(review): trust_remote_code=True presumably allows code shipped with the
# model repository to run locally - confirm the model source is trusted.
model_kwargs = {"trust_remote_code": True, "device": device}
embeddings = HuggingFaceEmbeddings(
    model_name="nomic-ai/nomic-embed-text-v1.5",
    model_kwargs=model_kwargs,
    show_progress=True
)

# Vector store backed by the Milvus instance configured in MILVUS_*.
# NOTE(review): drop_old=True presumably drops an existing collection with the
# same name, so re-running this cell would discard previously ingested
# documents - confirm that is intended.
db = Milvus(
    embedding_function=embeddings,
    connection_args={
        "uri": MILVUS_URI,
        "user": MILVUS_USERNAME,
        "password": MILVUS_PASSWORD
    },
    collection_name=MILVUS_COLLECTION,
    auto_id=True,
    drop_old=True
)


  embeddings = HuggingFaceEmbeddings(
<All keys matched successfully>


In [13]:
# Add every chunk to the vector store and report how many entries came back.
loaded = db.add_documents(all_splits)
print(f"{len(loaded)} documents loaded.")

Batches:   0%|          | 0/120 [00:00<?, ?it/s]

3829 documents loaded.


#### Test vector DB search

In [14]:
# Run a similarity search for a sample question; the result is iterated as
# (document, score) pairs in the next cell.
query = "What percentage of existing State-related debt is projected to be retired in 15 years?"
docs_with_score = db.similarity_search_with_score(query)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [15]:
# Print every hit with its score, framed by separator lines for readability.
for doc, score in docs_with_score:
    print("-" * 80)
    print("Score: ", score)
    print(doc.page_content)
    print("-" * 80)

--------------------------------------------------------------------------------
Score:  167.80189514160156
## $^{ }$State Debt as a Percent of Personal Income

The State debt projections from FY 2025 to FY 2029 reflect a 9.8 percent average annual increase in debt levels and 4.1 percent average annual increase in statewide personal income. As a result, debt as a percentage of personal income is expected to increase over the five-year Plan period, from 3.9 percent in FY 2025 to 4.8 percent in FY 2029.

Over a longer time horizon, State-related debt outstanding as a percentage of personal income is projected to drop from 5.0 percent in FY 2015 to 4.8 percent in FY 2029. This decline can be largely attributed to the retirement of State debt, which has accelerated over the past three years due to early retirement (defeasance) of debt due to prepayments. Over the last ten-years debt has remained virtually the same at $54.3 billion and personal income grew at 4.0 percent annually, which has

#### Test out RAG request

In [16]:
# Similarity retriever over the vector store, configured with k=4 results per
# query. Used as a module-level global by rag_request below.
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})

def format_docs(docs):
    """Concatenate the text of the given documents into one string.

    The ``page_content`` of each document is kept in order, with a blank line
    between consecutive documents.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

def rag_request(llm, template_str, question, num_retries=1):
    """Answer ``question`` with retrieval-augmented generation.

    The module-level ``retriever`` fetches context for the question, the
    retrieved documents are joined by ``format_docs``, and the prompt built
    from ``template_str`` is filled with ``context`` and ``question`` before
    being sent to ``llm``.

    Args:
        llm: LangChain runnable used to generate the answer.
        template_str: Prompt template text; it is filled with ``context`` and
            ``question`` variables.
        question: The question to answer.
        num_retries: Total number of attempts. Failed attempts are followed by
            a 5 second pause before the next one. Defaults to 1.

    Returns:
        The answer with surrounding whitespace stripped, or ``""`` when every
        attempt failed (or ``num_retries`` is not positive).
    """
    for attempt in range(num_retries):
        try:
            prompt = PromptTemplate.from_template(template_str)
            rag_chain = (
                    {"context": retriever | format_docs, "question": RunnablePassthrough()}
                    | prompt
                    | llm
                    | StrOutputParser()
            )
            response = rag_chain.invoke(question)
            return response.strip()
        except Exception as e:
            # Best effort: report the failure and fall back to an empty answer
            # so a single bad request does not abort a whole evaluation run.
            print(f"Request failed: {e}")
            # Only wait when another attempt will follow. The previous check
            # (attempt < num_retries) was always true, so the last failure
            # slept and then returned None instead of "".
            if attempt + 1 < num_retries:
                print("Retrying in 5 seconds...")
                time.sleep(5)
    return ""

In [17]:
# Same model as in the earlier sanity check, now using its RAG prompt template.
llm = create_llm(llm_config["testing_config"][0])
template_str = llm_config["testing_config"][0]["rag_template"]

# Ask one question through the retrieval chain and display the answer.
question = "What percentage of existing State-related debt is projected to be retired in 15 years?"
result = rag_request(llm, template_str, question)
result

Creating VLLM model


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

'The percentage of existing State-related debt projected to be retired in 15 years is 59%. This information can be found in the table provided in the context, which shows the cumulative percentage of existing debt to be retired for different periods. In this case, 59% of the existing debt is projected to be retired in 15 years.'

## Generate Answers

### Use qna.yaml to create some questions and ground truth answers

In [18]:
# Reload the configuration so this section can be run on its own.
with open("llm_config.yaml", "r") as f:
    llm_config = yaml.safe_load(f)

directory = "./source_docs"
# Every artifact of the evaluation is written below this directory.
output_directory = llm_config.get("name", "output")
os.makedirs(output_directory, exist_ok=True)

qna_list = []

# Sorted so the question order (and therefore the Q1..Qn numbering in the
# final report) is reproducible when several qna.yaml files exist.
for file_path in sorted(Path(directory).rglob('qna.yaml')):
    print(file_path)
    # Defensive: only process files named exactly qna.yaml.
    if file_path.name != 'qna.yaml':
        continue
    with open(file_path) as file:
        # safe_load, like the other YAML reads in this notebook: the file only
        # needs plain data, not arbitrary Python object construction.
        qna = yaml.safe_load(file)
    # Flatten seed_examples -> questions_and_answers into one record per question.
    for seed_example in qna["seed_examples"]:
        for questions_and_answers in seed_example["questions_and_answers"]:
            qna_list.append(
                {
                    "question": questions_and_answers["question"].strip(),
                    "ground_truth": questions_and_answers["answer"].strip()
                }
            )

qna_df = pd.DataFrame(qna_list)
qna_df.to_json(f"{output_directory}/qna.jsonl", orient="records", lines=True)


source_docs/ny_budget/qna.yaml


## Get responses from each of the available models with and without RAG

In [19]:
import yaml

def replace_special_char(original_str):
    """Return ``original_str`` with every non-word character replaced by ``_``.

    Used to turn configuration and model names into strings that are safe to
    embed in file names.
    """
    non_word_char = re.compile(r"[^\w]")
    return non_word_char.sub("_", original_str)

# Reload the configuration so this section can be run on its own.
with open("llm_config.yaml", "r") as f:
    llm_config = yaml.safe_load(f)

output_directory = llm_config.get("name", "output")
os.makedirs(output_directory, exist_ok=True)

qna_df = pd.read_json(f"{output_directory}/qna.jsonl", orient="records", lines=True)

# For every configured model, answer each question without retrieval and/or
# with RAG, depending on which prompt templates the configuration provides.
for testing_config in llm_config["testing_config"]:
    answers = qna_df.copy()
    answers["answer"] = ""
    answers["rag_answer"] = ""
    llm = create_llm(testing_config)
    for index, row in answers.iterrows():
        question = row["question"]
        print(index, question)
        if testing_config.get("qna_template"):
            # `or ""` keeps the slicing below safe even if the request helper
            # hands back None after a failed request.
            answer = qna_request(llm, testing_config.get("qna_template"), question) or ""
            print("QnA Answer: " + answer[:40])
            answers.at[index, "answer"] = answer
        if testing_config.get("rag_template"):
            answer = rag_request(llm, testing_config.get("rag_template"), question) or ""
            print("RAG Answer: " + answer[:40])
            answers.at[index, "rag_answer"] = answer
    # Prefer the configured "name" and fall back to "model_name". The previous
    # subscript `["name" or "model_name"]` always evaluated to ["name"], so the
    # fallback never happened and a missing name raised KeyError.
    testing_config_name = replace_special_char(
        testing_config.get("name") or testing_config["model_name"]
    )
    answers.to_json(f"{output_directory}/{testing_config_name}_answers.jsonl", orient="records", lines=True)

Creating VLLM model
0 Are some of the State's transformative infrastructure projects financed outside of the State budget?
Yes, some of the State's transformative infrastructure projects are financed outside of the State budget. This means that these projects receive funding from sources other than the State's regular budget. This can include funding from federal grants, public-private partnerships, or other external sources. By leveraging external funding, the State can potentially offset the cost of these large-scale projects, make them more financially viable, and avoid increasing taxes for residents.
QnA Answer: Yes, some of the State's transformative 


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: Yes, some of the State's transformative 
1 What is the State Share for the Gateway Tunnel Project?
The State Share for the Gateway Tunnel Project is 1.3 billion dollars. This information is accurate as of the time of the response. The Gateway Tunnel Project is a significant infrastructure initiative in the United States, involving the construction of a new two-track tunnel under the Hudson River between New York and New Jersey. The project has a total estimated cost of 16.0 billion dollars, with the State Share being the portion that the State of New Jersey is responsible for financing.
QnA Answer: The State Share for the Gateway Tunnel P


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: The State Share for the Gateway Tunnel P
2 Who are the funding partners for the State's transformative infrastructure projects?
The funding partners for the State's transformative infrastructure projects are the State, local governments, the Federal government, public authorities, and private entities. This means that the State, along with various levels of government and private organizations, contribute financially to these significant projects. The specific allocation of funds and the roles of each partner can vary depending on the project, but this general structure is common in large-scale infrastructure development.
QnA Answer: The funding partners for the State's tra


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: The funding partners for the State's tra
3 What percent of State capital spending in FY 2025 is expected to be supported from Federal aid?
The answer is 20 percent. This means that 20 percent of the State's capital spending in FY 2025 is expected to be funded by Federal aid. This information can be useful for understanding the role of Federal funding in the State's capital program and for making informed decisions about resource allocation.
QnA Answer: The answer is 20 percent. This means tha


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: 19 percent of State capital spending in 
4 Is capital spending expected to increase in FY 2025?
Yes, capital spending is indeed expected to rise in FY 2025. This projection is based on the information provided in the question, which suggests that there is a positive outlook for capital spending in the fiscal year. However, it's important to note that the exact percentage or amount of the increase may not be specified in the response. Nonetheless, the fact that capital spending is anticipated to grow is a clear indication of a positive economic development.
QnA Answer: Yes, capital spending is indeed expected


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: Yes, capital spending is expected to inc
5 Why is State capital spending projected to increase in FY 2025?
State capital spending is projected to increase in FY 2025 due to increased investments in economic development. This is because the state aims to create jobs, strengthen the economy, and generate economic opportunities by investing in various sectors such as infrastructure, housing, and other key areas.
QnA Answer: State capital spending is projected to i


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: State capital spending is projected to i
6 What is capital spending for the State projected in FY 2025?
The capital spending for the State projected in FY 2025 is $17.7 billion.
QnA Answer: The capital spending for the State proje


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: Capital spending for the State is projec
7 What are new large scale projects in the Five-Year DOT capital plan?
The Five-Year DOT Capital Plan includes several new large-scale projects. One of them is the Interborough Express, which is a proposed 10-mile light rail system that will connect Brooklyn, Queens, and the Bronx. This project aims to provide a more efficient and eco-friendly transportation option for the residents of these boroughs. Another significant project is the expansion of the Second Avenue Subway, which will include the addition of a new westward expansion and four new fully ADA compliant stations. This expansion will improve accessibility and provide better connectivity to the Hudson River Line. Additionally, the plan includes the Hunts Point Interstate Access Improvement Project, which will modernize the Hunts Point Interstate Access and improve the efficiency of the truck terminal. These are just a few examples of the new large-scale projects in the Five

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: The new large scale projects in the Five
8 How much does the DOT Capital plan include for the "Bridge NY" program?
The DOT Capital Plan includes $1 billion for the "Bridge NY" program. This program focuses on the reconstruction, replacement, or repair of bridges across the State of New York, aiming to improve the overall condition and safety of these structures. The $1 billion allocation is part of the larger DOT Capital Plan, which consists of funding for various transportation infrastructure projects across the state.
QnA Answer: The DOT Capital Plan includes $1 billion


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: The DOT Capital Plan includes $1 billion
9 What percentage of existing State-releated debt is projected to be retired in 15 years?
The answer is 14%. This means that, based on current projections, approximately 14% of the existing State-related debt is expected to be retired within 15 years. This percentage is a measure of the State's debt repayment capacity and its ability to manage and reduce its debt over time.
QnA Answer: The answer is 14%. This means that, base


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: The percentage of existing State-related
10 What does 100% State-releated debt retirement represent?
100% State-related debt retirement refers to the complete repayment of all debts that are directly linked to the financial operations and responsibilities of a state or sovereign entity. This includes debts issued by the state itself, as well as debts issued by public authorities or agencies on behalf of the state. The term "retirement" implies that the debt obligation has been discharged, and the state has no further commitment to make payments on that debt. It is important to note that while the debt is retired, the state may still be liable for any inter-governmental loans or guarantees associated with the debt.
QnA Answer: 100% State-related debt retirement refer


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: 100% State-related debt retirement repre
11 What impact does the rate of debt retirement have on the State?
The rate of debt retirement has a significant impact on the State. When debt is retired at a faster rate, the State can reduce the amount of interest it pays over time. This is because the principal amount of debt decreases, leading to lower interest payments. Additionally, retiring debt more quickly can help the State free up capital for other purposes, as the debt service burden is alleviated. However, it's important to balance the benefits of faster debt retirement with the State's financial resources and priorities, as retiring debt too quickly may hinder the State's ability to invest in critical infrastructure and other essential services.
QnA Answer: The rate of debt retirement has a signif


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: The rate at which State-related debt is 
12 How much will be appropriated on Community Services Programs for people with development disabilities in FY 2026?
The amount appropriated for Community Services Programs for people with development disabilities in FY 2026 is 74 million dollars.
QnA Answer: The amount appropriated for Community Se


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: The answer cannot be determined from the
13 What are the projected appropriations from FY 2025 to FY 2027 for Non-Bondable Projects for people with developmental disabilities?
The projected appropriations for Non-Bondable Projects for people with developmental disabilities from FY 2025 to FY 2027 are as follows:

* FY 2025: 1,000 thousand dollars
* FY 2026: 1,000 thousand dollars
* FY 2027: 1,000 thousand dollars

These figures remain constant for each fiscal year, indicating a consistent annual appropriation for Non-Bondable Projects.
QnA Answer: The projected appropriations for Non-Bon


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: The projected appropriations for Non-Bon
14 What is the total projected appropriation on programs for people with development disabilities in FY 2026?
The total projected appropriation on programs for people with development disabilities in FY 2026 is 564,692 thousand dollars. This amount is the sum of various programs' funding for that fiscal year, as detailed in the provided document.
QnA Answer: The total projected appropriation on pro


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: The total projected appropriation on pro
Creating VLLM model
0 Are some of the State's transformative infrastructure projects financed outside of the State budget?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: Yes, some of the State's transformative 
1 What is the State Share for the Gateway Tunnel Project?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: The State Share for the Gateway Tunnel P
2 Who are the funding partners for the State's transformative infrastructure projects?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: The funding partners for the State's tra
3 What percent of State capital spending in FY 2025 is expected to be supported from Federal aid?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: The answer is 19 percent. This informati
4 Is capital spending expected to increase in FY 2025?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: Yes, capital spending is expected to inc
5 Why is State capital spending projected to increase in FY 2025?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: State capital spending is projected to i
6 What is capital spending for the State projected in FY 2025?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: Capital spending for the State is projec
7 What are new large scale projects in the Five-Year DOT capital plan?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: The new large-scale projects in the Five
8 How much does the DOT Capital plan include for the "Bridge NY" program?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: The DOT Capital plan includes $1 billion
9 What percentage of existing State-releated debt is projected to be retired in 15 years?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: 59% of existing State-related debt is pr
10 What does 100% State-releated debt retirement represent?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: 100% State-related debt retirement repre
11 What impact does the rate of debt retirement have on the State?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: The rate at which State-related debt is 
12 How much will be appropriated on Community Services Programs for people with development disabilities in FY 2026?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: The context provided does not include sp
13 What are the projected appropriations from FY 2025 to FY 2027 for Non-Bondable Projects for people with developmental disabilities?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: The projected appropriations from FY 202
14 What is the total projected appropriation on programs for people with development disabilities in FY 2026?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: The total projected appropriation on pro
Creating OpenAI model
0 Are some of the State's transformative infrastructure projects financed outside of the State budget?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: Yes, some of the State's transformative 
1 What is the State Share for the Gateway Tunnel Project?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: The State Share for the Gateway Tunnel P
2 Who are the funding partners for the State's transformative infrastructure projects?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: The funding partners for the State's tra
3 What percent of State capital spending in FY 2025 is expected to be supported from Federal aid?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: 19 percent of State capital spending in 
4 Is capital spending expected to increase in FY 2025?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: Yes, capital spending in FY 2025 is proj
5 Why is State capital spending projected to increase in FY 2025?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: State capital spending is projected to i
6 What is capital spending for the State projected in FY 2025?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: Capital spending for the State is projec
7 What are new large scale projects in the Five-Year DOT capital plan?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: The new large-scale projects in the Five
8 How much does the DOT Capital plan include for the "Bridge NY" program?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: The DOT Capital Plan includes $1 billion
9 What percentage of existing State-releated debt is projected to be retired in 15 years?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: 59%
10 What does 100% State-releated debt retirement represent?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: 100% State-related debt retirement repre
11 What impact does the rate of debt retirement have on the State?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: The rate at which debt is retired or pai
12 How much will be appropriated on Community Services Programs for people with development disabilities in FY 2026?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: The text doesn't provide specific inform
13 What are the projected appropriations from FY 2025 to FY 2027 for Non-Bondable Projects for people with developmental disabilities?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: I'm sorry, but I can't provide the infor
14 What is the total projected appropriation on programs for people with development disabilities in FY 2026?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

RAG Answer: The document does not provide specific f


## Grade responses using Judge Model

In [20]:
import yaml

# Reload the configuration so the grading section can be run on its own.
with open("llm_config.yaml", "r") as f:
    llm_config = yaml.safe_load(f)

# Directory holding the answer files written earlier; created if missing.
output_directory = llm_config.get("name", "output")
os.makedirs(output_directory, exist_ok=True)

In [21]:
from langchain.prompts import PromptTemplate

# Default grading prompt for the judge model. It takes the variables
# {question}, {answer} and {reference_answer} and asks for a JSON object with
# the keys 'reasoning' and 'answer_quality' (a score from 1 to 5).
scoring_template_str = """You are an evaluation system tasked with assessing the answer quality of a AI generated response in relation to the posed question and reference answer. Assess if the response is correct, accurate, and factual based on the reference answer. Evaluate the answer_quality as:
    - Score 1: The response is completely incorrect, inaccurate, and/or not factual.
    - Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
    - Score 3: The response is somewhat correct, accurate, and/or factual.
    - Score 4: The response is mostly correct, accurate, and factual.
    - Score 5: The response is completely correct, accurate, and factual.
    Here is the question: \n ------- \n {question} \n -------
    Here is model answer: \n ------- \n {answer} \n -------
    Here is the reference answer(may be very short and lack details or indirect, long and extractive):  \n ------- \n {reference_answer} \n ------- \n
    Assess the quality of model answer with respect to the Reference Answer, but do not penalize the model answer for adding details or give a direct answer to user question. Provide the quality level as a JSON object with two keys: 'reasoning' and 'answer_quality'.
    """

# A "template" entry under "judge" in the configuration overrides the default prompt.
scoring_template_str = llm_config["judge"].get("template", scoring_template_str)
SCORING_PROMPT = PromptTemplate.from_template(scoring_template_str)

In [22]:
# Client and model name for the judge, both taken from the "judge" section of
# the configuration.
# NOTE(review): the API key is read from llm_config.yaml - make sure that file
# is not shared or committed with a real key in it.
judge_client = OpenAI(api_key=llm_config["judge"]["api_key"])
judge_model_name = llm_config["judge"]["model_name"]


def _parse_judge_json(response_content):
    """Parse the judge's reply as JSON, tolerating text around the JSON object."""
    try:
        return json.loads(response_content)
    except json.JSONDecodeError:
        # The reply may wrap the object in a markdown code fence or add prose
        # around it; retry on the outermost {...} span only.
        start = response_content.find("{")
        end = response_content.rfind("}")
        if start == -1 or end < start:
            raise
        return json.loads(response_content[start:end + 1])


def score_request(question, answer, reference_answer):
    """Ask the judge model to grade ``answer`` against ``reference_answer``.

    The prompt is built from the module-level ``SCORING_PROMPT`` and sent as a
    single user message through ``judge_client`` to ``judge_model_name``.

    Args:
        question: The question that was asked.
        answer: The model answer to grade.
        reference_answer: The ground-truth answer to grade against.

    Returns:
        A ``(score, reasoning)`` tuple taken from the ``answer_quality`` and
        ``reasoning`` keys of the judge's JSON reply.

    Raises:
        json.JSONDecodeError: If no JSON object can be parsed from the reply.
        KeyError: If the parsed object lacks one of the expected keys.
    """
    messages = [
        {
            "role": "user",
            "content": SCORING_PROMPT.format(
                question=question,
                answer=answer,
                reference_answer=reference_answer
            )
        }
    ]

    completion = judge_client.chat.completions.create(
        model=judge_model_name,
        messages=messages,
        n=1,
        temperature=0.0,
        max_tokens=1024,
    )

    response_content = completion.choices[0].message.content
    result = _parse_judge_json(response_content)
    score = result["answer_quality"]
    reasoning = result["reasoning"]
    return score, reasoning


In [23]:
# The judge name is the same for every model under test, so compute it once.
judge_name = replace_special_char(judge_model_name)

for testing_config in llm_config["testing_config"]:
    # Prefer the configured "name" and fall back to "model_name". The previous
    # subscript `["name" or "model_name"]` always evaluated to ["name"].
    testing_config_name = replace_special_char(
        testing_config.get("name") or testing_config["model_name"]
    )
    answers_filename = f"{output_directory}/{testing_config_name}_answers.jsonl"
    scores = pd.read_json(answers_filename, orient="records", lines=True)
    # Put each score/reasoning column pair directly after the answer column it grades.
    for answer_column in ("answer", "rag_answer"):
        position = scores.columns.get_loc(answer_column)
        scores.insert(position + 1, f"{answer_column}_score", "")
        scores.insert(position + 2, f"{answer_column}_score_reasoning", "")

    for index, row in scores.iterrows():
        question = row["question"]
        reference_answer = row["ground_truth"]
        print(index, question)
        for answer_column in ("answer", "rag_answer"):
            model_answer = row[answer_column]
            # Skip missing or blank answers. A NaN read back from the JSONL
            # file is truthy, so a plain `if model_answer:` would send it to
            # the judge.
            if pd.isna(model_answer) or not str(model_answer).strip():
                continue
            model_answer = str(model_answer)
            score, reasoning = score_request(question, model_answer, reference_answer)
            scores.at[index, f"{answer_column}_score"] = score
            scores.at[index, f"{answer_column}_score_reasoning"] = reasoning
            print(model_answer[:40], score, reasoning[:40])
    scores_filename = f"{output_directory}/{testing_config_name}_{judge_name}_scores"
    scores.to_json(f"{scores_filename}.jsonl", orient="records", lines=True)
    scores.to_csv(f"{scores_filename}.csv", index=False)


0 Are some of the State's transformative infrastructure projects financed outside of the State budget?
Yes, some of the State's transformative  5 The model answer is in line with the ref
Yes, some of the State's transformative  5 The model answer is completely correct, 
1 What is the State Share for the Gateway Tunnel Project?
The State Share for the Gateway Tunnel P 5 The model answer is completely correct, 
The State Share for the Gateway Tunnel P 5 The model answer is completely correct, 
2 Who are the funding partners for the State's transformative infrastructure projects?
The funding partners for the State's tra 5 The model answer is completely correct, 
The funding partners for the State's tra 5 The model answer is completely correct, 
3 What percent of State capital spending in FY 2025 is expected to be supported from Federal aid?
The answer is 20 percent. This means tha 4 The model answer is mostly correct. It c
19 percent of State capital spending in  5 The model answer correc

## Create resulting score report CSV

In [24]:
# Reload the configuration so the report can be rebuilt without re-running
# the generation and grading sections.
with open("llm_config.yaml", "r") as f:
    llm_config = yaml.safe_load(f)

output_directory = llm_config.get("name", "output")
os.makedirs(output_directory, exist_ok=True)

judge_client = OpenAI(api_key=llm_config["judge"]["api_key"])
judge_model_name = llm_config["judge"]["model_name"]
judge_name = replace_special_char(judge_model_name)

summary_output_df = pd.DataFrame()

# One summary column per (model, answer kind) that was actually generated.
for testing_config in llm_config["testing_config"]:
    # Prefer the configured "name" and fall back to "model_name". The previous
    # subscript `["name" or "model_name"]` always evaluated to ["name"].
    testing_config_name = replace_special_char(
        testing_config.get("name") or testing_config["model_name"]
    )
    scores_filename = f"{output_directory}/{testing_config_name}_{judge_name}_scores.jsonl"
    scores = pd.read_json(scores_filename, orient="records", lines=True)
    # Coerce to numbers so a blank score (ungraded answer) becomes NaN rather
    # than turning the whole column non-numeric, which would silently drop it
    # from the averages below.
    if testing_config.get("qna_template"):
        summary_output_df[f"{testing_config_name}_answer_score"] = pd.to_numeric(
            scores["answer_score"], errors="coerce"
        )
    if testing_config.get("rag_template"):
        summary_output_df[f"{testing_config_name}_rag_answer_score"] = pd.to_numeric(
            scores["rag_answer_score"], errors="coerce"
        )

average_row = summary_output_df.mean(axis=0, numeric_only=True)
print(average_row)
# Append the averages as a final row and label the rows Q1..Qn plus "Average".
num_questions = len(summary_output_df)
summary_output_df.loc[num_questions] = average_row
question_indices = [f"Q{i+1}" for i in range(num_questions)]
question_indices.append("Average")
summary_output_df.insert(0, 'question index', question_indices)

summary_filepath = f"{output_directory}/summary_{judge_name}_scores"
summary_output_df.to_csv(f"{summary_filepath}.csv", index=False)

ny_state_answer_score          3.666667
ny_state_rag_answer_score      4.400000
granite_7b_rag_answer_score    4.266667
gpt_4_rag_answer_score         4.200000
dtype: float64


In [25]:
# Bundle the summary and the per-model score tables into a single workbook.
with pd.ExcelWriter(f"{output_directory}/{judge_name}_scores.xlsx") as writer:
    summary_output_df = pd.read_csv(f"{summary_filepath}.csv")
    summary_output_df.to_excel(writer, sheet_name="Summary", index=False)

    for testing_config in llm_config["testing_config"]:
        # Prefer the configured "name" and fall back to "model_name". The
        # previous subscript `["name" or "model_name"]` always evaluated to
        # ["name"], so a missing name raised KeyError.
        testing_config_name = replace_special_char(
            testing_config.get("name") or testing_config["model_name"]
        )
        scores_filename = f"{output_directory}/{testing_config_name}_{judge_name}_scores.jsonl"
        scores = pd.read_json(scores_filename, orient="records", lines=True)
        scores.to_excel(writer, sheet_name=f"{testing_config_name}_scores")