# Evaluating the fine-tuned model

### Needed packages and imports

In [1]:
!pip install -r requirements.txt

Collecting docling[tesserocr]~=2.8.3 (from -r requirements.txt (line 1))
  Obtaining dependency information for docling[tesserocr]~=2.8.3 from https://files.pythonhosted.org/packages/02/e9/8d81e497365224e2ea80ce0b625f1e9339d736a8a7f7c2224c6f56be3131/docling-2.8.3-py3-none-any.whl.metadata
  Downloading docling-2.8.3-py3-none-any.whl.metadata (7.7 kB)
Collecting einops==0.8.0 (from -r requirements.txt (line 2))
  Obtaining dependency information for einops==0.8.0 from https://files.pythonhosted.org/packages/44/5a/f0b9ad6c0a9017e62d4735daaeb11ba3b6c009d69a26141b258cd37b5588/einops-0.8.0-py3-none-any.whl.metadata
  Downloading einops-0.8.0-py3-none-any.whl.metadata (12 kB)
Collecting langchain==0.3.11 (from -r requirements.txt (line 3))
  Obtaining dependency information for langchain==0.3.11 from https://files.pythonhosted.org/packages/ba/4a/26620afcff880f6058756786d9b858d348ac29c815e44f57b6c2c07bf86d/langchain-0.3.11-py3-none-any.whl.metadata
  Downloading langchain-0.3.11-py3-none-any.

### Model inference parameters

Inference parameters (`MAX_TOKENS`, `TEMPERATURE`) for the fine-tuned model under evaluation.

In [2]:
import requests
import os
import yaml
import json
import re
import time
import pandas as pd
import torch

from typing import Iterator
from pathlib import Path
from openai import OpenAI

from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate

from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document as LCDocument

from langchain_openai import ChatOpenAI
from langchain_community.llms import VLLMOpenAI
from langchain_milvus import Milvus
from langchain_text_splitters import RecursiveCharacterTextSplitter

from docling.document_converter import DocumentConverter

def replace_special_char(original_str):
    """Return ``original_str`` with every non-word character replaced by ``_``.

    Letters, digits and underscores are kept as they are; every other
    character (spaces, slashes, dots, dashes, ...) becomes a single underscore.
    """
    non_word_char = re.compile(r"\W")
    return non_word_char.sub("_", original_str)


In [3]:
import os

from dotenv import load_dotenv

# Load settings from a local .env file, if one is present, so that the
# os.getenv lookups in later cells can pick them up.
load_dotenv()

False

In [4]:
# Generation settings intended for the model under test.
MAX_TOKENS = 2048
TEMPERATURE = 0.0

### Milvus connection info

Defaults to a local database file (`./milvus_llm_judge_eval.db`) when the `MILVUS_*` environment variables are not set.

In [5]:
# Milvus connection settings. Each value can be overridden through the
# environment variable of the same name; otherwise the default is used.
MILVUS_URI = os.environ.get("MILVUS_URI", "./milvus_llm_judge_eval.db")
MILVUS_USERNAME = os.environ.get("MILVUS_USERNAME", "")
MILVUS_PASSWORD = os.environ.get("MILVUS_PASSWORD", "")
MILVUS_COLLECTION = os.environ.get("MILVUS_COLLECTION", "my_org_documents")

## Sanity check model

In [6]:
def create_llm(testing_config):
    """Build the LangChain LLM client described by one ``testing_config`` entry.

    Args:
        testing_config: Mapping with the keys ``api_key`` and ``model_name``.
            ``endpoint_url`` is also required unless ``model_type`` is
            ``"openai"``.

    Returns:
        A ``ChatOpenAI`` client when ``model_type`` is ``"openai"``; otherwise
        a ``VLLMOpenAI`` client configured with the notebook-level
        ``TEMPERATURE`` and ``MAX_TOKENS`` settings.

    Raises:
        KeyError: If a required key is missing from ``testing_config``.
    """
    # All whitespace is removed from the key -- presumably to tolerate line
    # breaks or spaces introduced when the key is pasted into the YAML file.
    api_key = re.sub(r"\s+", "", testing_config["api_key"])

    if testing_config.get("model_type") == "openai":
        print("Creating OpenAI model")
        # No sampling parameters are passed here, so the library defaults apply.
        return ChatOpenAI(
            openai_api_key=api_key,
            model=testing_config["model_name"],
            streaming=False,
        )

    print("Creating VLLM model")
    return VLLMOpenAI(
        openai_api_key=api_key,
        openai_api_base=testing_config["endpoint_url"],  # e.g. https://model...com/v1
        model_name=testing_config["model_name"],
        # Use the config-cell constants instead of repeating the literals, so
        # changing them in one place actually changes the requests.
        temperature=TEMPERATURE,
        max_tokens=MAX_TOKENS,
        streaming=False,
    )

def qna_request(llm, template_str, question, num_retries=1):
    """Ask ``llm`` a question directly (no retrieval) and return its answer.

    Args:
        llm: LangChain-compatible model to place in the chain.
        template_str: Prompt template text; it is invoked with a ``question``
            variable.
        question: The question to ask.
        num_retries: Total number of attempts to make. The default of 1 means
            a single attempt with no retry.

    Returns:
        The stripped answer text, or ``""`` if every attempt failed (or if
        ``num_retries`` is not positive). The raw answer is also printed.
    """
    for attempt in range(num_retries):
        try:
            prompt = PromptTemplate.from_template(template_str)
            chain = prompt | llm | StrOutputParser()
            answer = chain.invoke({"question": question})
            print(answer)
            return answer.strip()
        except Exception as e:
            # Best effort: a failed request is reported and yields an empty
            # answer so that a long evaluation run is not aborted.
            print(f"Request failed: {e}")
            # Only wait when another attempt is actually going to happen.
            if attempt + 1 < num_retries:
                print("Retrying in 5 seconds...")
                time.sleep(5)
    # Reached when all attempts failed; always return a string, never None.
    return ""


In [7]:
# Sanity check: send one plain question (no retrieval) to the first model
# listed under `testing_config` in llm_config.yaml.
with open("llm_config.yaml", "r") as f:
    llm_config = yaml.safe_load(f)

sanity_config = llm_config["testing_config"][0]
llm = create_llm(sanity_config)
template_str = sanity_config["qna_template"]

question = "What is Artificial Intelligence?"
qna_request(llm, template_str, question)

Creating VLLM model
Artificial Intelligence (AI) is a multidisciplinary field of study within computer science that focuses on creating intelligent machines capable of performing tasks that would typically require human intelligence. These tasks include learning from experience, understanding natural language, recognizing patterns, solving problems, and making decisions. AI systems can be categorized into two main types: narrow or weak AI, designed to perform a specific task, and general or strong AI, which can perform any intellectual task that a human being can do.


'Artificial Intelligence (AI) is a multidisciplinary field of study within computer science that focuses on creating intelligent machines capable of performing tasks that would typically require human intelligence. These tasks include learning from experience, understanding natural language, recognizing patterns, solving problems, and making decisions. AI systems can be categorized into two main types: narrow or weak AI, designed to perform a specific task, and general or strong AI, which can perform any intellectual task that a human being can do.'

## Creating a Milvus DB with documents

## Initial index creation and document ingestion

#### Load PDFs

In [8]:
class DoclingPDFLoader(BaseLoader):
    """LangChain loader that converts each source file to Markdown with Docling."""

    def __init__(self, file_path: str | list[str]) -> None:
        """Remember the path(s) to convert and create the Docling converter.

        Args:
            file_path: A single path, or a list of paths. A single path is
                wrapped in a one-element list.
        """
        if isinstance(file_path, list):
            self._file_paths = file_path
        else:
            self._file_paths = [file_path]
        self._converter = DocumentConverter()

    def lazy_load(self) -> Iterator[LCDocument]:
        """Yield one document per input path.

        Each document's ``page_content`` is the Markdown export of the
        converted file; no metadata is attached.
        """
        for pdf_path in self._file_paths:
            converted = self._converter.convert(pdf_path)
            markdown = converted.document.export_to_markdown()
            yield LCDocument(page_content=markdown)

In [9]:
pdf_folder_path = "../data_preparation/document_collection"
# Recursively collect every PDF below the collection folder, as plain strings.
file_paths = list(map(str, Path(pdf_folder_path).rglob("*.pdf")))
file_paths

['../data_preparation/document_collection/ny_budget/fy25cp-en.pdf',
 '../data_preparation/document_collection/ny_policies/its-p19-005-open-source/its-p19-005-open-source.pdf',
 '../data_preparation/document_collection/ny_policies/nys-s16-001-new-york-state-universal-web-navigation/nys-s16-001-new-york-state-universal-web-navigation.pdf',
 '../data_preparation/document_collection/ny_policies/its-p10-003-its-telecommuting-policy/its-p10-003-its-telecommuting-policy.pdf',
 '../data_preparation/document_collection/ny_policies/nys-p08-003-domain-names-for-state-government/nys-p08-003-domain-names-for-state-government.pdf',
 '../data_preparation/document_collection/ny_policies/nys-s14-009_mobile_device_security/nys-s14-009_mobile_device_security.pdf',
 '../data_preparation/document_collection/ny_policies/its-p04-005-surplus-and-disposal-of-its-equipment-furniture-and-vehicles/its-p04-005-surplus-and-disposal-of-its-equipment-furniture-and-vehicles.pdf',
 '../data_preparation/document_collect

In [10]:
# Build a loader over every collected PDF. Nothing is converted here; the
# conversion itself happens inside DoclingPDFLoader.lazy_load.
loader = DoclingPDFLoader(file_path=file_paths)

#### Split documents into chunks with some overlap

In [11]:
# Convert every PDF to Markdown. This is the slow step: the cell output shows
# detection/recognition models being downloaded the first time it runs.
docs = loader.load()

# Cut the Markdown into chunks of at most 1024 units (characters, assuming the
# splitter's default length function) with 100 units of overlap between
# neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=100)
splits = text_splitter.split_documents(docs)
splits[0]

  from .autonotebook import tqdm as notebook_tqdm
Fetching 9 files: 100%|██████████| 9/9 [00:05<00:00,  1.55it/s]
Downloading detection model, please wait. This may take several minutes depending upon your network connection.


Progress: |██████████████████████████████████████████████████| 100.0% Complete

Downloading recognition model, please wait. This may take several minutes depending upon your network connection.


Progress: |██████████████████████████████████████████████████| 100.0% Complete



Document(metadata={}, page_content='<!-- image -->\n\n<!-- image -->')

#### Create the index and ingest the documents

In [14]:
# Pick the torch device for the embedding model: CUDA first, then Apple MPS,
# and CPU as the fallback.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# NOTE(review): trust_remote_code=True allows Python code shipped in the model
# repository to run locally (the cell output below warns about the downloaded
# code files). Consider pinning a model revision.
model_kwargs = {"trust_remote_code": True, "device": device}
embeddings = HuggingFaceEmbeddings(
    model_name="nomic-ai/nomic-embed-text-v1.5",
    model_kwargs=model_kwargs,
    show_progress=True
)

# Vector store backed by the Milvus instance configured in the MILVUS_* settings.
# NOTE(review): drop_old=True presumably discards an existing collection with
# the same name every time this cell runs -- confirm before pointing MILVUS_URI
# at a shared server.
db = Milvus(
    embedding_function=embeddings,
    connection_args={
        "uri": MILVUS_URI,
        "user": MILVUS_USERNAME, 
        "password": MILVUS_PASSWORD
    },
    collection_name=MILVUS_COLLECTION,
    auto_id=True,
    drop_old=True
)


  embeddings = HuggingFaceEmbeddings(
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]
A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- configuration_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- modeling_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
<All keys matched successfully>


In [15]:
# Embed and insert every chunk into the collection. The length of the returned
# list (presumably the inserted ids) is reported as the number of chunks loaded.
loaded = db.add_documents(splits)
print(f"{len(loaded)} documents loaded.")

Batches: 100%|██████████| 123/123 [00:55<00:00,  2.22it/s]


3909 documents loaded.


#### Test vector DB search

In [16]:
# Smoke-test retrieval with a question the budget document should answer.
# No `k` is passed, so the vector store's default number of results is used.
query = "Who are the funding partners for the State's transformative infrastructure projects?"
docs_with_score = db.similarity_search_with_score(query)

Batches: 100%|██████████| 1/1 [00:00<00:00, 77.33it/s]


In [17]:
# Print each hit between two separator lines.
# NOTE(review): in the output below the first hit has the smallest score, so
# the score looks like a distance (lower = closer) -- confirm.
separator = "-" * 80
for doc, score in docs_with_score:
    print(separator)
    print("Score: ", score)
    print(doc.page_content)
    print(separator)

--------------------------------------------------------------------------------
Score:  146.87289428710938
-  Invests in projects to improve the State's transit systems, modernize airports, and rebuild infrastructure to upgrade the State's transportation network. Several projects are expected to be financed from multiple funding sources and administered by public authorities (e.g., MTA, PANYNJ) outside of the State budget. Funding partners for these projects include the State, local governments, the Federal government, public authorities, and private entities. The FY 2025 Enacted Budget supports the State share of funding for these projects. Major infrastructure projects underway with funding partners are shown in the table below.
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Score:  208.61204528808594
TRANSFORMATIVE INFRASTRUCTURE PROJECTS (billions of dollars)

|     

#### Test out RAG request

In [18]:
# Similarity retriever over the Milvus store, asking for k=4 chunks per query.
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})

def format_docs(docs):
    """Join the ``page_content`` of every document into one string.

    Consecutive documents are separated by a blank line; an empty input
    yields an empty string.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

def rag_request(llm, template_str, question, num_retries=1):
    """Answer ``question`` with retrieval-augmented generation.

    The module-level ``retriever`` fetches chunks for the question,
    ``format_docs`` joins them into the template's ``context`` variable, and
    the question itself is passed through as the ``question`` variable.

    Args:
        llm: LangChain-compatible model to place in the chain.
        template_str: Prompt template text using ``context`` and ``question``.
        question: The question to ask.
        num_retries: Total number of attempts to make. The default of 1 means
            a single attempt with no retry.

    Returns:
        The stripped response text, or ``""`` if every attempt failed (or if
        ``num_retries`` is not positive).
    """
    for attempt in range(num_retries):
        try:
            prompt = PromptTemplate.from_template(template_str)
            rag_chain = (
                {"context": retriever | format_docs, "question": RunnablePassthrough()}
                | prompt
                | llm
                | StrOutputParser()
            )
            response = rag_chain.invoke(question)
            return response.strip()
        except Exception as e:
            # Best effort: a failed request is reported and yields an empty
            # answer so that a long evaluation run is not aborted.
            print(f"Request failed: {e}")
            # `attempt` is zero-based, so compare `attempt + 1` with the budget.
            # The previous `attempt < num_retries` was always true, which slept
            # after the final failure and then returned None instead of "".
            if attempt + 1 < num_retries:
                print("Retrying in 5 seconds...")
                time.sleep(5)
    # Reached when all attempts failed; always return a string, never None.
    return ""

In [19]:
# Sanity check for the RAG path, again using the first configured model.
rag_config = llm_config["testing_config"][0]
llm = create_llm(rag_config)
template_str = rag_config["rag_template"]

question = "Who are the funding partners for the State's transformative infrastructure projects?"
result = rag_request(llm, template_str, question)
result

Creating VLLM model


Batches: 100%|██████████| 1/1 [00:00<00:00, 78.63it/s]


"The funding partners for the State's transformative infrastructure projects include the State, local governments, the Federal government, public authorities, and private entities. The table in the context provides a breakdown of the total project costs, the State's share, and the share of other funding partners. For example, the Gateway Tunnel Project has a total project cost of $16 billion, with the State's share being $1.3 billion and the share of other funding partners being $14.7 billion."

## Generate Answers

### Use qna.yaml, csv, jsonl to create some questions and ground truth answers

We build a pandas DataFrame with the columns `question` and `ground_truth` from the files found in the ground-truth directory (default: `ground_truth`):

- Create a CSV (or JSONL) file in that directory with the columns `question` and `ground_truth`.

- `qna.yaml` files can be taken as written from `data_preparation`; their question/answer pairs are converted to the same two columns.

In [31]:
# Build the ground-truth question set from every CSV, JSONL and qna YAML file
# under `directory`, de-duplicate it by question, and save it as JSONL.
with open("llm_config.yaml", "r") as f:
    llm_config = yaml.safe_load(f)

directory = "ground_truth"
output_directory = llm_config.get("name", "output")
os.makedirs(output_directory, exist_ok=True)

required_columns = ["question", "ground_truth"]
# Collect one frame per file and concatenate once at the end instead of
# re-copying a growing DataFrame inside the loops.
ground_truth_frames = []

# Files are visited in sorted order so that, when the same question appears in
# several files, drop_duplicates keeps the same row on every run.
for file_path in sorted(Path(directory).rglob('*.csv')):
    csv_df = pd.read_csv(file_path)
    print(f"{file_path}: {csv_df.shape[0]} questions")
    ground_truth_frames.append(csv_df)

for file_path in sorted(Path(directory).rglob('*.jsonl')):
    jsonl_df = pd.read_json(file_path, orient="records", lines=True)
    print(f"{file_path}: {jsonl_df.shape[0]} questions")
    ground_truth_frames.append(jsonl_df)

qna_list = []

for file_path in sorted(Path(directory).rglob('*.yaml')):
    with open(file_path) as file:
        # safe_load, as used for llm_config.yaml: these files hold plain data.
        qna = yaml.safe_load(file)
    questions_before = len(qna_list)
    for seed_example in qna["seed_examples"]:
        for questions_and_answers in seed_example["questions_and_answers"]:
            qna_list.append({
                "question": questions_and_answers["question"].strip(),
                "ground_truth": questions_and_answers["answer"].strip()
            })
    # Report this file's own count, not the running total over all YAML files.
    print(f"{file_path}: {len(qna_list) - questions_before} questions")

if qna_list:
    ground_truth_frames.append(pd.DataFrame(qna_list))

# Skip empty frames so they cannot influence the resulting dtypes.
non_empty_frames = [frame for frame in ground_truth_frames if not frame.empty]
if non_empty_frames:
    ground_truth_df = pd.concat(non_empty_frames, ignore_index=True)
else:
    ground_truth_df = pd.DataFrame(columns=required_columns)

# Guarantee that the two required columns exist and come first; any extra
# columns found in the input files are kept after them.
extra_columns = [col for col in ground_truth_df.columns if col not in required_columns]
ground_truth_df = ground_truth_df.reindex(columns=required_columns + extra_columns)

ground_truth_df = ground_truth_df.drop_duplicates(subset=["question"])
print(f"{ground_truth_df.shape[0]} total unique questions")

ground_truth_df.to_json(f"{output_directory}/ground_truth.jsonl", orient="records", lines=True)
ground_truth_df.head()

ground_truth/qna.csv: 11 questions
ground_truth/qna.jsonl: 13 questions
ground_truth/qna.yaml: 15 questions
39 total unique questions


Unnamed: 0,question,ground_truth
0,What does an employee need to provide in an ap...,An employee needs to provide a request in writ...
1,Who is on the Telecommuting Appeal Board?,The telecommuting appeal board consists of a r...
2,How long do employees have to wait until they ...,Employees who have been removed from the Telec...
3,Should SE websites have sub-domain addresses?,SE websites should use short URLs over a separ...
4,When should SE domain names use acronyms?,SE websites should avoid the use of acronyms u...


## Get responses from each of the available models with and without RAG

In [32]:
import yaml


# For every configured model, answer each ground-truth question without and
# with retrieval, and write one "<config name>_answers.jsonl" file per model.
with open("llm_config.yaml", "r") as f:
    llm_config = yaml.safe_load(f)

output_directory = llm_config.get("name", "output")
os.makedirs(output_directory, exist_ok=True)

qna_df = pd.read_json(f"{output_directory}/ground_truth.jsonl", orient="records", lines=True)

for testing_config in llm_config["testing_config"]:
    answers = qna_df.copy()
    answers["answer"] = ""
    answers["rag_answer"] = ""
    llm = create_llm(testing_config)
    # A missing or empty template disables that mode for this model.
    qna_template = testing_config.get("qna_template")
    rag_template = testing_config.get("rag_template")
    for index, row in answers.iterrows():
        question = row["question"]
        print(index, question[:40])
        if qna_template:
            answer = qna_request(llm, qna_template, question)
            if answer:
                print("QnA Answer: " + answer[:40])
            # Store "" rather than None when a request produced nothing.
            answers.at[index, "answer"] = answer or ""
        if rag_template:
            answer = rag_request(llm, rag_template, question)
            if answer:
                print("RAG Answer: " + answer[:40])
            answers.at[index, "rag_answer"] = answer or ""
    # Prefer the config's "name" and fall back to "model_name". The previous
    # `testing_config["name" or "model_name"]` always looked up "name", because
    # `"name" or "model_name"` evaluates to "name".
    testing_config_name = replace_special_char(
        testing_config.get("name") or testing_config["model_name"]
    )
    answers.to_json(f"{output_directory}/{testing_config_name}_answers.jsonl", orient="records", lines=True)

Creating VLLM model
0 What does an employee need to provide in
In an appeal to the Telecommuting Appeal Board, an employee typically needs to provide a written request with reasons for disagreement with the determination made by the Telecommuting Program Coordinator. This request should include any new information or evidence that supports the employee's case. The employee may also have the opportunity to present their case in person during a hearing before the Appeal Board. It's important to note that the specific requirements for an appeal may vary depending on the organization, so it's always a good idea to check the relevant policies and guidelines for the most accurate information.
QnA Answer: In an appeal to the Telecommuting Appeal


Batches: 100%|██████████| 1/1 [00:00<00:00, 79.41it/s]


RAG Answer: In an appeal to the Telecommuting Appeal
1 Who is on the Telecommuting Appeal Board
The Telecommuting Appeal Board typically consists of three members. These members are selected from various departments within the organization to ensure a fair and unbiased review of telecommuting appeals. The specific departments and roles of the members can vary depending on the organization, but they are chosen to bring a diverse set of perspectives and expertise to the decision-making process.
QnA Answer: The Telecommuting Appeal Board typically


Batches: 100%|██████████| 1/1 [00:00<00:00, 79.77it/s]


RAG Answer: The Telecommuting Appeal Board consists 
2 How long do employees have to wait until
Employees must wait six months before submitting a new application for the telecommuting program. This waiting period allows them to demonstrate their commitment and performance in the current telecommuting arrangement before applying again.
QnA Answer: Employees must wait six months before su


Batches: 100%|██████████| 1/1 [00:00<00:00, 79.62it/s]


RAG Answer: Employees who have had their application
3 Should SE websites have sub-domain addre
Yes, SE (State Entity) websites should have sub-domain addresses. Sub-domain addresses, such as "agency\_name".ny.gov or "program\_name".ny.gov, are highly recommended for SE websites. Using these addresses ensures that the public can easily recognize official state sites and helps maintain brand consistency. Additionally, sub-domain addresses are preferred over using only the ny.gov domain, as they provide more flexibility in managing and customizing the site's appearance and functionality.
QnA Answer: Yes, SE (State Entity) websites should h


Batches: 100%|██████████| 1/1 [00:00<00:00, 83.89it/s]


RAG Answer: Yes, SE websites should use short URLs o
4 When should SE domain names use acronyms
SE (State Entity) domain names should use acronyms in specific cases to optimize user experience and branding. Acronyms can help users quickly recognize the organization's name and reduce cognitive load. However, they should be used judiciously to avoid confusion and ensure accessibility for all users, including those who may not be familiar with the acronym.

Additionally, SEs should consider the following guidelines when using acronyms in domain names:

1. **Consistency:** Use the acronym consistently across all platforms and communications.
2. **Expansion:** Explain the acronym on first use or provide a link to a glossary for users who are unfamiliar with it.
3. **Legibility:** Ensure that the acronym is easily readable and recognizable.
4. **Accessibility:** Avoid using acronyms that may be confusing or difficult to pronounce for users with disabilities.
5. **Branding:** Use acronyms to 

Batches: 100%|██████████| 1/1 [00:00<00:00, 78.78it/s]


RAG Answer: SE domain names should use acronyms only
5 Can SE domain names contail special char
Yes, SE (State Entity) domain names can contain special characters. However, it's important to note that the use of special characters in domain names can affect their functionality and usability. Therefore, it's recommended to consult with a domain name expert or your SE's IT department to ensure that the chosen domain name meets the necessary requirements and guidelines for your specific use case.
QnA Answer: Yes, SE (State Entity) domain names can 


Batches: 100%|██████████| 1/1 [00:00<00:00, 81.42it/s]


RAG Answer: No, SE domain names cannot contain speci
6 Should OSS options be considered first f
No, OSS (Open Source Software) options should not be considered first for all potential ITS software solutions. While OSS can be a valuable option for certain projects due to its cost-effectiveness, flexibility, and community support, it may not always be the best fit for all use cases. Factors such as the complexity of the project, the need for proprietary features, and the availability of skilled support should all be taken into account when evaluating software options.
QnA Answer: No, OSS (Open Source Software) options s


Batches: 100%|██████████| 1/1 [00:00<00:00, 87.58it/s]


RAG Answer: Yes, OSS options should be considered fi
7 What conditions should OSS meet to be us
Open Source Software (OSS) should meet the following conditions to be used as an ITS (Information Technology Services) software solution:

1. **Quality and Maturity**: The OSS should have a proven track record, a stable codebase, and a well-established community that can provide support, maintenance, and updates.

2. **Security**: ITS places a high priority on security. The OSS should have a solid security record, with no known vulnerabilities or security issues that could compromise the system or sensitive data.

3. **Licensing**: The OSS should be licensed in a way that is consistent with ITS policies and allows for the use, modification, and distribution of the software as needed.

4. **Support**: ITS should have access to adequate support and resources to ensure the successful implementation and maintenance of the OSS.

5. **Compliance**: The OSS should comply with all relevant laws, reg

Batches: 100%|██████████| 1/1 [00:00<00:00, 82.55it/s]


RAG Answer: OSS should meet the following four condi
8 Who approves software licenses for ITS?
The ITS Division of Legal Affairs (DLA) is responsible for approving software licenses for ITS. This ensures that the licenses used by ITS comply with all relevant laws and regulations, and that the terms of the license are fair and reasonable for the agency.
QnA Answer: The ITS Division of Legal Affairs (DLA) 


Batches: 100%|██████████| 1/1 [00:00<00:00, 82.67it/s]


RAG Answer: The ITS Division of Legal Affairs (DLA) 
9 What are examples of ITS E-equipment?
Examples of ITS E-equipment include, but are not limited to, desktops, laptops, mobile devices, servers, mobile devices, video conferencing equipment, copiers, printers, and data center equipment. This equipment is owned, leased, or maintained by the State and is used to support ongoing business functions.
QnA Answer: Examples of ITS E-equipment include, but


Batches: 100%|██████████| 1/1 [00:00<00:00, 83.29it/s]


RAG Answer: ITS E-equipment refers to hardware used 
10 What ITS guidelines must be followed whe
The ITS (Information Technology Standards) guidelines that must be followed when disposing of end-of-life E-equipment include the 46 CFR Part 168 and 169. These regulations are established by the United States Coast Guard and apply to the disposal of E-equipment. It is essential to adhere to these guidelines to ensure proper handling and disposal of electronic waste, protecting both the environment and personal data.
QnA Answer: The ITS (Information Technology Standard


Batches: 100%|██████████| 1/1 [00:00<00:00, 84.34it/s]


RAG Answer: The ITS guidelines that must be followed
11 What must be submitted once ITS E-equipm
An ITSM ServiceNow Surplus/LDA Service Request must be submitted once ITS E-equipment is prepared for surplus or disposal.
QnA Answer: An ITSM ServiceNow Surplus/LDA Service R


Batches: 100%|██████████| 1/1 [00:00<00:00, 83.29it/s]


RAG Answer: An ITSM ServiceNow Surplus/LDA Service R
12 What is the purpose of the Use of Artifi
The Use of Artificial Intelligence policy is designed to govern the use of AI systems within the State Engineering (SE) division. The policy aims to ensure that SE leadership approves all AI systems, establishes guidelines for acceptable use, and identifies appropriate oversight. It also emphasizes the importance of transparency, fairness, and accountability in AI system development and deployment. Additionally, the policy highlights the need to address potential biases and discrimination, and to comply with relevant laws, regulations, and policies. Furthermore, it underscores the importance of data management, security, and privacy in AI system use. Overall, the Use of Artificial Intelligence policy seeks to promote responsible and ethical AI use within the SE division.
QnA Answer: The Use of Artificial Intelligence polic


Batches: 100%|██████████| 1/1 [00:00<00:00, 81.51it/s]


RAG Answer: The purpose of the Use of Artificial Int
13 What is the benefit of the Use of Artifi
The benefit of the Use of Artificial Intelligence policy is that it encourages the responsible use of AI technologies, ensuring that they are used ethically and effectively to drive innovation, increase operational efficiencies, and better serve New Yorkers while protecting privacy, managing risk, promoting accountability, safety, and equity, and complying with all applicable laws, regulations, and policies.
QnA Answer: The benefit of the Use of Artificial Int


Batches: 100%|██████████| 1/1 [00:00<00:00, 81.19it/s]


RAG Answer: The benefit of the Use of Artificial Int
14 Are the guidelines on the use of Artific
The guidelines on the use of Artificial Intelligence are not a policy or a standard. They are more accurately described as a set of principles or guiding values. These principles provide a framework for responsible AI use, but they do not have the same legal standing as a policy or standard. A policy typically outlines specific rules and regulations, while a standard refers to a widely accepted set of practices or technical specifications. The guidelines on AI use, however, are more flexible and open-ended, allowing for interpretation and adaptation to various contexts.
QnA Answer: The guidelines on the use of Artificial 


Batches: 100%|██████████| 1/1 [00:00<00:00, 84.67it/s]


RAG Answer: The guidelines on the use of Artificial 
15 Who can I email if I have a question abo
You can email its.sm.dla@its.ny.gov for any questions about ITS policies and standards for technology use. This email address is specifically designated for inquiries related to ITS policies and standards, and it is likely to be staffed by knowledgeable individuals who can provide accurate and helpful information.
QnA Answer: You can email its.sm.dla@its.ny.gov for 


Batches: 100%|██████████| 1/1 [00:00<00:00, 84.58it/s]


RAG Answer: You can email the Chief Data Office at c
16 Where can I find policies, standards, an
The policies, standards, and guidelines for New York State technology policies can be found on the official New York State Office of Information Technology Services (ITS) website. This centralized resource provides access to a wide range of technology policies, ensuring that all stakeholders have access to the most up-to-date and accurate information.
QnA Answer: The policies, standards, and guidelines 


Batches: 100%|██████████| 1/1 [00:00<00:00, 79.70it/s]


RAG Answer: The policies, standards, and guidelines 
17 What is the telephone number to submit i
The telephone number to submit inquiries and requests about ITS policies and standards is 518-473-5115. This number is specifically dedicated to handling questions and requests related to ITS policies and standards, providing a direct line of communication for clarification and assistance.
QnA Answer: The telephone number to submit inquiries


Batches: 100%|██████████| 1/1 [00:00<00:00, 84.15it/s]


RAG Answer: The telephone number to submit inquiries
18 What was the change made to the ITS poli
The change made to the ITS policy on the surplus and disposal of equipment in January 2011 was the addition of a section for the surplus and disposal of E-equipment. This update was made to ensure that the policy covered all types of equipment being surplus or disposed of by ITS.
QnA Answer: The change made to the ITS policy on the


Batches: 100%|██████████| 1/1 [00:00<00:00, 84.81it/s]


RAG Answer: The change made to the ITS policy on the
19 Who have been the reviewers on changes t
The reviewers on changes to the ITS policy for the disposal of equipment are John Doe and Jane Doe.
QnA Answer: The reviewers on changes to the ITS poli


Batches: 100%|██████████| 1/1 [00:00<00:00, 80.19it/s]


RAG Answer: The ITS policy for the disposal of equip
20 When was the policy on the surplus and d
The policy on the surplus and disposal of ITS (New York State Office of Information Technology Services) equipment was first created in 2004. This policy outlines the process for handling ITS equipment that is no longer needed or used, and it is established to ensure that such equipment is disposed of in a secure, environmentally responsible, and cost-effective manner.
QnA Answer: The policy on the surplus and disposal o


Batches: 100%|██████████| 1/1 [00:00<00:00, 82.82it/s]


RAG Answer: The policy on the surplus and disposal o
21 What are documents related to the ITS po
The documents related to the ITS policy on surplus and disposal of equipment are the "Surplus and Disposal of ITS Equipment, Furniture and Vehicles" policy, the "NYS-S13-003 Sanitization/Secure Disposal" standard, the "NYS-P03-002 Information Security Policy," and the "NYS-P14-001 Acceptable Use of Information Technology Resources" policy. These documents provide guidelines and procedures for the surplus and disposal of ITS equipment, furniture, and vehicles, as well as ensure that proper security measures are taken when disposing of sensitive information.
QnA Answer: The documents related to the ITS policy 


Batches: 100%|██████████| 1/1 [00:00<00:00, 83.70it/s]


RAG Answer: The documents related to the ITS policy 
22 Which forms are related to the ITS polic
The forms related to the ITS policy on surplus and disposal of equipment are the CS-201 Form, CS-201.1 Form, CS-201.2 Form, CS-201.3 Form, CS-202 Form, CS-202.1 Form, CS-202.2 Form, CS-202.3 Form, NYS-S13-003 Sanitization/Secure Disposal, ITS-201 Form, ITS-201.1 Form, ITS-201.2 Form, ITS-201.3 Form, ITS-202 Form, ITS-202.1 Form, ITS-202.2 Form, ITS-202.3 Form. These forms are used to facilitate the surplus and disposal process for ITS equipment in accordance with the established policy.
QnA Answer: The forms related to the ITS policy on s


Batches: 100%|██████████| 1/1 [00:00<00:00, 81.60it/s]


RAG Answer: The forms related to the ITS policy on s
23 Are there documents related to the ITS p
Yes, there are documents related to the ITS policy on surplus and disposal of equipment. These documents include the ITSM ServiceNow Surplus/LDA Service Request, the Report of Surplus Personal Property (CS-201) Form, the Report of Surplus Motor Vehicles and Motorized Equipment (CS-201.1) Form, NYS-S13-003 Sanitization/Secure Disposal, and ITS-S18-001 Fleet Management Usage. These forms and guidelines are used to properly surplus and dispose of ITS-owned or managed electronic equipment, furniture, miscellaneous equipment, and vehicles when they are no longer needed.
QnA Answer: Yes, there are documents related to the 


Batches: 100%|██████████| 1/1 [00:00<00:00, 75.07it/s]


RAG Answer: Yes, there are several related documents
24 Are some of the State's transformative i
Yes, some of the State's transformative infrastructure projects are financed outside of the State budget. This means that funds for these projects come from sources other than the State's regular financial resources. These external funding sources can include federal grants, public-private partnerships, and private investments. By leveraging external funding, the State can potentially accelerate project delivery, reduce debt issuances, and achieve other financial benefits. However, it's important to note that the allocation and management of external funding for infrastructure projects can be complex and require careful coordination between various stakeholders.
QnA Answer: Yes, some of the State's transformative 


Batches: 100%|██████████| 1/1 [00:00<00:00, 85.72it/s]


RAG Answer: Yes, some of the State's transformative 
25 What is the State Share for the Gateway 
The State Share for the Gateway Tunnel Project is 1.3 billion dollars. This information is based on internal knowledge, as it represents the financial commitment made by the State of New Jersey towards the project.
QnA Answer: The State Share for the Gateway Tunnel P


Batches: 100%|██████████| 1/1 [00:00<00:00, 80.98it/s]


RAG Answer: The State Share for the Gateway Tunnel P
26 Who are the funding partners for the Sta
The funding partners for the State's transformative infrastructure projects typically include the State itself, along with various funding partners such as local governments, the Federal government, public authorities, and private entities. This collaborative approach allows for the pooling of resources and expertise to finance and deliver large-scale, complex projects that benefit the entire state.
QnA Answer: The funding partners for the State's tra


Batches: 100%|██████████| 1/1 [00:00<00:00, 84.54it/s]


RAG Answer: The funding partners for the State's tra
27 What percent of State capital spending i
The answer is 20 percent. This means that 20 percent of the State's capital spending in FY 2025 is expected to be funded by Federal aid. This information is based on the State's internal knowledge and is likely derived from their budgeting and financial planning processes.
QnA Answer: The answer is 20 percent. This means tha


Batches: 100%|██████████| 1/1 [00:00<00:00, 86.19it/s]


RAG Answer: 19 percent of State capital spending in 
28 Is capital spending expected to increase
Yes, capital spending is indeed expected to rise in FY 2025. This projection is based on the organization's financial outlook and strategic planning for the upcoming fiscal year. The increase in capital spending is likely to be driven by various factors, such as investments in infrastructure, research and development, and digital transformation initiatives. However, it is essential to monitor the actual spending figures and any potential adjustments that may be made during the fiscal year.
QnA Answer: Yes, capital spending is indeed expected


Batches: 100%|██████████| 1/1 [00:00<00:00, 80.90it/s]


RAG Answer: Yes, capital spending is expected to inc
29 Why is State capital spending projected 
State capital spending is projected to rise in FY 2025 due to increased investments in economic development. This focus on economic development aims to create jobs, strengthen the economy, and promote growth. By allocating more resources towards capital spending, the state can directly contribute to these objectives and foster a more prosperous environment.
QnA Answer: State capital spending is projected to r


Batches: 100%|██████████| 1/1 [00:00<00:00, 82.07it/s]


RAG Answer: State capital spending is projected to i
30 What is capital spending for the State p
The State's capital spending is projected to be $17.7 billion in FY 2025. This figure represents the amount the State plans to invest in its infrastructure, including transportation, education, and environmental projects, among others. It's important to note that capital spending is a significant component of a country's economic growth, as it contributes to the development of physical infrastructure and the overall quality of life. In the case of the State of New York, a higher capital spending projection may indicate a stronger commitment to economic development and infrastructure improvements, which could have positive implications for the state's economy and residents.
QnA Answer: The State's capital spending is projecte


Batches: 100%|██████████| 1/1 [00:00<00:00, 89.84it/s]


RAG Answer: Capital spending for the State is projec
31 What are new large scale projects in the
The Five-Year DOT Capital Plan includes several new large-scale projects. One of them is the Interborough Express, which is a proposed 10-mile light rail system connecting Brooklyn, Queens, and the Bronx. This project aims to provide a more efficient and eco-friendly transportation option for the residents and businesses in these boroughs.

Another significant project is the expansion of the Second Avenue Subway, which will increase capacity and improve accessibility in Manhattan. The project includes the construction of a new fully ADA compliant station at the West End Concourse and the expansion of the existing subway to the west.

Additionally, the plan includes the modernization of the John F. Kennedy Airport, which will involve the transformation of the airport's eight disparate terminal sites into a unified JFK Airport. The project includes demolishing old terminals, utilizing vacant 

Batches: 100%|██████████| 1/1 [00:00<00:00, 80.21it/s]


RAG Answer: The new large scale projects in the Five
32 How much does the DOT Capital plan inclu
The DOT Capital Plan includes $1 billion for the "Bridge NY" program. This program focuses on the reconstruction, replacement, or repair of bridges across the State of New York, aiming to improve the overall condition and safety of the state's bridge infrastructure.
QnA Answer: The DOT Capital Plan includes $1 billion


Batches: 100%|██████████| 1/1 [00:00<00:00, 81.43it/s]


RAG Answer: The DOT Capital Plan includes $1 billion
33 What percentage of existing State-releat
The answer is 15%. This means that, based on current projections, one-fourth of the existing State-related debt is expected to be retired within 15 years. This information highlights the State's debt management and repayment strategy, which is an essential aspect of fiscal responsibility and financial planning.
QnA Answer: The answer is 15%. This means that, base


Batches: 100%|██████████| 1/1 [00:00<00:00, 83.58it/s]


RAG Answer: The percentage of existing State-related
34 What does 100% State-releated debt retir
100% State-related debt retirement refers to the complete repayment of all debt obligations that are directly linked to the State. This includes both principal and interest payments, and it signifies that the State has fulfilled its financial commitment to the debt. It is important to note that this does not include debt retirements funded by local governments or other entities, only debt issued by the State itself.
QnA Answer: 100% State-related debt retirement refer


Batches: 100%|██████████| 1/1 [00:00<00:00, 81.25it/s]


RAG Answer: 100% State-related debt retirement repre
35 What impact does the rate of debt retire
The rate of debt retirement has a significant impact on the State's financial health and flexibility. When debt is retired more quickly, the State saves money on interest payments over the life of the debt. This frees up resources that can be used for other priorities, such as education, infrastructure, or social services. Additionally, retiring debt more quickly can improve the State's credit rating, making it easier and cheaper to borrow money in the future. However, retiring debt more quickly may also limit the State's ability to use its debt as a financial tool to manage cash flow or invest in long-term projects. Therefore, the rate of debt retirement is an important consideration for the State's financial management.
QnA Answer: The rate of debt retirement has a signif


Batches: 100%|██████████| 1/1 [00:00<00:00, 84.46it/s]


RAG Answer: The rate at which State-related debt is 
36 How much will be appropriated on Communi
The amount appropriated for Community Services Programs for people with development disabilities in FY 2026 is 132,113 thousand dollars.
QnA Answer: The amount appropriated for Community Se


Batches: 100%|██████████| 1/1 [00:00<00:00, 85.89it/s]


RAG Answer: The answer cannot be determined from the
37 What are the projected appropriations fr
The projected appropriations for Non-Bondable Projects for people with developmental disabilities from FY 2025 to FY 2027 are as follows:

* FY 2025: 1,000 thousand dollars
* FY 2026: 1,000 thousand dollars
* FY 2027: 1,000 thousand dollars

These amounts remain constant for each fiscal year, totaling 3,000 thousand dollars over the three-year period.
QnA Answer: The projected appropriations for Non-Bon


Batches: 100%|██████████| 1/1 [00:00<00:00, 82.44it/s]


RAG Answer: The projected appropriations for Non-Bon
38 What is the total projected appropriatio
The total projected appropriation on programs for people with development disabilities in FY 2026 is 564,692 thousand dollars. This information is based on the internal knowledge and resources available to answer the question.
QnA Answer: The total projected appropriation on pro


Batches: 100%|██████████| 1/1 [00:00<00:00, 86.50it/s]


RAG Answer: The total projected appropriation on pro
Creating VLLM model
0 What does an employee need to provide in


Batches: 100%|██████████| 1/1 [00:00<00:00, 83.10it/s]


RAG Answer: An employee needs to provide a written r
1 Who is on the Telecommuting Appeal Board


Batches: 100%|██████████| 1/1 [00:00<00:00, 84.97it/s]


RAG Answer: The Telecommuting Appeal Board consists 
2 How long do employees have to wait until


Batches: 100%|██████████| 1/1 [00:00<00:00, 85.25it/s]


RAG Answer: Employees who have been removed from the
3 Should SE websites have sub-domain addre


Batches: 100%|██████████| 1/1 [00:00<00:00, 84.17it/s]


RAG Answer: Based on the provided context, SE websit
4 When should SE domain names use acronyms


Batches: 100%|██████████| 1/1 [00:00<00:00, 83.40it/s]


RAG Answer: SE domain names should use acronyms when
5 Can SE domain names contail special char


Batches: 100%|██████████| 1/1 [00:00<00:00, 75.90it/s]


RAG Answer: No, SE domain names cannot contain speci
6 Should OSS options be considered first f


Batches: 100%|██████████| 1/1 [00:00<00:00, 79.51it/s]


RAG Answer: Yes, OSS options should be considered fi
7 What conditions should OSS meet to be us


Batches: 100%|██████████| 1/1 [00:00<00:00, 68.50it/s]


RAG Answer: To be used as an ITS software solution, 
8 Who approves software licenses for ITS?


Batches: 100%|██████████| 1/1 [00:00<00:00, 69.77it/s]


RAG Answer: The ITS Division of Legal Affairs (DLA) 
9 What are examples of ITS E-equipment?


Batches: 100%|██████████| 1/1 [00:00<00:00, 75.81it/s]


RAG Answer: Based on the provided context, examples 
10 What ITS guidelines must be followed whe


Batches: 100%|██████████| 1/1 [00:00<00:00, 74.60it/s]


RAG Answer: The ITS guidelines that must be followed
11 What must be submitted once ITS E-equipm


Batches: 100%|██████████| 1/1 [00:00<00:00, 73.85it/s]


RAG Answer: Once ITS E-equipment is prepared for sur
12 What is the purpose of the Use of Artifi


Batches: 100%|██████████| 1/1 [00:00<00:00, 71.46it/s]


RAG Answer: The purpose of the Use of Artificial Int
13 What is the benefit of the Use of Artifi


Batches: 100%|██████████| 1/1 [00:00<00:00, 81.98it/s]


RAG Answer: The benefit of the Use of Artificial Int
14 Are the guidelines on the use of Artific


Batches: 100%|██████████| 1/1 [00:00<00:00, 79.58it/s]


RAG Answer: The guidelines on the use of Artificial 
15 Who can I email if I have a question abo


Batches: 100%|██████████| 1/1 [00:00<00:00, 81.56it/s]


RAG Answer: You can email the Chief Data Office at c
16 Where can I find policies, standards, an


Batches: 100%|██████████| 1/1 [00:00<00:00, 83.88it/s]


RAG Answer: You can find policies, standards, and gu
17 What is the telephone number to submit i


Batches: 100%|██████████| 1/1 [00:00<00:00, 75.08it/s]


RAG Answer: The telephone number to submit inquiries
18 What was the change made to the ITS poli


Batches: 100%|██████████| 1/1 [00:00<00:00, 81.39it/s]


RAG Answer: The context does not provide information
19 Who have been the reviewers on changes t


Batches: 100%|██████████| 1/1 [00:00<00:00, 84.06it/s]


RAG Answer: The reviewers for changes to the ITS pol
20 When was the policy on the surplus and d


Batches: 100%|██████████| 1/1 [00:00<00:00, 83.48it/s]


RAG Answer: The policy on the surplus and disposal o
21 What are documents related to the ITS po


Batches: 100%|██████████| 1/1 [00:00<00:00, 83.11it/s]


RAG Answer: The related documents to the ITS policy 
22 Which forms are related to the ITS polic


Batches: 100%|██████████| 1/1 [00:00<00:00, 84.86it/s]


RAG Answer: The forms related to the ITS policy on s
23 Are there documents related to the ITS p


Batches: 100%|██████████| 1/1 [00:00<00:00, 83.06it/s]


RAG Answer: Yes, there are related documents to the 
24 Are some of the State's transformative i


Batches: 100%|██████████| 1/1 [00:00<00:00, 80.43it/s]


RAG Answer: Yes, some of the State's transformative 
25 What is the State Share for the Gateway 


Batches: 100%|██████████| 1/1 [00:00<00:00, 84.36it/s]


RAG Answer: The State Share for the Gateway Tunnel P
26 Who are the funding partners for the Sta


Batches: 100%|██████████| 1/1 [00:00<00:00, 81.30it/s]


RAG Answer: The funding partners for the State's tra
27 What percent of State capital spending i


Batches: 100%|██████████| 1/1 [00:00<00:00, 83.25it/s]


RAG Answer: 19 percent of State capital spending in 
28 Is capital spending expected to increase


Batches: 100%|██████████| 1/1 [00:00<00:00, 84.33it/s]


RAG Answer: Yes, capital spending is expected to inc
29 Why is State capital spending projected 


Batches: 100%|██████████| 1/1 [00:00<00:00, 82.57it/s]


RAG Answer: State capital spending is projected to i
30 What is capital spending for the State p


Batches: 100%|██████████| 1/1 [00:00<00:00, 84.31it/s]


RAG Answer: Capital spending for the State is projec
31 What are new large scale projects in the


Batches: 100%|██████████| 1/1 [00:00<00:00, 84.78it/s]


RAG Answer: The new large-scale projects in the Five
32 How much does the DOT Capital plan inclu


Batches: 100%|██████████| 1/1 [00:00<00:00, 84.13it/s]


RAG Answer: The DOT Capital Plan includes $1 billion
33 What percentage of existing State-releat


Batches: 100%|██████████| 1/1 [00:00<00:00, 82.82it/s]


RAG Answer: The percentage of existing State-related
34 What does 100% State-releated debt retir


Batches: 100%|██████████| 1/1 [00:00<00:00, 83.46it/s]


RAG Answer: 100% State-related debt retirement would
35 What impact does the rate of debt retire


Batches: 100%|██████████| 1/1 [00:00<00:00, 84.86it/s]


RAG Answer: The rate of debt retirement has a signif
36 How much will be appropriated on Communi


Batches: 100%|██████████| 1/1 [00:00<00:00, 81.40it/s]


RAG Answer: The context provided does not include sp
37 What are the projected appropriations fr


Batches: 100%|██████████| 1/1 [00:00<00:00, 85.47it/s]


RAG Answer: The context provided does not include sp
38 What is the total projected appropriatio


Batches: 100%|██████████| 1/1 [00:00<00:00, 72.00it/s]


RAG Answer: The total projected appropriation on pro


## Grade responses using Judge Model

In [33]:
import yaml

# Load the evaluation settings from llm_config.yaml in the current working directory.
with open("llm_config.yaml", "r") as f:
    llm_config = yaml.safe_load(f)

# Result files are written to a directory named by the config's top-level "name" key,
# falling back to "output" when that key is absent.
output_directory = llm_config.get("name", "output")
os.makedirs(output_directory, exist_ok=True)  # no error if the directory already exists

In [34]:
from langchain.prompts import PromptTemplate

# The judge prompt text is read from the "template" key of the config's "judge" section.
scoring_template_str = llm_config["judge"].get("template")
# Stop here when the template is missing or empty.
assert scoring_template_str
# score_request() fills this template with question, answer and reference_answer.
SCORING_PROMPT = PromptTemplate.from_template(scoring_template_str)

In [35]:
# Client and model name for the judge LLM, both taken from the "judge" section of llm_config.
# NOTE(review): the API key comes from the YAML file — confirm that file is kept out of version control.
judge_client = OpenAI(api_key=llm_config["judge"]["api_key"])
judge_model_name = llm_config["judge"]["model_name"]


def _parse_judge_response(response_content):
    """Turn the judge's raw reply into a ``(score, reasoning)`` pair.

    The reply is expected to be a JSON object with the keys ``answer_quality``
    and ``reasoning``, optionally wrapped in a Markdown code fence
    (```` ```json ... ``` ```` or a bare ```` ``` ... ``` ````), possibly with
    surrounding whitespace.

    Any reply that cannot be used (no text at all, invalid JSON, JSON that is
    not an object, or an object missing one of the two keys) is reported on
    stdout and mapped to ``(0, "Error")`` so that one bad reply does not abort
    a whole grading run.
    """
    try:
        if not isinstance(response_content, str):
            raise ValueError("judge returned no text content")
        # Drop an opening fence (with or without the "json" tag) and a closing
        # fence, tolerating whitespace/newlines around either of them.
        payload = re.sub(r'^\s*```(?:json)?\s*', '', response_content, flags=re.IGNORECASE)
        payload = re.sub(r'\s*```\s*$', '', payload)
        result = json.loads(payload)
        return result["answer_quality"], result["reasoning"]
    except (ValueError, KeyError, TypeError) as e:
        # ValueError covers json.JSONDecodeError; KeyError/TypeError cover JSON
        # that parsed but does not have the expected object shape.
        print("response_content:", response_content)
        print(f"An error occurred: {e}")
        return 0, "Error"


def score_request(question, answer, reference_answer):
    """Ask the judge model to grade one answer against its reference answer.

    The three arguments are substituted into SCORING_PROMPT and sent as a
    single user message to ``judge_model_name`` through ``judge_client``
    (temperature 0, one completion, at most 1024 tokens).

    Args:
        question: The question that was asked.
        answer: The answer to be graded.
        reference_answer: The ground-truth answer to compare against.

    Returns:
        A ``(score, reasoning)`` tuple taken from the ``answer_quality`` and
        ``reasoning`` fields of the judge's JSON reply, or ``(0, "Error")``
        when the reply cannot be parsed.
    """
    messages = [
        {
            "role": "user",
            "content": SCORING_PROMPT.format(
                question=question,
                answer=answer,
                reference_answer=reference_answer
            )
        }
    ]

    completion = judge_client.chat.completions.create(
        model=judge_model_name,
        messages=messages,
        n=1,
        temperature=0.0,
        max_tokens=1024,
    )
    return _parse_judge_response(completion.choices[0].message.content)


In [36]:
# Grade every saved answer with the judge model, one testing configuration at a time,
# and write the scored table next to the answers as JSONL and CSV.
for testing_config in llm_config["testing_config"]:
    # Use the configuration's "name", falling back to "model_name" when no name is set.
    # The earlier spelling testing_config["name" or "model_name"] never fell back,
    # because the expression `"name" or "model_name"` is simply "name".
    testing_config_name = replace_special_char(
        testing_config.get("name") or testing_config["model_name"]
    )
    print("-" * 80)
    print(testing_config_name)
    answers_filename = f"{output_directory}/{testing_config_name}_answers.jsonl"
    scores = pd.read_json(answers_filename, orient="records", lines=True)

    # Place each score/reasoning column pair directly after the answer column it grades.
    position = scores.columns.get_loc("answer")
    scores.insert(position + 1, "answer_score", "")
    scores.insert(position + 2, "answer_score_reasoning", "")
    position = scores.columns.get_loc("rag_answer")
    scores.insert(position + 1, "rag_answer_score", "")
    scores.insert(position + 2, "rag_answer_score_reasoning", "")

    for index, row in scores.iterrows():
        question = row["question"]
        answer = row["answer"]
        reference_answer = row["ground_truth"]
        print(index, question)
        # Null answers come back from read_json as None/NaN; NaN is truthy, so a plain
        # `if answer:` would send it to the judge and then fail on the slice below.
        if pd.notna(answer) and answer:
            score, reasoning = score_request(question, answer, reference_answer)
            scores.at[index, "answer_score"] = score
            scores.at[index, "answer_score_reasoning"] = reasoning
            print(answer[:40], score, str(reasoning)[:40])
        rag_answer = row["rag_answer"]
        if pd.notna(rag_answer) and rag_answer:
            score, reasoning = score_request(question, rag_answer, reference_answer)
            scores.at[index, "rag_answer_score"] = score
            scores.at[index, "rag_answer_score_reasoning"] = reasoning
            print(rag_answer[:40], score, str(reasoning)[:40])
    # Not used in this cell; kept because it is a notebook-level name.
    judge_name = replace_special_char(judge_model_name)
    scores_filename = f"{output_directory}/{testing_config_name}_scores"
    scores.to_json(f"{scores_filename}.jsonl", orient="records", lines=True)
    scores.to_csv(f"{scores_filename}.csv", index=False)


--------------------------------------------------------------------------------
finetuned
0 What does an employee need to provide in an appeal to the Telecommuting Appeal board?
In an appeal to the Telecommuting Appeal 4 The reference answer specifies that an e
In an appeal to the Telecommuting Appeal 4 The reference answer specifies that an e
1 Who is on the Telecommuting Appeal Board?
The Telecommuting Appeal Board typically 3 The reference answer specifies that the 
The Telecommuting Appeal Board consists  5 The reference answer states that the Tel
2 How long do employees have to wait until they may submit a new application for the telecommuting program?
Employees must wait six months before su 4 The reference answer states that employe
Employees who have had their application 3 The reference answer states that employe
3 Should SE websites have sub-domain addresses?
Yes, SE (State Entity) websites should h 2 The reference answer suggests that SE we
Yes, SE websites should use short

## Create the resulting score report (CSV and Excel)

In [37]:
# Reload the configuration so this cell can be run on its own after grading has finished.
with open("llm_config.yaml", "r") as f:
    llm_config = yaml.safe_load(f)

output_directory = llm_config.get("name", "output")
os.makedirs(output_directory, exist_ok=True)

judge_client = OpenAI(api_key=llm_config["judge"]["api_key"])
judge_model_name = llm_config["judge"]["model_name"]
judge_name = replace_special_char(judge_model_name)

# One column per (testing configuration, answer kind), one row per question.
summary_output_df = pd.DataFrame()

for testing_config in llm_config["testing_config"]:
    # Use the configuration's "name", falling back to "model_name" when no name is set.
    # The earlier spelling testing_config["name" or "model_name"] never fell back,
    # because the expression `"name" or "model_name"` is simply "name".
    testing_config_name = replace_special_char(
        testing_config.get("name") or testing_config["model_name"]
    )
    scores_filename = f"{output_directory}/{testing_config_name}_scores.jsonl"
    scores = pd.read_json(scores_filename, orient="records", lines=True)
    # Unscored rows hold "" in the score columns, which makes the column non-numeric and
    # would drop it from the average below; coerce so blanks become NaN and are skipped.
    if testing_config.get("qna_template"):
        summary_output_df[f"{testing_config_name}_answer_score"] = pd.to_numeric(
            scores["answer_score"], errors="coerce"
        )
    if testing_config.get("rag_template"):
        summary_output_df[f"{testing_config_name}_rag_answer_score"] = pd.to_numeric(
            scores["rag_answer_score"], errors="coerce"
        )

# Append the per-column mean as a final "Average" row.
average_row = summary_output_df.mean(axis=0, numeric_only=True)
print(average_row)
summary_output_df.loc[len(summary_output_df)] = average_row
# Label the question rows Q1..Qn; the row just appended is the average.
question_indices = [f"Q{i+1}" for i in range(len(summary_output_df)-1)]
question_indices.append("Average")
summary_output_df.insert(0, 'question index', question_indices)

summary_filepath = f"{output_directory}/summary_scores"
summary_output_df.to_csv(f"{summary_filepath}.csv", index=False)

finetuned_answer_score                      3.128205
finetuned_rag_answer_score                  3.974359
granite_3_0_8b_instruct_rag_answer_score    3.897436
dtype: float64


In [38]:
# Bundle the summary table and every per-configuration score table into one Excel
# workbook named after the judge model.
with pd.ExcelWriter(f"{output_directory}/{judge_name}_scores.xlsx") as writer:
    # The summary is re-read from the CSV written by the previous cell.
    summary_output_df = pd.read_csv(f"{summary_filepath}.csv")
    summary_output_df.to_excel(writer, sheet_name="Summary", index=False)

    for testing_config in llm_config["testing_config"]:
        # Use the configuration's "name", falling back to "model_name" when no name is set.
        # The earlier spelling testing_config["name" or "model_name"] never fell back,
        # because the expression `"name" or "model_name"` is simply "name".
        testing_config_name = replace_special_char(
            testing_config.get("name") or testing_config["model_name"]
        )
        scores_filename = f"{output_directory}/{testing_config_name}_scores.jsonl"
        scores = pd.read_json(scores_filename, orient="records", lines=True)
        # Sheet names are cut to 30 characters to stay inside Excel's sheet-name length limit.
        scores.to_excel(writer, sheet_name=f"{testing_config_name}_scores"[:30])