# Evaluating the fine-tuned model

### Needed packages and imports

In [1]:
!pip install -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


### Model inference parameters

Inference parameters used when querying the fine-tuned model.

In [2]:
import requests
import os
import yaml
import json
import re
import time
import pandas as pd
import torch

from typing import Iterator
from pathlib import Path
from openai import OpenAI

from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate

from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document as LCDocument

from langchain_openai import ChatOpenAI
from langchain_community.llms import VLLMOpenAI
from langchain_milvus import Milvus
from langchain_text_splitters import RecursiveCharacterTextSplitter

from docling.document_converter import DocumentConverter


def replace_special_char(original_str):
    """Return ``original_str`` with every non-word character replaced by ``_``.

    "Word" characters are whatever the regex class ``\\w`` matches, so letters,
    digits and underscores are kept unchanged.
    """
    non_word_char = re.compile(r"[^\w]")
    return non_word_char.sub("_", original_str)


def get_config():
    """Parse ``llm_config.yaml`` from the working directory and return the result."""
    with open("llm_config.yaml", "r") as config_file:
        return yaml.safe_load(config_file)


def get_output_dir():
    """Create (if needed) and return the output directory for this run.

    The directory name is the config's ``name`` value (``"output"`` when the
    key is absent) with non-word characters replaced by underscores.
    """
    run_name = get_config().get("name", "output")
    safe_dir_name = replace_special_char(run_name)
    os.makedirs(safe_dir_name, exist_ok=True)
    return safe_dir_name


def get_testing_config_name(testing_config):
    """Return a filesystem-safe identifier for one ``testing_config`` entry.

    The explicit ``name`` is preferred. Otherwise ``model_name`` is used, with
    ``_rag`` appended when the entry has a truthy ``rag`` flag.

    The result is always passed through ``replace_special_char`` because it is
    embedded in file names (and Excel sheet names) by the cells below.
    Previously the non-RAG ``model_name`` fallback was returned unsanitized,
    so a model name containing ``/`` produced an invalid path.
    NOTE: result files written by the old code for such entries will have the
    unsanitized name and need to be regenerated or renamed.

    Args:
        testing_config: Mapping with optional ``name``, ``model_name`` and
            ``rag`` keys.

    Returns:
        The sanitized name, or the falsy lookup result unchanged (e.g.
        ``None``) when neither ``name`` nor ``model_name`` is set.
    """
    name = testing_config.get("name")
    if not name:
        name = testing_config.get("model_name")
        if name and testing_config.get("rag"):
            name = name + "_rag"
    # Falsy values pass through untouched so the "no name available" result
    # stays exactly what it was before.
    return replace_special_char(name) if name else name


In [3]:
import os

from dotenv import load_dotenv

load_dotenv()

False

In [4]:
# Generation settings; create_llm() forwards both to the VLLMOpenAI client.
MAX_TOKENS = 2048  # forwarded as max_tokens
TEMPERATURE = 0.00  # forwarded as temperature

### Milvus connection info

Defaults to local db

In [5]:
# Milvus connection settings. Each value is read from the environment variable
# of the same name and falls back to the literal shown here when it is unset.
MILVUS_URI = os.getenv("MILVUS_URI", "./milvus_llm_judge_eval.db")  # fallback is a local path
MILVUS_USERNAME = os.getenv("MILVUS_USERNAME", "")
MILVUS_PASSWORD = os.getenv("MILVUS_PASSWORD", "")
MILVUS_COLLECTION = os.getenv("MILVUS_COLLECTION", "my_org_documents")

## Sanity check model

In [6]:
def create_llm(testing_config):
    """Build the LangChain LLM client described by one ``testing_config`` entry.

    Args:
        testing_config: Mapping read from ``llm_config.yaml``. The keys used
            here are ``api_key``, ``model_name``, ``model_type`` and, for
            non-OpenAI entries, ``endpoint_url``.

    Returns:
        A ``ChatOpenAI`` instance when ``model_type`` is ``"openai"``;
        otherwise a ``VLLMOpenAI`` instance pointed at ``endpoint_url`` and
        configured with the notebook-level ``TEMPERATURE`` and ``MAX_TOKENS``.

    Raises:
        ValueError: If ``api_key`` is missing or is not a string. (This used
            to surface as an opaque ``TypeError`` from ``re.sub``.)
    """
    raw_api_key = testing_config.get("api_key")
    if not isinstance(raw_api_key, str):
        config_label = testing_config.get("name") or testing_config.get("model_name")
        raise ValueError(
            f"testing_config entry {config_label!r} needs a string 'api_key'"
        )
    # Drop every whitespace character, including newlines embedded by
    # multi-line YAML values.
    openai_api_key = re.sub(r"\s+", "", raw_api_key)
    model_name = testing_config.get("model_name")

    if testing_config.get("model_type") == "openai":
        return ChatOpenAI(
            openai_api_key=openai_api_key,
            model_name=model_name,
            streaming=False)

    openai_api_base = testing_config.get("endpoint_url")
    return VLLMOpenAI(
        openai_api_key=openai_api_key,
        openai_api_base=openai_api_base,  # e.g. https://model...com/v1
        model_name=model_name,
        temperature=TEMPERATURE,
        max_tokens=MAX_TOKENS,
        streaming=False
    )


def qna_request(llm, template_str, question):
    """Ask ``llm`` a single question through a prompt template.

    Args:
        llm: LangChain runnable to query.
        template_str: Prompt template text; it is invoked with a ``question``
            variable.
        question: The question to substitute into the template.

    Returns:
        The stripped model answer, or ``""`` once the last attempt has failed.
    """
    max_attempts = 1
    attempts_made = 0
    while attempts_made < max_attempts:
        attempts_made += 1
        try:
            chain = PromptTemplate.from_template(template_str) | llm | StrOutputParser()
            return chain.invoke({"question": question}).strip()
        except Exception as err:
            print(f"Request failed: {err}")
            if attempts_made >= max_attempts:
                # Out of attempts: signal failure with an empty answer.
                return ""
            print(f"Retrying in 5 seconds...")
            time.sleep(5)


In [7]:
# Smoke test: send one question to the first non-RAG entry in the config.
llm_config = get_config()
for testing_config in llm_config["testing_config"]:
    if not testing_config.get("rag"):
        print(testing_config.get("name"))
        llm = create_llm(testing_config)
        template_str = testing_config.get("template")
        question = "Who are you?"
        answer = qna_request(llm, template_str, question)
        print(answer)
        # One entry is enough for a sanity check.
        break

finetuned
As a helpful and knowledgeable AI assistant, I am designed to provide accurate and relevant information to users. I am not a human, but rather an advanced algorithm that has been trained on a vast amount of data to understand and respond to a wide range of inquiries. My purpose is to assist and make your life easier by offering valuable insights and answering your questions to the best of my ability.


## Creating a Milvus DB with documents

## Initial index creation and document ingestion

#### Load pdfs

In [None]:
class DoclingPDFLoader(BaseLoader):
    """LangChain loader that converts sources with docling into Markdown documents."""

    def __init__(self, file_path: str | list[str]) -> None:
        """Store one path or a list of paths and create the docling converter."""
        if isinstance(file_path, list):
            self._file_paths = file_path
        else:
            self._file_paths = [file_path]
        self._converter = DocumentConverter()

    def lazy_load(self) -> Iterator[LCDocument]:
        """Yield one document per stored path, holding its Markdown export."""
        for pdf_source in self._file_paths:
            conversion = self._converter.convert(pdf_source)
            markdown_text = conversion.document.export_to_markdown()
            yield LCDocument(page_content=markdown_text)

In [None]:
# Recursively collect every PDF below the document collection folder.
pdf_folder_path = "../data_preparation/document_collection"
file_paths = list(map(str, Path(pdf_folder_path).rglob('*.pdf')))
file_paths

In [None]:
# One loader instance handles the whole list of discovered PDF paths.
loader = DoclingPDFLoader(file_path=file_paths)

#### Split documents into chunks with some overlap

In [None]:
# Convert the PDFs first, then cut the text into overlapping chunks.
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=100)
splits = text_splitter.split_documents(docs)

# Show the first chunk as a quick visual check.
splits[0]

#### Create the index and ingest the documents

In [None]:
# Prefer CUDA, then Apple MPS, and fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

model_kwargs = dict(trust_remote_code=True, device=device)
embeddings = HuggingFaceEmbeddings(
    model_name="nomic-ai/nomic-embed-text-v1.5",
    model_kwargs=model_kwargs,
    show_progress=True,
)

milvus_connection = {
    "uri": MILVUS_URI,
    "user": MILVUS_USERNAME,
    "password": MILVUS_PASSWORD,
}

# drop_old=True is passed, so re-running this cell starts from a fresh collection.
db = Milvus(
    embedding_function=embeddings,
    connection_args=milvus_connection,
    collection_name=MILVUS_COLLECTION,
    auto_id=True,
    drop_old=True,
)


In [None]:
# Ingest the chunks; the return value is kept only so its length can be reported.
loaded = db.add_documents(splits)
print(f"{len(loaded)} documents loaded.")

#### Test vector DB search

In [None]:
# Probe the vector store with a sample query; results are printed in the next cell.
query = "Who are you?"
# Alternative, domain-specific probe:
# query = "Who are the funding partners for the State's transformative infrastructure projects?"
docs_with_score = db.similarity_search_with_score(query)

In [None]:
# Print each hit framed by separator lines, score first.
separator = "-" * 80
for doc, score in docs_with_score:
    print(separator)
    print("Score: ", score)
    print(doc.page_content)
    print(separator)

#### Test out RAG request

In [None]:
# Similarity retriever over the Milvus store, configured with k=4.
# rag_request() reads this module-level name.
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})


def format_docs(docs):
    """Join the ``page_content`` of each document, separated by a blank line."""
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)


def rag_request(llm, template_str, question):
    """Answer ``question`` with retrieval-augmented generation.

    The module-level ``retriever`` supplies documents, ``format_docs`` joins
    them into the template's ``context`` variable, and the question itself is
    passed through as ``question``.

    Args:
        llm: LangChain runnable to query.
        template_str: Prompt template text using ``context`` and ``question``.
        question: The question to retrieve for and to ask.

    Returns:
        The stripped model answer, or ``""`` once the last attempt has failed.
    """
    num_retries = 1
    for attempt in range(num_retries):
        try:
            prompt = PromptTemplate.from_template(template_str)
            rag_chain = (
                    {"context": retriever | format_docs, "question": RunnablePassthrough()}
                    | prompt
                    | llm
                    | StrOutputParser()
            )
            response = rag_chain.invoke(question)
            return response.strip()
        except Exception as e:
            print(f"Request failed: {e}")
            # `attempt` is zero-based, so another try is only available while
            # attempt + 1 < num_retries. The previous `attempt < num_retries`
            # was always true: the last failure slept for 5 seconds and then
            # fell out of the loop, returning None instead of "".
            if attempt + 1 < num_retries:
                print(f"Retrying in 5 seconds...")
                time.sleep(5)
            else:
                return ""
    # Only reachable if num_retries is set to 0; keep the return type a string.
    return ""

In [None]:
# Smoke test: send one question through the RAG chain for the first RAG entry.
llm_config = get_config()
for testing_config in llm_config["testing_config"]:
    if testing_config.get("rag"):
        llm = create_llm(testing_config)
        template_str = testing_config.get("template")
        question = "Who are you?"
        answer = rag_request(llm, template_str, question)
        print(answer)
        # One entry is enough for a sanity check.
        break

## Generate Answers

### Use qna.yaml, csv, jsonl to create some questions and ground truth answers

We create a pandas dataframe with the columns `user_input` and `reference`
- create a csv file in the correct (default is "reference_answers") directory with the columns `user_input` and `reference`

- qna.yaml files can be taken as written from data_preparation and converted to the appropriate format.  

In [None]:
# Collect question / reference-answer pairs from every CSV, JSONL and qna.yaml
# file below `directory`, de-duplicate them by question, and save the result
# as <output_directory>/reference_answers.jsonl.
llm_config = get_config()
output_directory = get_output_dir()

directory = "reference_answers"
required_columns = ["user_input", "reference"]

# Frames are gathered in a list and concatenated once, instead of growing a
# DataFrame with pd.concat inside each loop.
reference_frames = []

for file_path in Path(directory).rglob('*.csv'):
    csv_df = pd.read_csv(file_path)
    print(f"{file_path}: {csv_df.shape[0]} questions")
    reference_frames.append(csv_df)

for file_path in Path(directory).rglob('*.jsonl'):
    jsonl_df = pd.read_json(file_path, orient="records", lines=True)
    print(f"{file_path}: {jsonl_df.shape[0]} questions")
    reference_frames.append(jsonl_df)

qna_list = []

for file_path in Path(directory).rglob('*.yaml'):
    with open(file_path) as file:
        # safe_load matches get_config(); qna files hold plain data only.
        qna = yaml.safe_load(file)
    file_qna = [
        {
            "user_input": questions_and_answers["question"].strip(),
            "reference": questions_and_answers["answer"].strip(),
        }
        for seed_example in qna["seed_examples"]
        for questions_and_answers in seed_example["questions_and_answers"]
    ]
    qna_list.extend(file_qna)
    # Report this file's own count (it used to print the running total,
    # unlike the CSV and JSONL loops).
    print(f"{file_path}: {len(file_qna)} questions")

if qna_list:
    reference_frames.append(pd.DataFrame(qna_list))

if reference_frames:
    ground_truth_df = pd.concat(reference_frames, ignore_index=True)
else:
    # No input files at all: keep the expected schema so later cells still work.
    ground_truth_df = pd.DataFrame(columns=required_columns)

missing_columns = [column for column in required_columns if column not in ground_truth_df.columns]
if missing_columns:
    raise ValueError(f"Reference answer files are missing required columns: {missing_columns}")

ground_truth_df = ground_truth_df.drop_duplicates(subset=["user_input"])
print(f"{ground_truth_df.shape[0]} total unique questions")

ground_truth_df.to_json(f"{output_directory}/reference_answers.jsonl", orient="records", lines=True)
ground_truth_df.head()

## Get responses from each of the available models

In [8]:
from instructlab_ragas import ModelConfig, RagasEvaluator, RunConfig, Sample
import os

# Run the evaluator for every configured model, letting it query each
# "student" model for its answers.
llm_config = get_config()
output_directory = get_output_dir()
# The reference answers are the same for every model, so read them once.
reference_answers_df = pd.read_json(f"{output_directory}/reference_answers.jsonl", orient="records", lines=True)

# The judge key is the same for every run, so it is exported once.
os.environ["OPENAI_API_KEY"] = llm_config["judge"]["api_key"]

# Results keyed by config name. Previously each iteration overwrote
# `evaluation_result`, so only the last model's result survived the loop.
evaluation_results = {}

# NOTE(review): the `rag` flag is not consulted here — confirm whether RAG
# entries are meant to be evaluated through this path.
for testing_config in llm_config["testing_config"]:
    testing_config_name = get_testing_config_name(testing_config)
    print("-" * 80)
    print(testing_config_name)

    # A fresh list of records per run, in case the evaluator modifies its input.
    dataset = reference_answers_df.to_dict(orient="records")

    student_model = ModelConfig(
        base_url=testing_config.get("endpoint_url"),
        model_name=testing_config.get("model_name"),
        api_key=testing_config.get("api_key"),
    )

    evaluator = RagasEvaluator()
    # `evaluation_result` still names the most recent result, as before.
    evaluation_result = evaluator.run(dataset=dataset, student_model=student_model)
    evaluation_results[testing_config_name] = evaluation_result



  from .autonotebook import tqdm as notebook_tqdm


--------------------------------------------------------------------------------
finetuned


BadRequestError: Error code: 400 - {'object': 'error', 'message': '[{\'type\': \'dict_type\', \'loc\': (\'body\', \'messages\', 0, \'typed-dict\'), \'msg\': \'Input should be a valid dictionary\', \'input\': "You are an advanced AI assistant designed to provide precise and accurate information.\\nYour primary goal is to answer queries with the most up-to-date and factual information available.\\nFocus on delivering clear, concise, and correct responses.\\nIf you\'re uncertain about any aspect of the query, state your level of confidence and provide the most accurate information you can.\\nYour responses should prioritize accuracy over all other considerations."}, {\'type\': \'dict_type\', \'loc\': (\'body\', \'messages\', 0, \'typed-dict\'), \'msg\': \'Input should be a valid dictionary\', \'input\': "You are an advanced AI assistant designed to provide precise and accurate information.\\nYour primary goal is to answer queries with the most up-to-date and factual information available.\\nFocus on delivering clear, concise, and correct responses.\\nIf you\'re uncertain about any aspect of the query, state your level of confidence and provide the most accurate information you can.\\nYour responses should prioritize accuracy over all other considerations."}, {\'type\': \'dict_type\', \'loc\': (\'body\', \'messages\', 0, \'typed-dict\'), \'msg\': \'Input should be a valid dictionary\', \'input\': "You are an advanced AI assistant designed to provide precise and accurate information.\\nYour primary goal is to answer queries with the most up-to-date and factual information available.\\nFocus on delivering clear, concise, and correct responses.\\nIf you\'re uncertain about any aspect of the query, state your level of confidence and provide the most accurate information you can.\\nYour responses should prioritize accuracy over all other considerations."}, {\'type\': \'dict_type\', \'loc\': (\'body\', \'messages\', 0, \'typed-dict\'), \'msg\': \'Input should be a valid 
dictionary\', \'input\': "You are an advanced AI assistant designed to provide precise and accurate information.\\nYour primary goal is to answer queries with the most up-to-date and factual information available.\\nFocus on delivering clear, concise, and correct responses.\\nIf you\'re uncertain about any aspect of the query, state your level of confidence and provide the most accurate information you can.\\nYour responses should prioritize accuracy over all other considerations."}, {\'type\': \'dict_type\', \'loc\': (\'body\', \'messages\', 0, \'typed-dict\'), \'msg\': \'Input should be a valid dictionary\', \'input\': "You are an advanced AI assistant designed to provide precise and accurate information.\\nYour primary goal is to answer queries with the most up-to-date and factual information available.\\nFocus on delivering clear, concise, and correct responses.\\nIf you\'re uncertain about any aspect of the query, state your level of confidence and provide the most accurate information you can.\\nYour responses should prioritize accuracy over all other considerations."}, {\'type\': \'dict_type\', \'loc\': (\'body\', \'messages\', 0, \'typed-dict\'), \'msg\': \'Input should be a valid dictionary\', \'input\': "You are an advanced AI assistant designed to provide precise and accurate information.\\nYour primary goal is to answer queries with the most up-to-date and factual information available.\\nFocus on delivering clear, concise, and correct responses.\\nIf you\'re uncertain about any aspect of the query, state your level of confidence and provide the most accurate information you can.\\nYour responses should prioritize accuracy over all other considerations."}, {\'type\': \'dict_type\', \'loc\': (\'body\', \'messages\', 1, \'typed-dict\'), \'msg\': \'Input should be a valid dictionary\', \'input\': \'What does an employee need to provide in an appeal to the Telecommuting Appeal board?\'}, {\'type\': \'dict_type\', \'loc\': (\'body\', \'messages\', 1, 
\'typed-dict\'), \'msg\': \'Input should be a valid dictionary\', \'input\': \'What does an employee need to provide in an appeal to the Telecommuting Appeal board?\'}, {\'type\': \'dict_type\', \'loc\': (\'body\', \'messages\', 1, \'typed-dict\'), \'msg\': \'Input should be a valid dictionary\', \'input\': \'What does an employee need to provide in an appeal to the Telecommuting Appeal board?\'}, {\'type\': \'dict_type\', \'loc\': (\'body\', \'messages\', 1, \'typed-dict\'), \'msg\': \'Input should be a valid dictionary\', \'input\': \'What does an employee need to provide in an appeal to the Telecommuting Appeal board?\'}, {\'type\': \'dict_type\', \'loc\': (\'body\', \'messages\', 1, \'typed-dict\'), \'msg\': \'Input should be a valid dictionary\', \'input\': \'What does an employee need to provide in an appeal to the Telecommuting Appeal board?\'}, {\'type\': \'dict_type\', \'loc\': (\'body\', \'messages\', 1, \'typed-dict\'), \'msg\': \'Input should be a valid dictionary\', \'input\': \'What does an employee need to provide in an appeal to the Telecommuting Appeal board?\'}]', 'type': 'BadRequestError', 'param': None, 'code': 400}

In [11]:
# Show the scores held by `evaluation_result`, i.e. whichever evaluator.run()
# call assigned that name most recently in this kernel.
evaluation_result.scores

[{'domain_specific_rubrics': 9},
 {'domain_specific_rubrics': 3},
 {'domain_specific_rubrics': 8},
 {'domain_specific_rubrics': 3},
 {'domain_specific_rubrics': 9},
 {'domain_specific_rubrics': 2},
 {'domain_specific_rubrics': 3},
 {'domain_specific_rubrics': 6},
 {'domain_specific_rubrics': 4},
 {'domain_specific_rubrics': 8},
 {'domain_specific_rubrics': 6},
 {'domain_specific_rubrics': 5},
 {'domain_specific_rubrics': 8},
 {'domain_specific_rubrics': 8},
 {'domain_specific_rubrics': 7},
 {'domain_specific_rubrics': 3},
 {'domain_specific_rubrics': 8},
 {'domain_specific_rubrics': 2},
 {'domain_specific_rubrics': 3},
 {'domain_specific_rubrics': 1},
 {'domain_specific_rubrics': 5},
 {'domain_specific_rubrics': 5},
 {'domain_specific_rubrics': 5},
 {'domain_specific_rubrics': 4},
 {'domain_specific_rubrics': 4},
 {'domain_specific_rubrics': 5},
 {'domain_specific_rubrics': 6},
 {'domain_specific_rubrics': 10},
 {'domain_specific_rubrics': 4},
 {'domain_specific_rubrics': 5},
 {'domain

In [None]:
# Ask every configured model every reference question and store the answers
# in <output_directory>/<config name>_responses.jsonl.
llm_config = get_config()
output_directory = get_output_dir()
reference_answers_df = pd.read_json(f"{output_directory}/reference_answers.jsonl", orient="records", lines=True)

for testing_config in llm_config["testing_config"]:
    print("-" * 80)
    print(testing_config.get("name") or testing_config.get("model_name"))

    # Work on a copy so each model starts from the untouched reference frame.
    responses = reference_answers_df.copy()
    responses["response"] = ""
    llm = create_llm(testing_config)

    for index, row in responses.iterrows():
        question = row["user_input"]
        print(index, question[:40])
        # RAG entries go through the retriever chain, all others are asked directly.
        request_fn = rag_request if testing_config.get("rag") else qna_request
        answer = request_fn(llm, testing_config.get("template"), question)
        print("Answer: " + answer[:40])
        responses.at[index, "response"] = answer

    testing_config_name = get_testing_config_name(testing_config)
    responses_path = f"{output_directory}/{testing_config_name}_responses.jsonl"
    responses.to_json(responses_path, orient="records", lines=True)

## Grade responses using InstructLab

In [13]:
# Responses are read from the run directory; scores go into its ilab_scores/ subfolder.
llm_config = get_config()
responses_directory = get_output_dir()
output_directory = f"{responses_directory}/ilab_scores"
os.makedirs(output_directory, exist_ok=True)

In [None]:
from instructlab_ragas import ModelConfig, RagasEvaluator, RunConfig, Sample
import os

# Grade each model's saved responses with the judge and write one score file per model.
for testing_config in llm_config["testing_config"]:
    testing_config_name = get_testing_config_name(testing_config)
    print("-" * 80)
    print(testing_config_name)

    responses_filename = f"{responses_directory}/{testing_config_name}_responses.jsonl"
    print(responses_filename)
    responses = pd.read_json(responses_filename, orient="records", lines=True)

    # Only these three fields are passed to the evaluator.
    graded_columns = ["user_input", "reference", "response"]
    responses_list = responses[graded_columns].to_dict(orient="records")

    os.environ["OPENAI_API_KEY"] = llm_config["judge"]["api_key"]
    evaluator = RagasEvaluator()
    evaluation_result = evaluator.run(dataset=responses_list)

    # Attach one score to each graded record.
    rubric_scores = [score["domain_specific_rubrics"] for score in evaluation_result.scores]
    scores = pd.DataFrame(responses_list)
    scores["score"] = rubric_scores

    scores_filename = f"{output_directory}/{testing_config_name}_ilab_scores"
    scores.to_json(f"{scores_filename}.jsonl", orient="records", lines=True)


## Create resulting score report CSV

In [None]:
# Reload the config and point output_directory at the ilab_scores/ subfolder,
# so the report cells can run without the grading cells above.
llm_config = get_config()
output_directory = f"{get_output_dir()}/ilab_scores"
os.makedirs(output_directory, exist_ok=True)

In [None]:
# Build a per-question score table with one column per model, append an
# "Average" row, and save it as ilab_summary_scores.csv.

# NOTE(review): judge_client, judge_model_name and judge_name are assigned here
# but not used by any code visible in this notebook — confirm whether they are
# still needed.
judge_client = OpenAI(api_key=llm_config["judge"]["api_key"])
judge_model_name = llm_config["judge"]["model_name"]
judge_name = replace_special_char(judge_model_name)

summary_output_df = pd.DataFrame()

# One "<config name>_score" column per model, read from that model's score file.
for testing_config in llm_config["testing_config"]:
    testing_config_name = get_testing_config_name(testing_config)
    scores_filename = f"{output_directory}/{testing_config_name}_ilab_scores.jsonl"
    print(f"Adding {scores_filename}")
    scores = pd.read_json(scores_filename, orient="records", lines=True)
    summary_output_df[f"{testing_config_name}_score"] = scores["score"]

# Column-wise mean over the numeric score columns.
average_row = summary_output_df.mean(axis=0, numeric_only=True)
print(average_row)
# Store the averages under the label len(df); presumably the frame has a
# default 0..n-1 index, so this appends a new last row — verify if the score
# files are ever loaded with a different index.
summary_output_df.loc[len(summary_output_df)] = average_row
# Labels Q1..Qn for the question rows (every row but the last), then "Average".
question_indices = [f"Q{i + 1}" for i in range(len(summary_output_df) - 1)]
question_indices.append("Average")
summary_output_df.insert(0, 'question index', question_indices)

summary_filepath = f"{output_directory}/ilab_summary_scores"
# summary_output_df.to_json(f"{summary_filepath}.jsonl", orient="records", lines=True)
summary_output_df.to_csv(f"{summary_filepath}.csv", index=False)

In [None]:
# Bundle the summary and every per-model score table into one workbook.
with pd.ExcelWriter(f"{output_directory}/ilab_scores.xlsx") as writer:
    # First sheet: the summary table saved by the previous cell.
    summary_output_df = pd.read_csv(f"{summary_filepath}.csv")
    summary_output_df.to_excel(writer, sheet_name="Summary", index=False)

    # Then one sheet per model, named after its config and cut to 30 characters.
    for testing_config in llm_config["testing_config"]:
        testing_config_name = get_testing_config_name(testing_config)
        sheet_name = f"{testing_config_name}_ilab_scores"[:30]
        scores_filename = f"{output_directory}/{testing_config_name}_ilab_scores.jsonl"
        scores = pd.read_json(scores_filename, orient="records", lines=True)
        scores.to_excel(writer, sheet_name=sheet_name)