# Evaluation with Data
In this notebook, we introduce built-in evaluators and guide you through creating your own custom evaluators. We'll cover both code-based and prompt-based custom evaluators. Finally, we'll demonstrate how to use the `evaluate` API to assess data using these evaluators.


In [6]:
# Remove any previous promptflow installation first.
# Older promptflow releases shipped as a single package; it has since been
# split into several packages, so stale pieces are uninstalled explicitly.
# `%pip` is used instead of `! pip` so the commands act on the environment of
# the running kernel rather than on whatever `pip` is first on PATH.
%pip uninstall -y promptflow promptflow-cli promptflow-azure promptflow-core promptflow-devkit promptflow-tools promptflow-evals

# Install the pinned promptflow-evals build from the extra package index.
%pip install promptflow-evals==0.3.0.post3 --extra-index-url https://azuremlsdktestpypi.azureedge.net/promptflow/

[0mFound existing installation: promptflow-azure 1.11.0
Uninstalling promptflow-azure-1.11.0:
  Successfully uninstalled promptflow-azure-1.11.0
Found existing installation: promptflow-core 1.12.0.dev127371921
Uninstalling promptflow-core-1.12.0.dev127371921:
  Successfully uninstalled promptflow-core-1.12.0.dev127371921
Found existing installation: promptflow-devkit 1.12.0.dev127371921
Uninstalling promptflow-devkit-1.12.0.dev127371921:
  Successfully uninstalled promptflow-devkit-1.12.0.dev127371921
[0mFound existing installation: promptflow-evals 0.3.0.post2
Uninstalling promptflow-evals-0.3.0.post2:
  Successfully uninstalled promptflow-evals-0.3.0.post2
Looking in indexes: https://pypi.org/simple, https://azuremlsdktestpypi.azureedge.net/promptflow/
Collecting promptflow-evals==0.3.0.post3
  Downloading https://azuremlsdktestpypi.blob.core.windows.net/repo/promptflow/promptflow_evals-0.3.0.post3-py3-none-any.whl?sv=2023-08-03&st=2024-05-23T01%3A23%3A47Z&se=2025-05-23T01%3A23%3A4

## Evaluate the eval dataset using the fine-tuned model

In [16]:
import os
from dotenv import load_dotenv

load_dotenv()

# Experiment identity; every dataset path below is derived from these two values.
experiment_name = "vampire-bats-6"
experiment_dir = "dataset/" + experiment_name + "-files"

# `-hf.*` files: the question file passed to eval.py and the answer files it
# writes (fine-tuned model and baseline).
dataset_path_hf_eval = "/".join((experiment_dir, experiment_name + "-hf.eval.jsonl"))
dataset_path_hf_eval_answer = "/".join((experiment_dir, experiment_name + "-hf.eval.answer.jsonl"))
dataset_path_hf_eval_answer_baseline = "/".join((experiment_dir, experiment_name + "-hf.eval.answer.baseline.jsonl"))

# `-ft.*` files: the outputs of format.py and the scored results of pfeval.py.
dataset_path_ft_eval = "/".join((experiment_dir, experiment_name + "-ft.eval.jsonl"))
dataset_path_ft_eval_baseline = "/".join((experiment_dir, experiment_name + "-ft.eval.baseline.jsonl"))
dataset_path_ft_eval_score = "/".join((experiment_dir, experiment_name + "-ft.eval.score.jsonl"))

# Endpoint, key and deployment used for the baseline run (None when unset).
EVAL_OPENAI_BASE_URL_BASE = os.environ.get('EVAL_OPENAI_BASE_URL_BASE')
EVAL_OPENAI_API_KEY_BASE = os.environ.get('EVAL_OPENAI_API_KEY_BASE')
EVAL_OPENAI_DEPLOYMENT_BASE = os.environ.get('EVAL_OPENAI_DEPLOYMENT_BASE')

# Endpoint, key and deployment used for the fine-tuned run (None when unset).
EVAL_OPENAI_BASE_URL_FT = os.environ.get('EVAL_OPENAI_BASE_URL_FT')
EVAL_OPENAI_API_KEY_FT = os.environ.get('EVAL_OPENAI_API_KEY_FT')
EVAL_OPENAI_DEPLOYMENT_FT = os.environ.get('EVAL_OPENAI_DEPLOYMENT_FT')

def obfuscate(secret):
    """Mask a secret for display, keeping only its last four characters.

    Args:
        secret: The secret string to mask, or None when the value is not set
            (for example an environment variable that is missing).

    Returns:
        "<not set>" when ``secret`` is None. Otherwise a string of the same
        length as ``secret`` in which every character except the last four is
        replaced by '.'. Secrets of four characters or fewer are masked
        completely so that a short value is never printed in full.
    """
    if secret is None:
        return "<not set>"
    visible = 4
    if len(secret) <= visible:
        # Showing the "last four" here would reveal the entire secret.
        return '.' * len(secret)
    return '.' * (len(secret) - visible) + secret[-visible:]

# Echo the experiment name and every derived dataset path as `name=value` lines.
for label, value in (
    ("experiment_name", experiment_name),
    ("experiment_dir", experiment_dir),
    ("dataset_path_hf_eval", dataset_path_hf_eval),
    ("dataset_path_hf_eval_answer", dataset_path_hf_eval_answer),
    ("dataset_path_hf_eval_answer_baseline", dataset_path_hf_eval_answer_baseline),
    ("dataset_path_ft_eval", dataset_path_ft_eval),
    ("dataset_path_ft_eval_baseline", dataset_path_ft_eval_baseline),
    ("dataset_path_ft_eval_score", dataset_path_ft_eval_score),
):
    print(f"{label}={value}")

# Baseline endpoint settings; the API key is masked by obfuscate() before printing.
print(f"EVAL_OPENAI_BASE_URL_BASE={EVAL_OPENAI_BASE_URL_BASE}")
print("EVAL_OPENAI_API_KEY_BASE=" + obfuscate(EVAL_OPENAI_API_KEY_BASE))
print(f"EVAL_OPENAI_DEPLOYMENT_BASE={EVAL_OPENAI_DEPLOYMENT_BASE}")

# Fine-tuned endpoint settings; the API key is masked the same way.
print(f"EVAL_OPENAI_BASE_URL_FT={EVAL_OPENAI_BASE_URL_FT}")
print("EVAL_OPENAI_API_KEY_FT=" + obfuscate(EVAL_OPENAI_API_KEY_FT))
print(f"EVAL_OPENAI_DEPLOYMENT_FT={EVAL_OPENAI_DEPLOYMENT_FT}")


experiment_name=vampire-bats-6
experiment_dir=dataset/vampire-bats-6-files
dataset_path_hf_eval=dataset/vampire-bats-6-files/vampire-bats-6-hf.eval.jsonl
dataset_path_hf_eval_answer=dataset/vampire-bats-6-files/vampire-bats-6-hf.eval.answer.jsonl
dataset_path_hf_eval_answer_baseline=dataset/vampire-bats-6-files/vampire-bats-6-hf.eval.answer.baseline.jsonl
dataset_path_ft_eval=dataset/vampire-bats-6-files/vampire-bats-6-ft.eval.jsonl
dataset_path_ft_eval_baseline=dataset/vampire-bats-6-files/vampire-bats-6-ft.eval.baseline.jsonl
dataset_path_ft_eval_score=dataset/vampire-bats-6-files/vampire-bats-6-ft.eval.score.jsonl
EVAL_OPENAI_BASE_URL_BASE=https://Llama-2-7b-raft-ucb-sh-man-yzqgd-serverless.westus3.inference.ai.azure.com/v1
EVAL_OPENAI_API_KEY_BASE=............................pDkf
EVAL_OPENAI_DEPLOYMENT_BASE=Llama-2-7b-lnqzi
EVAL_OPENAI_BASE_URL_FT=https://Llama-2-7b-raft-vampire-bats-serverless.westus3.inference.ai.azure.com/v1
EVAL_OPENAI_API_KEY_FT=............................wnr

### Baseline

In [None]:
# Generate the baseline answers with the base model.
# The Azure OpenAI variables are unset so that only the OPENAI_BASE_URL /
# OPENAI_API_KEY given on this command line are passed to eval.py.
# Endpoint, key and deployment must all be the *_BASE values: pairing the
# base key and deployment with the fine-tuned endpoint would send the request
# to the wrong service.
!unset AZURE_OPENAI_ENDPOINT && \
unset AZURE_OPENAI_API_KEY && \
unset OPENAI_API_VERSION && \
OPENAI_BASE_URL=$EVAL_OPENAI_BASE_URL_BASE \
OPENAI_API_KEY=$EVAL_OPENAI_API_KEY_BASE \
python ../eval.py \
    --question-file $dataset_path_hf_eval \
    --answer-file $dataset_path_hf_eval_answer_baseline \
    --model $EVAL_OPENAI_DEPLOYMENT_BASE

### Fine-tuned model

In [None]:
# Generate answers with the fine-tuned model.
# The Azure OpenAI variables are unset and the *_FT endpoint and key are set
# only for this command; eval.py reads the questions from
# dataset_path_hf_eval and writes answers to dataset_path_hf_eval_answer.
!unset AZURE_OPENAI_ENDPOINT && \
unset AZURE_OPENAI_API_KEY && \
unset OPENAI_API_VERSION && \
OPENAI_BASE_URL=$EVAL_OPENAI_BASE_URL_FT \
OPENAI_API_KEY=$EVAL_OPENAI_API_KEY_FT \
python ../eval.py \
    --question-file $dataset_path_hf_eval \
    --answer-file $dataset_path_hf_eval_answer \
    --model $EVAL_OPENAI_DEPLOYMENT_FT

## 0. Prepare eval dataset

In [27]:
# Convert the fine-tuned model's answer file (jsonl) into the eval-format
# jsonl file that the rest of the notebook loads.
! python ../format.py \
    --input $dataset_path_hf_eval_answer \
    --input-type jsonl \
    --output $dataset_path_ft_eval \
    --output-format eval

[32m2024-05-23 07:45:31[0m [1;30m INFO[0m [    ] [34mraft[0m Dataset has 93 rows
[32m2024-05-23 07:45:31[0m [1;30m INFO[0m [    ] [34mraft[0m Converting jsonl file dataset/vampire-bats-6-files/vampire-bats-6-hf.eval.answer.jsonl to jsonl eval file dataset/vampire-bats-6-files/vampire-bats-6-ft.eval.jsonl
Creating json from Arrow format: 100%|████████████| 1/1 [00:00<00:00, 86.07ba/s]


In [None]:
# Same conversion as for the fine-tuned answers, applied to the baseline
# answer file.
! python ../format.py \
    --input $dataset_path_hf_eval_answer_baseline \
    --input-type jsonl \
    --output $dataset_path_ft_eval_baseline \
    --output-format eval

In [18]:
import pandas as pd

In [28]:
# Load the eval-format file for the fine-tuned model (one JSON object per
# line) and preview the first rows.
df = pd.read_json(dataset_path_ft_eval, lines=True)
df.head()

Unnamed: 0,question,answer,gold_final_answer,final_answer,context
0,What kind of vision do most microbats have?,"To answer the question, we need to identify th...",Mesopic vision,Mesopic vision,<DOCUMENT>These bats must dealwith changes in ...
1,What year was Anne Rice's Vampire Chronicles p...,"To answer the question, we need to identify th...",1976,1976,<DOCUMENT>Huntingtonbrought surfing to the Cal...
2,In how many films has Dracula appeared?,"To answer the question, we need to identify th...",The exact number of films is not specified in ...,More than one,<DOCUMENT>This formula was followed in novelis...
3,Who is the character that appears in more film...,"To answer the question, we need to identify th...",Sherlock Holmes,Sherlock Holmes,"<DOCUMENT>At rest, theymay wrap their wings ar..."
4,What was the time period of Stephenie Meyer's ...,"To answer the question, we need to identify th...",2005-2008,Modern,<DOCUMENT>.</DOCUMENT>\n<DOCUMENT>This formula...


In [None]:
# Preview the baseline eval-format file without keeping it in a variable.
pd.read_json(dataset_path_ft_eval_baseline, lines=True).head()

## 1. Built-in Evaluators

The table below lists all the built-in evaluators we support. In the following sections, we will select a few of these evaluators to demonstrate how to use them.

| Category       | Namespace                                        | Evaluator Class           | Notes                                             |
|----------------|--------------------------------------------------|---------------------------|---------------------------------------------------|
| Quality        | promptflow.evals.evaluators                      | GroundednessEvaluator     | Measures how well the answer is entailed by the context and is not hallucinated |
|                |                                                  | RelevanceEvaluator        | How well the answer addresses the main aspects of the question, based on the context. Consider whether all and only the important aspects are contained in the answer when evaluating relevance. |
|                |                                                  | CoherenceEvaluator        | How well all the sentences fit together and sound naturally as a whole. |
|                |                                                  | FluencyEvaluator          | Quality of individual sentences in the answer, and whether they are well-written and grammatically correct. |
|                |                                                  | SimilarityEvaluator       | Measures the similarity between the predicted answer and the correct answer |
|                |                                                  | F1ScoreEvaluator          | F1 score |
| Content Safety | promptflow.evals.evaluators.content_safety       | ViolenceEvaluator         |                                                   |
|                |                                                  | SexualEvaluator           |                                                   |
|                |                                                  | SelfHarmEvaluator         |                                                   |
|                |                                                  | HateUnfairnessEvaluator   |                                                   |
| Composite      | promptflow.evals.evaluators                      | QAEvaluator               | Built on top of individual quality evaluators.    |
|                |                                                  | ChatEvaluator             | Similar to QAEvaluator but designed for evaluating chat messages. |
|                |                                                  | ContentSafetyEvaluator    | Built on top of individual content safety evaluators. |



### 1.1 Quality Evaluator

In [10]:
import os
from promptflow.core import AzureOpenAIModelConfiguration

# Connection settings for the Azure OpenAI deployment used by the evaluators.
# They are read from the environment so that no secret is stored in the notebook.
azure_endpoint = os.environ.get("EVAL_AZURE_OPENAI_ENDPOINT_EVALUATORS")
api_key = os.environ.get("EVAL_AZURE_OPENAI_API_KEY_EVALUATORS")
azure_deployment = os.environ.get("EVAL_AZURE_OPENAI_DEPLOYMENT_EVALUATORS")
api_version = os.environ.get("EVAL_OPENAI_API_VERSION_EVALUATORS")

# Fail early, naming the missing variables, instead of hitting an opaque
# TypeError when a None value is concatenated to a string below.
_missing_settings = [
    env_name
    for env_name, value in (
        ("EVAL_AZURE_OPENAI_ENDPOINT_EVALUATORS", azure_endpoint),
        ("EVAL_AZURE_OPENAI_DEPLOYMENT_EVALUATORS", azure_deployment),
        ("EVAL_OPENAI_API_VERSION_EVALUATORS", api_version),
    )
    if value is None
]
if _missing_settings:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

# The API key is deliberately not printed.
print("azure_endpoint=" + azure_endpoint)
print("azure_deployment=" + azure_deployment)
print("api_version=" + api_version)

# Initialize Azure OpenAI Connection
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint=azure_endpoint,
    api_key=api_key,
    azure_deployment=azure_deployment,
    api_version=api_version,
)

# Azure AI project passed to the evaluators that take a `project_scope`.
# NOTE(review): these identifiers are specific to one subscription; adjust
# them when running the notebook elsewhere.
project_scope = {
    "subscription_id": "75703df0-38f9-4e2e-8328-45f6fc810286",
    "resource_group_name": "rg_cvi_ai_mvpsummit24_useast2",
    "project_name": "cvi_mvpsummit24"
}

azure_endpoint=https://ai-cviaiwestus1288043977207.openai.azure.com/
azure_deployment=gpt-4-turbo
api_version=2023-03-15-preview


In [30]:
from promptflow.evals.evaluators import RelevanceEvaluator

# Initializing the Relevance Evaluator with the Azure OpenAI model configuration
relevance_eval = RelevanceEvaluator(model_config)

In [None]:
# Take the row at positional index 1 of df as a single example to score.
sample=df.iloc[1]
sample

In [None]:
# Running Relevance Evaluator on single input row.
# The ground truth lives in the `gold_final_answer` column of the eval-format
# dataset; the dataset has no `gold_answer` column.
relevance_score = relevance_eval(
    question=sample['question'],
    answer=sample['final_answer'],
    context=sample['context'],
    ground_truth=sample['gold_final_answer'],
)
print(relevance_score)

In [12]:
# Hard-coded single example so the evaluator cells below can run without the
# dataset; the question and answer match row 0 of the df.head() preview above.
question="What kind of vision do most microbats have?"
answer="Mesopic vision"
context="""<DOCUMENT>These bats must dealwith changes in the Doppler shift due to changes in their flightspeed. They have adapted to change their pulse emissionfrequency in relation to their flight speed so echoes still return inthe optimal hearing range.[94][95]In addition to echolocating prey, bat ears are sensitive to soundsmade by their prey, such as the fluttering of moth wings. Thecomplex geometry of ridges on the inner surface of bat ears helps to sharply focus echolocation signals, and topassively listen for any other sound produced by the prey. These ridges can be regarded as the acousticequivalent of a Fresnel lens, and exist in a large variety of unrelated animals, such as the aye-aye, lesser galago,bat-eared fox, mouse lemur, and others.[96][97][98] Bats can estimate the elevation of their target using theinterference patterns from the echoes reflecting from the tragus, a flap of skin in the external ear.[92]By repeated scanning, bats can mentally construct an accurate image of theenvironment in which they are moving and of their prey.[101] Some speciesof moth have exploited this, such as the tiger moths, which producesaposematic ultrasound signals to warn bats that they are chemicallyprotected and therefore distasteful.[99][100] Moth species including the tigermoth can produce signals to jam bat echolocation. 
Many moth species havea hearing organ called a tympanum, which responds to an incoming batsignal by causing the moth's flight muscles to twitch erratically, sending themoth into random evasive manoeuvres.[102][103][104]The eyes of most microbat species are small and poorly developed, leadingto poor visual acuity, but no species is blind.[105] Most microbats have mesopic vision, meaning that they candetect light only in low levels, whereas other mammals have photopic vision, which allows colour vision.Microbats may use their vision for orientation and while travelling between their roosting grounds and feedinggrounds, as echolocation is effective only over short distances.</DOCUMENT> <DOCUMENT>She reads hisjournal and passes it along to Van Helsing. This unfolds the first clue to the identity of Lucy'sassailant, which later prompts Mina to collect all of the events of Dracula's appearance in newsarticles, saved letters, newspaper clippings and the journals of each member of the group. This assiststhe group in investigating Dracula's movements and later discovering that Renfield's behaviour isdirectly influenced by Dracula.</DOCUMENT> <DOCUMENT>The recent development offoilboards, which plane very early on a hydrofoil fin and therebylift off the water producing low friction, represent the idealcomplementary hydrodynamic platform for wings.[3]The history of wing foiling, or simply "winging" begins with theinvention of pre-hydrofoil technology wing surfing dating back to1981, when aeronautical engineer Jim Drake, the same individualwho also invented windsurfing,[4] and Uli Stanciu, Europeanwindsurfing pioneer, together invented and patented the world'sfirst wing.[5] Their patented concept was used on a large, non-foiling windsurf board of that era. 
Drake's wing was theoreticallybased on the symmetrical shape of a flying fish.</DOCUMENT> <DOCUMENT>Thus vampires were merely sufferers of porphyria seeking toreplace haem and alleviate their symptoms.[133]The theory has been rebuffed medically as suggestions that porphyria sufferers crave the haem inhuman blood, or that the consumption of blood might ease the symptoms of porphyria, are based on amisunderstanding of the disease. Furthermore, Dolphin was noted to have confused fictional(bloodsucking) vampires with those of folklore, many of whom were not noted to drink blood.[134]Similarly, a parallel is made between sensitivity to sunlight by sufferers, yet this was associated withfictional and not folkloric vampires.</DOCUMENT>"""
ground_truth="Mesopic vision"

In [11]:
from promptflow.evals.evaluators import GroundednessEvaluator, SimilarityEvaluator

# Initializing the Groundedness and Similarity evaluators.
# NOTE(review): groundedness is built from `project_scope` while similarity
# uses `model_config` — confirm this matches the installed promptflow-evals version.
groundedness_eval = GroundednessEvaluator(project_scope=project_scope)
similarity_eval = SimilarityEvaluator(model_config)

In [13]:
# Running the Groundedness Evaluator on the single hard-coded example.
# NOTE(review): the result is stored in `relevance_score` even though it comes
# from `groundedness_eval`; the name is kept unchanged here.
relevance_score = groundedness_eval(
    question=question,
    answer=answer,
    context=context,
    ground_truth=ground_truth,
)
print(relevance_score)

[2024-05-23 02:17:04 +0000][flowinvoker][INFO] - Getting connections from pf client with provider from args: local...
[2024-05-23 02:17:04 +0000][flowinvoker][INFO] - Promptflow get connections successfully. keys: dict_keys([])
[2024-05-23 02:17:04 +0000][flowinvoker][INFO] - Promptflow executor starts initializing...
[2024-05-23 02:17:04 +0000][flowinvoker][INFO] - Promptflow executor initiated successfully.
[2024-05-23 02:17:04 +0000][flowinvoker][INFO] - Validating flow input with data {'question': 'What kind of vision do most microbats have?', 'answer': 'Mesopic vision', 'context': '<DOCUMENT>These bats must dealwith changes in the Doppler shift due to changes in their flightspeed. They have adapted to change their pulse emissionfrequency in relation to their flight speed so echoes still return inthe optimal hearing range.[94][95]In addition to echolocating prey, bat ears are sensitive to soundsmade by their prey, such as the fluttering of moth wings. Thecomplex geometry of ridges 

2024-05-23 02:17:04 +0000   85117 execution.flow     INFO     Start executing nodes in thread pool mode.
2024-05-23 02:17:04 +0000   85117 execution.flow     INFO     Start to run 2 nodes with concurrency level 16.
2024-05-23 02:17:04 +0000   85117 execution.flow     INFO     Executing node validate_inputs. node run id: f9719024-9e60-4cf1-abb8-d56181d5fdae_validate_inputs_6eae33fa-4106-4fa0-b988-957c845cb313
2024-05-23 02:17:04 +0000   85117 execution.flow     INFO     Node validate_inputs completes.
2024-05-23 02:17:04 +0000   85117 execution.flow     INFO     The node 'evaluate_with_rai_service' will be executed because the activate condition is met, i.e. '${validate_inputs.output}' is equal to 'True'.
2024-05-23 02:17:04 +0000   85117 execution.flow     INFO     Executing node evaluate_with_rai_service. node run id: f9719024-9e60-4cf1-abb8-d56181d5fdae_evaluate_with_rai_service_159dedd1-a354-45ad-874e-5324f4952e37
2024-05-23 02:17:09 +0000   85117 execution.flow     INFO     Node ev

In [25]:
import tiktoken
def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
    """Count the tokens that a tiktoken encoding produces for a string.

    Args:
        string: Text to tokenize; a falsy value (empty string or None) counts as 0.
        encoding_name: Name of the tiktoken encoding to use.

    Returns:
        The number of tokens in ``string`` under the given encoding.
    """
    if string:
        return len(tiktoken.get_encoding(encoding_name).encode(string))
    return 0

In [23]:
# Number of tokens in the context that precede the first occurrence of
# "mesopic", i.e. how far into the context that word first appears.
num_tokens_from_string(context.split("mesopic")[0])

376

## 2. Batch evaluate

In [7]:
# Reload the eval-format dataset that is about to be scored in batch and
# preview it.
df = pd.read_json(dataset_path_ft_eval, lines=True)
df.head()

Unnamed: 0,question,gold_final_answer,context
0,What kind of vision do most microbats have?,Mesopic vision,<DOCUMENT>These bats must dealwith changes in ...
1,What year was Anne Rice's Vampire Chronicles p...,1976,<DOCUMENT>Huntingtonbrought surfing to the Cal...
2,In how many films has Dracula appeared?,The exact number of films is not specified in ...,<DOCUMENT>This formula was followed in novelis...
3,Who is the character that appears in more film...,Sherlock Holmes,"<DOCUMENT>At rest, theymay wrap their wings ar..."
4,What was the time period of Stephenie Meyer's ...,2005-2008,<DOCUMENT>.</DOCUMENT>\n<DOCUMENT>This formula...


In [None]:
# Score the whole eval dataset with pfeval.py and write the results to
# dataset_path_ft_eval_score.
# The $azure_endpoint, $api_key, $azure_deployment and $api_version values are
# the Python variables defined in the Quality Evaluator cell; they are passed
# to the script as environment variables for this command only.
!AZURE_OPENAI_ENDPOINT=$azure_endpoint \
    AZURE_OPENAI_API_KEY=$api_key \
    AZURE_OPENAI_DEPLOYMENT=$azure_deployment \
    OPENAI_API_VERSION=$api_version \
    python ../pfeval.py \
    --input $dataset_path_ft_eval \
    --output $dataset_path_ft_eval_score

In [None]:
# Load the scored results written by pfeval.py; this replaces the unscored
# frame previously held in `df`.
df = pd.read_json(dataset_path_ft_eval_score, lines=True)
df.head()

In [None]:
# Summary statistics for the numeric columns of the scored results.
df.describe()

## 3. Using Evaluate API to evaluate with data

In previous sections, we walked you through how to use built-in evaluators to evaluate a single row and how to define your own custom evaluators. Now, we will show you how to use these evaluators with the powerful `evaluate` API to assess an entire dataset.

First, let's take a peek at what the data looks like.

In [None]:
# Preview whatever frame `df` currently holds (the most recently loaded one).
df.head()

Now, we will invoke the `evaluate` API using a few of the evaluators that we have already initialized.

Additionally, we use a column mapping to map the `gold_answer` column from the dataset to `ground_truth`, which is the argument name accepted by the evaluators.

In [8]:
def extract_final_answer(cot_answer: str) -> dict:
    """
    Extracts the final answer from the cot_answer field.

    Args:
        cot_answer: Full answer text; the final answer is expected to follow
            the last "<ANSWER>: " marker.

    Returns:
        A dict ``{"final_answer": text}`` where ``text`` is everything after
        the last "<ANSWER>: " marker, or the whole input when the marker is
        absent. A dict is returned so that the value can be referenced by key.
    """
    return {"final_answer": cot_answer.split("<ANSWER>: ")[-1]}

In [9]:
from promptflow.evals.evaluate import evaluate

# Run the groundedness and similarity evaluators over a dataset file.
# `target` is applied to the data first; its `final_answer` output is then fed
# to the evaluators as `answer` through the column mapping below.
result = evaluate(
    # Alternative input: the full eval-format dataset.
    #data=dataset_path_ft_eval,
    # NOTE(review): presumably a 10-row subset of the eval file — confirm.
    data="dataset/vampire-bats-6-files/vampire-bats-6-hf.eval.10.jsonl",
    target=extract_final_answer,
    evaluators={
        "groundedness": groundedness_eval,
        "similarity": similarity_eval
    },
    # column mapping: `${target.*}` refers to the output of `target`,
    # `${data.*}` to a column of the input file
    evaluator_config={
        "default": {
            "answer": "${target.final_answer}",
            "question": "${data.question}",
            "context": "${data.context}",
            "ground_truth": "${data.gold_answer}",
        }
    }
)

# Render the result as an interactive JSON tree in the notebook.
from IPython.display import display, JSON
display(JSON(result))



Starting prompt flow service...
Start prompt flow service on port 23333, version: 1.12.0.dev127371921.
You can stop the prompt flow service with the following command:'[1mpf service stop[0m'.
Alternatively, if no requests are made within 1 hours, it will automatically stop.
You can view the traces in local from http://127.0.0.1:23333/v1.0/ui/traces/?#run=main_extract_final_answer_s7fbcatm_20240523_020614_218557


[2024-05-23 02:06:19 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run main_extract_final_answer_s7fbcatm_20240523_020614_218557, log path: /home/vscode/.promptflow/.runs/main_extract_final_answer_s7fbcatm_20240523_020614_218557/logs.txt


SpawnedForkProcessManagerStartFailure: Failed to start spawned fork process manager


Finally, let's check the results produced by the evaluate API.

In [None]:
# Check the results using Azure AI Studio UI
# NOTE(review): the evaluate() call above is not given an Azure AI project, so
# `studio_url` may be empty — confirm against the promptflow-evals docs.
print(result["studio_url"])

## Analyze the results

In [48]:
# Point at a specific score file, overriding the path derived in the
# configuration cell, then load and preview it.
dataset_path_ft_eval_score = "dataset/vampire-bats-6-files/vampire-bats-6-ft.eval.answer.1864.score.jsonl"
df = pd.read_json(dataset_path_ft_eval_score, lines=True)
df.head()

Unnamed: 0,question,answer,gold_final_answer,final_answer,context,gpt_groundedness,gpt_similarity,error
0,What is the name of the character that appears...,"To answer the question, we need to identify th...",Sherlock Holmes,Dracula,<DOCUMENT>Further exterior shots followed in L...,3.0,,
1,Who is the character that appears in more film...,"To answer the question, we need to identify th...",Sherlock Holmes,Sherlock Holmes,"<DOCUMENT>At rest, theymay wrap their wings ar...",3.0,5.0,
2,What kind of vision do most microbats have?,"To answer the question, we need to identify th...",Mesopic vision,Mesopic vision,<DOCUMENT>These bats must dealwith changes in ...,5.0,5.0,
3,What is the name of the industries that the va...,"To answer the question, we need to identify th...","film, television, and gaming industries.","film, television, and gaming",<DOCUMENT>Drac and Johnny begin to bond and ha...,3.0,,
4,What is the title of the novel that many early...,"To answer the question, we need to identify th...",Dracula,Dracula,"<DOCUMENT>World Champion from 1976 to1979, Ove...",1.0,5.0,


In [51]:
# Number of rows in the loaded score file.
len(df)

1864

In [49]:
# Summary statistics for the numeric score columns; `count` excludes rows
# where a score is missing.
df.describe()

Unnamed: 0,gpt_groundedness,gpt_similarity
count,1849.0,1005.0
mean,3.169822,4.768159
std,1.487079,0.933539
min,1.0,1.0
25%,3.0,5.0
50%,3.0,5.0
75%,5.0,5.0
max,5.0,5.0


In [32]:
# Rows whose groundedness score is below 5; rows with a missing score are
# not selected because a NaN comparison evaluates to False.
df[df['gpt_groundedness'] < 5]

Unnamed: 0,question,answer,gold_final_answer,final_answer,context,gpt_groundedness,gpt_similarity,error
0,Who is the character that appears in more film...,"To answer the question, we need to identify th...",Sherlock Holmes,Sherlock Holmes,"<DOCUMENT>At rest, theymay wrap their wings ar...",3.0,5.0,
1,What is the name of the character that appears...,"To answer the question, we need to identify th...",Sherlock Holmes,Dracula,<DOCUMENT>Further exterior shots followed in L...,3.0,,
3,What is the title of the novel that many early...,"To answer the question, we need to identify th...",Dracula,Dracula,"<DOCUMENT>World Champion from 1976 to1979, Ove...",3.0,5.0,
5,Who is considered one of the preeminent figure...,"To answer the question, we need to identify th...",The vampire,The vampire,<DOCUMENT>Lower kite angles are possible for m...,1.0,5.0,
6,In how many films has Dracula appeared?,"To answer the question, we need to identify th...",The exact number of films is not specified in ...,A large number of films.,<DOCUMENT>This formula was followed in novelis...,3.0,,
...,...,...,...,...,...,...,...,...
95,What is the purpose of the required equipment?,"To answer the question, we need to identify th...",To provide protection to the competitors durin...,To provide protection.,<DOCUMENT>His acceptance intothe school was a ...,3.0,,
96,Here are 10 example questions:,Please provide the question you'd like me to a...,Please provide the question you'd like me to a...,Please provide the question you'd like me to a...,<DOCUMENT>A 3D accelerometer isworn to measure...,3.0,5.0,
97,5. Is Magma a rock group?,"To answer the question, we need to determine i...",Yes,Yes,<DOCUMENT>Male little yellow-shouldered bats (...,1.0,5.0,
98,4. How many groups did Jodorowsky approach for...,"To answer the question, we need to identify th...",2,Multiple groups,<DOCUMENT>The Fremen put thecommunity before t...,1.0,,


In [50]:
# Export the scored results to an Excel workbook next to the jsonl file.
# NOTE(review): despite its `_csv` suffix, this variable holds an .xlsx path.
dataset_path_ft_eval_score_csv = dataset_path_ft_eval_score.replace(".jsonl", ".xlsx")
df.to_excel(dataset_path_ft_eval_score_csv, index=False)

In [38]:
# openpyxl is needed by DataFrame.to_excel to write .xlsx files; run this cell
# before any to_excel call. `%pip` installs into the running kernel's
# environment, and the version is pinned for reproducibility.
%pip install openpyxl==3.1.2

Collecting openpyxl
  Downloading openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.0/250.0 kB[0m [31m968.8 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting et-xmlfile
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.2
[0m

In [53]:
# Load the base-model scores from a CSV file; this replaces the fine-tuned
# scores previously held in `df`.
# NOTE(review): the file name suggests an exported output table — confirm its origin.
dataset_path_base_eval_score = "dataset/large_data_evaluate_base_model_Output_Table_05-23-2024-09-54.csv"
df = pd.read_csv(dataset_path_base_eval_score)
df.head()

Unnamed: 0,inputs.question,inputs.answer,inputs.context,inputs.ground_truth,gpt_similarity,gpt_groundedness,index,status
0,What kind of vision do most microbats have?,Most microbats have mesopic vision. $ answer =...,<DOCUMENT>These bats must dealwith changes in ...,Mesopic vision,5.0,5.0,0,Completed
1,What year was Anne Rice's Vampire Chronicles p...,1976.,<DOCUMENT>Huntingtonbrought surfing to the Cal...,1976,5.0,1.0,1,Completed
2,In how many films has Dracula appeared?,Unknown.,<DOCUMENT>This formula was followed in novelis...,The exact number of films is not specified in ...,5.0,1.0,2,Completed
3,Who is the character that appears in more film...,The character that appears in more films than ...,"<DOCUMENT>At rest, theymay wrap their wings ar...",Sherlock Holmes,1.0,1.0,3,Completed
4,What was the time period of Stephenie Meyer's ...,\nPlease let me know if you have any questio...,<DOCUMENT>.</DOCUMENT>\n<DOCUMENT>This formula...,2005-2008,1.0,1.0,4,Completed


In [47]:
# Export the base-model scores to an Excel workbook next to the CSV file.
dataset_path_base_eval_score_excel = dataset_path_base_eval_score.replace(".csv", ".xlsx")
df.to_excel(dataset_path_base_eval_score_excel, index=False)

In [54]:
# Summary statistics for the numeric columns of the base-model scores.
df.describe()

Unnamed: 0,gpt_similarity,gpt_groundedness,index
count,1793.0,1802.0,1854.0
mean,2.730619,2.740844,927.533441
std,1.831168,1.951661,535.298451
min,1.0,1.0,0.0
25%,1.0,1.0,464.25
50%,2.0,1.0,927.5
75%,5.0,5.0,1390.75
max,5.0,5.0,1854.0
