## GPT-4o evaluation on ETHICS-sc101-PVA dataset

In [9]:
import os
import pandas as pd
from tqdm import tqdm
from langchain.output_parsers import PydanticOutputParser
from langchain_openai import ChatOpenAI
import sys

In [10]:
# Resolve the directory two levels above the current working directory and
# make it importable; the project-local `llm` and `evaluators` imports that
# follow rely on it being on the import path.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))
# Notebook cells get re-run; only append once so sys.path does not fill up
# with duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [11]:
from llm.prompts import (
    alignment_eval_prompt,
    alignment_eval_prompt_en,
    sc_101_eval_prompt_pva_bad,
    sc_101_eval_prompt_pva_bad_en,
)
from evaluators.mixed_pva_evaluator import ModelAlignment

### GPT-4o setup

In [12]:
# Single GPT-4o client shared by both chains in this notebook: it produces the
# answers under test and also acts as the judge that scores them.
# max_tokens=256 is passed to cap the length of each completion.
llm = ChatOpenAI(model='gpt-4o', max_tokens=256)
# Parses the judge's raw output into a ModelAlignment object; the notebook
# reads its `score` and `explanation` fields.
eval_output_parser = PydanticOutputParser(pydantic_object=ModelAlignment)

In [13]:
def evaluate_response(full_prompt: str, prediction: str, is_en: bool = False):
    """Score a model answer with the LLM judge.

    Args:
        full_prompt: The fully formatted prompt the answer was generated from;
            passed to the judge as ``query``.
        prediction: The answer to be judged; passed to the judge as
            ``response``.
        is_en: When True use the English judge prompt, otherwise the default
            one.

    Returns:
        A ``(score, explanation)`` tuple read from the parsed judge output.
    """
    if is_en:
        judge_prompt = alignment_eval_prompt_en
    else:
        judge_prompt = alignment_eval_prompt

    # Prompt -> model -> structured-output parser, composed into one runnable.
    judge_chain = judge_prompt | llm | eval_output_parser
    judge_inputs = {"query": full_prompt, "response": prediction}
    verdict = judge_chain.invoke(judge_inputs)
    return verdict.score, verdict.explanation
    

In [14]:
def get_prediction_and_prompt(query: str, is_en: bool = False):
    """Ask the model about ``query`` and return its answer plus the prompt used.

    Args:
        query: The text substituted into the prompt template.
        is_en: When True use the English prompt template, otherwise the
            default one.

    Returns:
        A ``(prediction, full_prompt)`` tuple: the model's response content
        with surrounding whitespace stripped, and the template formatted with
        ``query``.
    """
    if is_en:
        template = sc_101_eval_prompt_pva_bad_en
    else:
        template = sc_101_eval_prompt_pva_bad

    # Rendered separately so the caller can hand the exact prompt text to the
    # judge; the chain below formats the template again on its own.
    rendered_prompt = template.format(query=query)
    answer = (template | llm).invoke({"query": query})
    return answer.content.strip(), rendered_prompt

In [15]:
def process_df(df: pd.DataFrame, is_en: bool = False) -> pd.DataFrame:
    """Generate and judge a model answer for every row of ``df``.

    For each row the query in the selected language is sent to the model, and
    the resulting answer is then scored by the judge chain.

    Args:
        df: Input frame; must contain ``query_en`` and ``query_ukr`` columns.
        is_en: When True the ``query_en`` column and the English prompts are
            used, otherwise ``query_ukr`` and the default prompts.

    Returns:
        A new DataFrame with one row per input row and the columns
        ``query_en``, ``query_ukr``, ``prediction``, ``score`` and
        ``explanation``. The columns are present even when ``df`` is empty,
        so downstream lookups such as ``result["score"]`` do not fail.
    """
    query_col = "query_en" if is_en else "query_ukr"
    output_columns = ["query_en", "query_ukr", "prediction", "score", "explanation"]
    results = []

    for _, row in tqdm(df.iterrows(), total=len(df), desc=f"Processing {'EN' if is_en else 'UKR'}"):
        query = row[query_col]
        prediction, full_prompt = get_prediction_and_prompt(query, is_en=is_en)
        score, explanation = evaluate_response(full_prompt, prediction, is_en=is_en)
        results.append({
            "query_en": row["query_en"],
            "query_ukr": row["query_ukr"],
            "prediction": prediction,
            "score": score,
            "explanation": explanation,
        })

    # Pin the column set explicitly: without it an empty `results` list would
    # yield a frame with no columns at all.
    return pd.DataFrame(results, columns=output_columns)

### Evaluation

In [16]:
# Load the evaluation set; process_df reads its `query_en` and `query_ukr`
# columns.
df = pd.read_csv("../../data/ethics_sc101_pva.csv")

# Run the full generate-then-judge pass once per language. Each row triggers
# two model calls (one for the answer, one for the judge).
df_en_results = process_df(df, is_en=True)
df_ukr_results = process_df(df, is_en=False)

Processing EN:   0%|          | 0/82 [00:00<?, ?it/s]

Processing EN: 100%|██████████| 82/82 [05:19<00:00,  3.90s/it]
Processing UKR: 100%|██████████| 82/82 [07:11<00:00,  5.27s/it]


In [17]:
# Mean judge score per language; scores are cast to float before averaging.
average_score_en = df_en_results["score"].astype(float).mean()
average_score_ukr = df_ukr_results["score"].astype(float).mean()

In [18]:
# Display the mean judge score for the English run.
average_score_en

np.float64(0.4634146341463415)

In [19]:
# Display the mean judge score for the run over the `query_ukr` column.
average_score_ukr

np.float64(0.45121951219512196)

In [21]:
# Add an `llm_eval` column holding the judge score as a float; the original
# `score` column is left untouched.
df_en_results['llm_eval']= df_en_results['score'].astype(float)
df_ukr_results['llm_eval']= df_ukr_results['score'].astype(float)

In [None]:
# Persist both result frames without the index column.
# Create the output directory first: to_csv does not create missing parent
# directories, and failing here would throw away the completed model run.
os.makedirs("../../results/mixed_bad", exist_ok=True)
df_en_results.to_csv("../../results/mixed_bad/en_gpt-4o.csv", index=False)
df_ukr_results.to_csv("../../results/mixed_bad/gpt-4o.csv", index=False)