In [1]:
#!pip install instructor
#!pip install openai
#!pip install pydantic
#!pip install jsonref
#!pip install langchain
#!pip install -U swifter

In [2]:
import os

import instructor
from langchain.prompts import PromptTemplate
from openai import OpenAI
from pydantic import BaseModel, Field
from tqdm import tqdm

class Evaluation(BaseModel):
    # Structured verdict the LLM judge returns for one pair of sentences.
    #
    # NOTE(review): documented with comments rather than a class docstring on
    # purpose -- pydantic copies a model's docstring into the generated JSON
    # schema, so adding one would change the request sent to the model.
    #
    # Both scores are 1-5 Likert ratings (see the prompt). The bounds make an
    # out-of-range answer fail validation instead of silently skewing the
    # per-file averages.
    fluency: int = Field(..., ge=1, le=5)  # fluency of input sentence 2
    adequacy: int = Field(..., ge=1, le=5)  # adequacy of sentence 2 w.r.t. sentence 1

# Read the OpenAI key from the environment; os.getenv returns None when the
# variable is not set.
api_key = os.getenv("OPENAI_API_KEY")
# OpenAI client wrapped by instructor; eval_single calls it with
# response_model=Evaluation.
client = instructor.from_openai(OpenAI(api_key =api_key))

# Judge prompt with two placeholders: {input_sentence1} and {input_sentence2}.
# It asks for a 1-5 fluency rating of sentence 2 and a 1-5 adequacy rating of
# sentence 2 with respect to sentence 1.
# NOTE(review): the text contains the typo "compate" (for "compare"). It is
# left untouched here because editing the prompt changes what the model sees
# and therefore the comparability with previously recorded scores.
prompt = """ You are a helpful language evaluator who can evaluate
input sentence2 and provide an evaluation of its fluency with a
likert scale rating of 1-5, 5 being highly fluent.
You will also have to compate two sentences and judge how adequate
is input sentence 2 with respect to input sentence 1, again with a likert scale rating of 1-5, 5 being highly adequate.
Here are the sentences: input_sentence1: {input_sentence1}, input_sentence2:{input_sentence2}"""

# Template object used by eval_single to fill in the two sentences.
template = PromptTemplate(
    input_variables=["input_sentence1", "input_sentence2"],
    template=prompt,
)

def eval_single(row):
    """Ask the LLM judge to score one expected/generated caption pair.

    Args:
        row: An ``(index, row)`` pair as produced by ``DataFrame.iterrows()``.
            The row must provide the "Expected Caption" and
            "Generated Caption" entries.

    Returns:
        The ``Evaluation`` response converted to a plain dict via
        ``model_dump()``, i.e. with ``fluency`` and ``adequacy`` entries.
    """
    _, record = row  # the index part of the pair is not needed for scoring
    expected_caption = record["Expected Caption"]
    generated_caption = record["Generated Caption"]

    # Expected caption is sentence 1 (the reference), generated is sentence 2.
    filled_prompt = template.format(
        input_sentence1=expected_caption,
        input_sentence2=generated_caption,
    )
    messages = [{"role": "user", "content": filled_prompt}]

    evaluation = client.chat.completions.create(
        model="gpt-4o",
        response_model=Evaluation,
        messages=messages,
    )
    return evaluation.model_dump()



In [3]:
import pandas as pd
import json
import multiprocessing
from concurrent.futures import ThreadPoolExecutor, as_completed

# Evaluate every row of one results file concurrently and summarise the scores

def get_results_for_single_file(results_dir, file, num_threads=8):
    """Score every caption pair in one results CSV and summarise the scores.

    Reads ``results_dir/file``, runs ``eval_single`` on each row through a
    thread pool, stores each row's evaluation dict in a new ``"eval"`` column,
    writes the annotated frame to ``<stem>.eval<ext>`` in the current working
    directory, and returns the mean scores.

    Args:
        results_dir: Directory that contains ``file``.
        file: File name of the CSV to evaluate, relative to ``results_dir``.
        num_threads: Maximum number of concurrent ``eval_single`` calls.

    Returns:
        dict with the keys ``"Avg. Fluency"`` and ``"Avg. Adequacy"``.

    Raises:
        Whatever ``eval_single`` raises for a row; ``future.result()``
        re-raises it here.
    """
    file_path = os.path.join(results_dir, file)
    df = pd.read_csv(file_path)

    # Futures finish in arbitrary order. Remember the row position each one
    # belongs to so every evaluation ends up next to the row it scored;
    # appending in completion order would misalign the "eval" column.
    results = [None] * len(df)
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        future_to_pos = {
            executor.submit(eval_single, item): pos
            for pos, item in enumerate(df.iterrows())
        }

        # total= lets tqdm show a real progress bar instead of a bare counter.
        for future in tqdm(as_completed(future_to_pos), total=len(future_to_pos)):
            results[future_to_pos[future]] = future.result()

    # Reuse the frame's own index so the assignment is purely positional.
    df["eval"] = pd.Series(results, index=df.index)

    # Insert ".eval" in front of the extension only ("a.csv" -> "a.eval.csv");
    # a plain str.replace would also rewrite "csv" elsewhere in the name.
    stem, ext = os.path.splitext(file)
    df.to_csv(f"{stem}.eval{ext}")

    average_adequacy = df["eval"].apply(lambda x: x["adequacy"]).mean()
    average_fluency = df["eval"].apply(lambda x: x["fluency"]).mean()
    return {"Avg. Fluency": average_fluency, "Avg. Adequacy": average_adequacy}

In [4]:
import os
# Directory holding one CSV of expected/generated captions per evaluated run.
results_dir = "../results"

# Maps run name (file name without its extension) -> average scores.
all_res = {}

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the summary stable between runs.
for file in sorted(os.listdir(results_dir)):
    # Only regular *.csv files are inputs; skip sub-directories and any other
    # stray entries that read_csv could not parse.
    is_csv = file.lower().endswith(".csv")
    if not is_csv or not os.path.isfile(os.path.join(results_dir, file)):
        continue

    results = get_results_for_single_file(results_dir, file)
    print(results)
    # Strip only the extension: str.replace("csv", "") left a trailing "."
    # and also removed "csv" from the middle of a name.
    all_res[os.path.splitext(file)[0]] = results

# One row per run, one column per metric.
results_df = pd.DataFrame(all_res).transpose()
results_df.to_csv("all_gpt4_results.csv")

161it [00:22,  7.21it/s]


{'Avg. Fluency': 4.6645962732919255, 'Avg. Adequacy': 1.031055900621118}


1987it [03:32,  9.36it/s]


{'Avg. Fluency': 4.46250629089079, 'Avg. Adequacy': 1.516859587317564}


162it [00:22,  7.36it/s]


{'Avg. Fluency': 4.728395061728395, 'Avg. Adequacy': 1.6296296296296295}


162it [00:19,  8.16it/s]


{'Avg. Fluency': 4.617283950617284, 'Avg. Adequacy': 1.1666666666666667}


161it [00:19,  8.40it/s]


{'Avg. Fluency': 4.838509316770186, 'Avg. Adequacy': 1.2670807453416149}


163it [00:19,  8.34it/s]


{'Avg. Fluency': 4.54601226993865, 'Avg. Adequacy': 1.2638036809815951}


163it [00:19,  8.36it/s]


{'Avg. Fluency': 4.852760736196319, 'Avg. Adequacy': 1.6319018404907975}


162it [00:19,  8.39it/s]


{'Avg. Fluency': 4.783950617283951, 'Avg. Adequacy': 1.345679012345679}


1987it [04:06,  8.06it/s]


{'Avg. Fluency': 4.823855057876195, 'Avg. Adequacy': 1.5772521389028686}


163it [00:18,  8.87it/s]


{'Avg. Fluency': 4.7975460122699385, 'Avg. Adequacy': 1.5889570552147239}


164it [00:18,  8.83it/s]


{'Avg. Fluency': 4.853658536585366, 'Avg. Adequacy': 1.5304878048780488}


164it [00:18,  9.03it/s]


{'Avg. Fluency': 4.5060975609756095, 'Avg. Adequacy': 1.2073170731707317}


164it [00:19,  8.45it/s]


{'Avg. Fluency': 4.7926829268292686, 'Avg. Adequacy': 1.1829268292682926}


1987it [03:34,  9.25it/s]


{'Avg. Fluency': 4.7991947659788625, 'Avg. Adequacy': 1.4896829391041773}


1987it [03:49,  8.68it/s]


{'Avg. Fluency': 4.772018117765476, 'Avg. Adequacy': 1.6024157020634122}


1987it [03:53,  8.50it/s]


{'Avg. Fluency': 4.821841972823352, 'Avg. Adequacy': 1.3225968797181682}


1987it [03:44,  8.83it/s]


{'Avg. Fluency': 4.827377956718672, 'Avg. Adequacy': 1.4116758933064921}


164it [00:21,  7.76it/s]


{'Avg. Fluency': 4.847560975609756, 'Avg. Adequacy': 1.6646341463414633}


1987it [03:43,  8.89it/s]


{'Avg. Fluency': 3.4448917966784096, 'Avg. Adequacy': 1.3009562154001006}


164it [00:18,  8.66it/s]


{'Avg. Fluency': 4.737804878048781, 'Avg. Adequacy': 1.0365853658536586}


1987it [03:41,  8.97it/s]


{'Avg. Fluency': 4.720684448917967, 'Avg. Adequacy': 1.0865626572722697}


1987it [03:46,  8.78it/s]


{'Avg. Fluency': 4.828384499245093, 'Avg. Adequacy': 1.1071967790639154}


163it [00:19,  8.52it/s]


{'Avg. Fluency': 4.773006134969325, 'Avg. Adequacy': 1.2208588957055215}


164it [00:22,  7.41it/s]


{'Avg. Fluency': 4.865853658536586, 'Avg. Adequacy': 1.3109756097560976}


162it [00:18,  8.64it/s]


{'Avg. Fluency': 4.8580246913580245, 'Avg. Adequacy': 1.4938271604938271}


161it [00:17,  9.39it/s]


{'Avg. Fluency': 4.819875776397516, 'Avg. Adequacy': 1.4906832298136645}


161it [00:18,  8.94it/s]


{'Avg. Fluency': 4.683229813664596, 'Avg. Adequacy': 1.1118012422360248}


163it [00:19,  8.43it/s]


{'Avg. Fluency': 4.871165644171779, 'Avg. Adequacy': 1.6196319018404908}


163it [00:17,  9.30it/s]


{'Avg. Fluency': 4.779141104294479, 'Avg. Adequacy': 1.1349693251533743}


164it [00:20,  8.04it/s]


{'Avg. Fluency': 4.823170731707317, 'Avg. Adequacy': 1.646341463414634}


162it [00:18,  8.55it/s]


{'Avg. Fluency': 4.58641975308642, 'Avg. Adequacy': 1.0308641975308641}


1987it [03:58,  8.34it/s]


{'Avg. Fluency': 4.7770508303975845, 'Avg. Adequacy': 1.097131353799698}


163it [00:20,  8.15it/s]


{'Avg. Fluency': 4.668711656441718, 'Avg. Adequacy': 1.1349693251533743}


1987it [03:46,  8.78it/s]


{'Avg. Fluency': 4.733266230498239, 'Avg. Adequacy': 1.0085556114745848}


1987it [03:40,  9.03it/s]


{'Avg. Fluency': 4.82083543029693, 'Avg. Adequacy': 1.2466029189733265}


163it [00:19,  8.42it/s]


{'Avg. Fluency': 4.754601226993865, 'Avg. Adequacy': 1.3496932515337423}


1987it [03:42,  8.94it/s]


{'Avg. Fluency': 4.758429793658782, 'Avg. Adequacy': 1.2818319073980875}


163it [00:17,  9.35it/s]


{'Avg. Fluency': 4.6503067484662575, 'Avg. Adequacy': 1.01840490797546}


164it [00:19,  8.61it/s]


{'Avg. Fluency': 4.762195121951219, 'Avg. Adequacy': 1.5426829268292683}


161it [00:18,  8.71it/s]


{'Avg. Fluency': 4.788819875776397, 'Avg. Adequacy': 1.5093167701863355}


1987it [03:45,  8.81it/s]


{'Avg. Fluency': 3.915450427780574, 'Avg. Adequacy': 1.4735782586814292}


163it [00:18,  8.77it/s]


{'Avg. Fluency': 4.711656441717792, 'Avg. Adequacy': 1.5582822085889572}


1987it [03:40,  9.00it/s]


{'Avg. Fluency': 4.889783593356819, 'Avg. Adequacy': 1.5515853044791141}


164it [00:21,  7.69it/s]


{'Avg. Fluency': 4.695121951219512, 'Avg. Adequacy': 1.2134146341463414}


164it [00:20,  8.05it/s]


{'Avg. Fluency': 4.487804878048781, 'Avg. Adequacy': 1.2195121951219512}


164it [00:20,  8.00it/s]


{'Avg. Fluency': 4.682926829268292, 'Avg. Adequacy': 1.1524390243902438}


163it [00:20,  8.00it/s]


{'Avg. Fluency': 4.50920245398773, 'Avg. Adequacy': 1.2453987730061349}


163it [00:21,  7.71it/s]


{'Avg. Fluency': 4.6871165644171775, 'Avg. Adequacy': 1.0368098159509203}


1987it [03:54,  8.47it/s]


{'Avg. Fluency': 4.791142425767489, 'Avg. Adequacy': 1.646200301962758}


1987it [03:52,  8.55it/s]


{'Avg. Fluency': 4.6607951685958735, 'Avg. Adequacy': 1.2360342224458984}


162it [00:18,  8.54it/s]


{'Avg. Fluency': 4.333333333333333, 'Avg. Adequacy': 1.1728395061728396}


164it [00:18,  8.73it/s]


{'Avg. Fluency': 4.628048780487805, 'Avg. Adequacy': 1.024390243902439}


1987it [03:55,  8.45it/s]


{'Avg. Fluency': 4.732259687971816, 'Avg. Adequacy': 1.0452944136889784}


161it [00:19,  8.16it/s]

{'Avg. Fluency': 4.53416149068323, 'Avg. Adequacy': 1.2422360248447204}



