In [1]:
import pandas as pd
from openai import OpenAI, AsyncOpenAI
import os
import dotenv
dotenv.load_dotenv()
from prompt import *
# The API key is read from the `review_evaluation_mbzuai` environment variable;
# dotenv.load_dotenv() is called earlier in this cell, before this lookup.
client = OpenAI(api_key=os.environ.get("review_evaluation_mbzuai"))

# Model name placed in the request bodies built by the inference functions below.
model_name = 'gpt-4o'



ModuleNotFoundError: No module named 'prompt'

In [24]:
import json 
import pandas as pd
# Aspect names iterated by the batch-submission cell further down.
aspects = [ 'actionability', 'grounding_specificity', 'verifiability_verification','verifiability_extraction', 'helpfulness']
# sheet_name=None loads every sheet of the workbook; the result is later indexed by
# aspect name (see chatgpt_batch_inf).
# NOTE(review): absolute, machine-specific path -- confirm it exists before running elsewhere.
all_incontext_examples = pd.read_excel('/home/abdelrahman.sadallah/mbzuai/review_rewrite/chatgpt/test_data/in_context_examples.xlsx', sheet_name=None)


In [25]:
import random
def get_prompt(review_point, aspect, prompt_type, in_context_examples, num_examples_per_label=1):
    """Build the evaluation prompt for one review point and one aspect.

    Args:
        review_point: text of the review comment to be scored.
        aspect: aspect name; used as a key into ASPECTS and as the prefix of the
            ``{aspect}_label`` column of ``in_context_examples``.
        prompt_type: one of 'definitions', 'definitions_examples',
            'definitions_incontext_learning' or 'chain_of_thoughts'.
        in_context_examples: DataFrame with 'review_point', ``{aspect}_label`` and
            'rationale' / 'chain_of_thoughts' columns. Only read by the last two
            prompt types; ignored otherwise.
        num_examples_per_label: number of examples drawn (with replacement) for
            every distinct label.

    Returns:
        The formatted prompt, or '' when ``prompt_type`` is not recognised.
    """
    if prompt_type == 'definitions':
        return BASE_PROMPT.format(
            review_point=review_point,
            aspect=aspect,
            aspect_description=ASPECTS[f'{aspect}_no_examples'],
        )

    if prompt_type == 'definitions_examples':
        return BASE_PROMPT.format(
            review_point=review_point,
            aspect=aspect,
            aspect_description=ASPECTS[aspect],
        )

    if prompt_type in ('definitions_incontext_learning', 'chain_of_thoughts'):
        label_column = f'{aspect}_label'
        text_column = 'rationale' if prompt_type == 'definitions_incontext_learning' else 'chain_of_thoughts'
        # Never show the review point being scored as one of its own examples.
        not_current = in_context_examples['review_point'] != review_point

        examples_str = []
        # Group the examples by label and draw random examples from each group.
        for label in in_context_examples[label_column].unique():
            candidates = in_context_examples[(in_context_examples[label_column] == label) & not_current]
            # Filtering up front replaces the former "resample until different" loop,
            # which never terminated when every example of a label was the current
            # review point. Labels without any eligible example are skipped.
            if candidates.empty:
                continue
            for _ in range(num_examples_per_label):
                row = candidates.sample(1).iloc[0]
                score = row[label_column]
                rationale = row[text_column]
                examples_str.append(
                    f"\n    Review Point: {row['review_point']}\n    rationale: {rationale}\n    score: {score}\n    "
                )

        # Shuffle so the examples are not ordered by label.
        random.shuffle(examples_str)
        examples = '\n'.join(examples_str)

        return BASE_PROMPT_EXAMPLES.format(
            review_point=review_point,
            aspect=aspect,
            aspect_description=ASPECTS[aspect],
            examples=examples,
        )

    return ''

# Normal ChatGPT Inference

In [26]:
from tqdm import tqdm
import json
# Prompt styles understood by get_prompt's `prompt_type` argument
# (get_prompt additionally handles 'chain_of_thoughts').
prompt_types = ['definitions', 'definitions_examples', 'definitions_incontext_learning']

def chatgpt_inf(test_data, prompt_types, save_path):
    """Score every review point with synchronous chat-completion calls, one CSV per aspect.

    Args:
        test_data: mapping of aspect name -> dict with the keys 'data' (DataFrame
            with a 'review_point' column) and 'examples' (in-context examples
            forwarded to get_prompt).
        prompt_types: prompt styles to evaluate for every row.
        save_path: existing directory that receives "{aspect}_results.csv".

    For each row and prompt type two columns are added:
    ``chatgpt_{aspect}_{prompt_type}_score`` and ``..._rationale``. A failed call
    or an unparsable answer is recorded as 'NA' in both.
    """
    for aspect in test_data.keys():
        responses = []
        fails = 0
        examples = test_data[aspect]['examples']
        data = test_data[aspect]['data']
        for idx, row in tqdm(data.iterrows(), total=data.shape[0]):
            review_point = row['review_point']
            for prompt_type in prompt_types:
                # get_prompt names this parameter `in_context_examples`; the former
                # `examples=` keyword raised TypeError before any request was sent.
                prompt = get_prompt(
                    review_point=review_point,
                    aspect=aspect,
                    prompt_type=prompt_type,
                    in_context_examples=examples,
                )

                # Call the model; report the error and fall back to 'NA' if the call
                # or the JSON parsing fails.
                try:
                    clue_message = {"role": "user", "content": prompt}
                    completion = client.chat.completions.create(
                        response_format={"type": "json_object"},
                        model=model_name,
                        messages=[clue_message],
                    )
                    # The whole answer is lower-cased so the JSON keys match
                    # regardless of the casing the model used.
                    response = json.loads(completion.choices[0].message.content.lower())
                    score = response['score']
                    rationale = response['rationale']
                except Exception as e:
                    print(f"Failed for {idx} and {aspect} with error {e}")
                    fails += 1
                    score = 'NA'
                    rationale = 'NA'

                row[f'chatgpt_{aspect}_{prompt_type}_score'] = score
                row[f'chatgpt_{aspect}_{prompt_type}_rationale'] = rationale
            responses.append(row)

        responses = pd.DataFrame(responses)
        responses.to_csv(f"{save_path}/{aspect}_results.csv", index=False)
        # The failure counter used to be collected but never shown.
        print(f"number of failed requests for {aspect} is {fails}")


# Batch Inference

In [27]:

def chatgpt_batch_inf(test_data,aspect, prompt_type,data_type, save_path, temp=0, num_examples_per_label=1):
    """Build a batch input file for one aspect, upload it and create the batch job.

    Args:
        test_data: DataFrame with 'id' and 'review_point' columns; one request per row.
        aspect: aspect name; selects the in-context examples from
            ``all_incontext_examples`` and is part of the generated file names.
        prompt_type: prompt style forwarded to get_prompt.
        data_type: label of the data split, used in file names and metadata.
        save_path: not used by this function; kept so existing calls keep working.
        temp: sampling temperature written into every request body.
        num_examples_per_label: forwarded to get_prompt.

    Side effects:
        Writes ``batch_data/{aspect}_{data_type}_batch_input.jsonl`` and
        ``batch_data/{aspect}_{data_type}_batch_input_meta_data.json`` (the latter
        holds the batch id needed to fetch the results later).
    """
    in_context_examples = all_incontext_examples[aspect]
    lines = []

    for i, row in test_data.iterrows():
        review_point = row['review_point']
        prompt = get_prompt(review_point=review_point,aspect=aspect,prompt_type=prompt_type, in_context_examples=in_context_examples, num_examples_per_label=num_examples_per_label)
        line = {
            # custom_id is how responses are matched back to rows later on.
            "custom_id": f"{row['id']}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": model_name,
                ########### UNCOMMENT AGAIN #########
                # "response_format" :{ "type": "json_object" },
                "temperature": temp,
                "messages": [
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": prompt},
                ],
            },
        }
        lines.append(line)

    # Nothing to submit: avoid the IndexError on lines[0] and an empty upload.
    if not lines:
        print(f"No rows to submit for {aspect}, {prompt_type}, and {data_type} data")
        return

    print(f'sample of the prompts is {lines[0]}')

    ### Write batch input file (create the directory on a fresh checkout)
    os.makedirs("batch_data", exist_ok=True)
    batch_file_path = f"batch_data/{aspect}_{data_type}_batch_input.jsonl"
    with open(batch_file_path, 'w') as f:
        for l in lines:
            json.dump(l, f)
            f.write('\n')

    ### upload the batch file; the context manager closes the handle afterwards
    with open(batch_file_path, "rb") as upload_handle:
        batch_input_file = client.files.create(
            file=upload_handle,
            purpose="batch")

    ### create the batch request
    batch_input_file_id = batch_input_file.id
    batch_data = client.batches.create(
        input_file_id=batch_input_file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
            # Report the model actually sent in the request bodies.
            "description": f"batch file for  {aspect} model {model_name}, temperature {temp} for {data_type} data",
        })
    batch_metadata = {
        "batch_id": batch_data.id,
        "aspect": aspect,
        "prompt_type": prompt_type,
        "batch_input_file_id": batch_input_file_id,
        "batch_file_path": batch_file_path,
        "data_type": data_type,
    }

    with open(f"batch_data/{aspect}_{data_type}_batch_input_meta_data.json", 'w') as f:
        json.dump(batch_metadata, f, indent=4)

    print(f"Batch file for {aspect}, {prompt_type}, and {data_type} data has been created and uploaded successfully")


## Run the batch inference and save the metadata for each batch

In [28]:
# aspects = ['actionability', 'grounding_specificity', 'helpfulness','verifiability_extraction']
# # aspects = ['verifiability_extraction']
# prompt_types = ['definitions', 'definitions_examples', 'definitions_incontext_learning']


import datasets
prompt_type= 'definitions_incontext_learning'
data_path = 'boda/review_evaluation_human_annotation'
TEMP = 0.1
DATA_TYPE = 'gold'
num_examples_per_label = 5

# Submit one batch per aspect. `aspects` is whatever list was defined most recently
# in the kernel, so run the cell that defines the five-aspect list before this one.
for aspect in aspects:

    # Both verifiability sub-tasks share the single 'verifiability' configuration.
    if aspect in ['verifiability_extraction', 'verifiability_verification']:
        sheet_name = 'verifiability'
    else:
        sheet_name = aspect
    # test_data = pd.read_excel(data_path, sheet_name=sheet_name)
    test_data = datasets.load_dataset(data_path, name = sheet_name, split= DATA_TYPE).to_pandas()
    ### check if test_data has id column and add one if not
    if 'id' not in test_data.columns:
        test_data['id'] = range(1, len(test_data) + 1)
        # Persist the generated ids only when the data really comes from an Excel
        # workbook (the commented-out read_excel path above). `data_path` is currently
        # a dataset identifier, and opening it with ExcelWriter always failed.
        if data_path.lower().endswith('.xlsx'):
            with pd.ExcelWriter(data_path, mode='a', engine='openpyxl', if_sheet_exists='replace') as writer:
                # Write back to the sheet that was read, so the ids land where the
                # result-merging step looks for them.
                test_data.to_excel(writer, sheet_name=sheet_name, index=False)

    chatgpt_batch_inf(test_data=test_data,
                      aspect=aspect,
                      prompt_type=prompt_type,
                      temp=TEMP,data_type=DATA_TYPE,
                      save_path='results',
                      num_examples_per_label=num_examples_per_label)



############################### VERIFIABILITY CODE ########################################
## check for data of verifiability_extraction if it's done, then do the verifiability_verification

# extraction_results_path = 'outputs/gold_aspects_results.xlsx'
# for file in os.listdir('batch_data'):
#     if 'verifiability_extraction' in file and 'meta_data' in file :
#         verifiability_extraction_batch_data = json.load(open(f'batch_data/{file}'))
# if client.batches.retrieve(verifiability_extraction_batch_data['batch_id']).status == 'completed':
#     ## get the saved data, and only consider the ones with labes yes
#     test_data = pd.read_excel(extraction_results_path, sheet_name='verifiability_extraction')
#     test_data = test_data[test_data['chatgpt_verifiability_extraction_definitions_incontext_learning_score']=='yes']

#     chatgpt_batch_inf(test_data=test_data,aspect='verifiability_verification', 
#                       prompt_type=prompt_type, 
#                       temp=TEMP,data_type=DATA_TYPE,  
#                       save_path='results',
#                       num_examples_per_label=num_examples_per_label)
                      



In [29]:
def retrive_batch_and_save_results(batch_data, chatgpt_output_path):
    """Download the output file of a batch and write it to ``chatgpt_output_path``.

    Args:
        batch_data: metadata dict saved when the batch was created; only
            'batch_id' is read here.
        chatgpt_output_path: destination path for the downloaded text.

    Raises:
        ValueError: if the batch has no output file yet.
    """
    batch_id = batch_data['batch_id']
    output_file_id = client.batches.retrieve(batch_id).output_file_id
    # Fail with a clear message instead of passing None on to the files endpoint.
    if output_file_id is None:
        raise ValueError(f"batch {batch_id} has no output file to download")

    chatgpt_response = client.files.content(output_file_id)

    # Create the destination directory on a fresh checkout.
    output_dir = os.path.dirname(chatgpt_output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(chatgpt_output_path, 'w') as file:
        file.write(chatgpt_response.text + '\n')


In [30]:
import re

def extract_rationale_and_score(text):
    """Pull the rationale and the score out of a free-text model answer.

    The answer is expected to contain "rationale: ..." followed by "score: ...",
    in any letter case and optionally wrapped in markdown bold markers ("**").

    Args:
        text: raw answer string.

    Returns:
        dict with the keys 'rationale' (stripped text, or None if not found) and
        'score' (digits, 'yes' or 'no' in lower case, or None if not found).
    """
    # Case-insensitivity is passed through `flags`: an inline "(?i)" in the middle
    # of a pattern, as used before, is rejected by Python 3.11+.
    rationale_match = re.search(
        r"\*?\*?rationale:\*?\*?\s*(.*?)(?=\n\s*\*?\*?score:\*?\*?|$)",
        text,
        re.DOTALL | re.IGNORECASE,
    )
    rationale = rationale_match.group(1).strip() if rationale_match else None

    score_match = re.search(r"\*?\*?score:\*?\*?\s*(\d+|yes|no)", text, re.IGNORECASE)
    score = score_match.group(1).lower() if score_match else None

    return {"rationale": rationale, "score": score}


In [31]:
import re
import pandas as pd

aspects = [ 'actionability', 'grounding_specificity', 'verifiability', 'helpfulness']
def write_results_in_original_file(batch_data, chatgpt_output_path, raw_data_path, chatgpt_input_path, write_path):
    """Merge the parsed batch answers into the annotation sheet and save it to Excel.

    Args:
        batch_data: metadata dict of the batch; 'aspect' and 'prompt_type' are read.
        chatgpt_output_path: JSONL file with the batch responses.
        raw_data_path: Excel workbook holding the rows to annotate (needs an 'id' column).
        chatgpt_input_path: JSONL file that was submitted; used to recover each prompt.
        write_path: Excel workbook to create, or whose sheet is replaced.

    For every row the columns ``chatgpt_{aspect}_{prompt_type}_score`` /
    ``..._rationale`` and ``prompt`` are set. Rows whose answer has no parsable
    score are counted as errors and left out. Both verifiability sub-tasks are
    stored under the shared name 'verifiability', with a 'no' score saved as 'X'.

    Raises:
        IndexError: if a row id is missing from the batch input or output files
            (for 'verifiability_verification' a missing response is accepted and
            the row is kept unchanged).
    """
    errors = 0
    aspect = batch_data['aspect']
    prompt_type = batch_data['prompt_type']

    is_verifiability = aspect in ['verifiability_extraction', 'verifiability_verification']
    # Sheet name, column infix and reported aspect name all use the merged name.
    sheet_name = 'verifiability' if is_verifiability else aspect

    chatgpt_response = pd.read_json(chatgpt_output_path, lines=True)
    raw_data_df = pd.read_excel(raw_data_path, sheet_name=sheet_name)
    chatgpt_input = pd.read_json(chatgpt_input_path, lines=True)

    # Walk the raw rows and align each one with its request/response by id.
    # NOTE(review): custom_id is written as a string in the batch input; these
    # equality checks rely on pandas parsing it back to the same type as the
    # sheet's 'id' column -- confirm for non-numeric ids.
    final_rows = []
    for i in range(raw_data_df.shape[0]):
        row = raw_data_df.iloc[i].copy()
        row_id = row['id']

        response_rows = chatgpt_response[chatgpt_response['custom_id'] == row_id]
        if response_rows.empty:
            # For verification a missing id is accepted: the row is kept as it is.
            if aspect == 'verifiability_verification':
                final_rows.append(row)
                continue
            raise IndexError(f"id {row_id} not found in {chatgpt_output_path}")

        input_rows = chatgpt_input[chatgpt_input['custom_id'] == row_id]
        if input_rows.empty:
            raise IndexError(f"id {row_id} not found in {chatgpt_input_path}")

        # messages[1] is the user message; messages[0] is the system prompt.
        prompt_text = input_rows.iloc[0]['body']['messages'][1]['content']
        answer = response_rows.iloc[0]['response']['body']['choices'][0]['message']['content']

        extracted_output = extract_rationale_and_score(answer)
        rationale, score = extracted_output['rationale'], extracted_output['score']

        if is_verifiability and score == 'no':
            score = 'X'

        # An answer without a parsable score is reported and the row is left out.
        if not score:
            errors += 1
            print(f"can't process {answer}")
            continue

        row[f'chatgpt_{sheet_name}_{prompt_type}_score'] = score
        row[f'chatgpt_{sheet_name}_{prompt_type}_rationale'] = rationale
        row['prompt'] = prompt_text
        final_rows.append(row)

    final_df = pd.DataFrame(final_rows)

    # Keep the shared bookkeeping columns and every column of this aspect; columns
    # belonging to other aspects are removed.
    exclude = ['review_point','paper_id','id','venue','focused_review','batch','prompt']
    unrelated_columns = [col for col in final_df.columns if col not in exclude and sheet_name not in col]
    final_df = final_df.drop(columns=unrelated_columns)

    # Create the workbook if needed; otherwise replace only this sheet and keep the others.
    if not os.path.exists(write_path):
        final_df.to_excel(write_path, sheet_name=sheet_name, index=False)
    else:
        with pd.ExcelWriter(write_path, mode='a', engine='openpyxl', if_sheet_exists='replace') as writer:
            final_df.to_excel(writer, sheet_name=sheet_name, index=False)

    print(f'number of errors for {sheet_name} and {prompt_type} is {errors}. The number of rows processed is {final_df.shape[0]}')

### Run this every once in a while: it checks whether each batch has completed and, if so, retrieves the results and saves them


In [32]:
import os
import json
# Poll every batch recorded in batch_data/: completed ones are downloaded and merged
# into the results workbook, the others are reported as still running.
for file in os.listdir('batch_data'):
    if 'batch_input_meta_data.json' in file:
        # Close the metadata file right after reading it.
        with open(f'batch_data/{file}') as meta_file:
            batch_data = json.load(meta_file)

        if client.batches.retrieve(batch_data['batch_id']).status == 'completed':
            print(f"Batch file for {batch_data['aspect']} and {batch_data['prompt_type']} has been completed")
            aspect = batch_data['aspect']
            prompt_type = batch_data['prompt_type']
            data_type = batch_data['data_type']
            
            chatgpt_output_path = f"batch_output/{aspect}_{data_type}_chatgpt_output.jsonl"

            raw_data_path = f"test_data/gold_human_annotations.xlsx"

            chatgpt_input_path =  f"batch_data/{aspect}_{data_type}_batch_input.jsonl"
            write_path = f"outputs/{data_type}_aspects_results.xlsx"

            # Make sure both output directories exist on a fresh checkout.
            os.makedirs(os.path.dirname(chatgpt_output_path), exist_ok=True)
            os.makedirs(os.path.dirname(write_path), exist_ok=True)

            retrive_batch_and_save_results (batch_data, chatgpt_output_path=chatgpt_output_path)

            # Verification results are merged on top of the already written results
            # workbook instead of the raw annotations.
            if aspect == 'verifiability_verification':
                raw_data_path = write_path
                
            write_results_in_original_file(batch_data,
                                        chatgpt_output_path=chatgpt_output_path,
                                        raw_data_path=raw_data_path,
                                        chatgpt_input_path=chatgpt_input_path,
                                        write_path=write_path)
            
           
            # ### merge the verifiability_extraction and verifiability_verification
            # if aspect == 'verifiability_verification':
            #     extraction_data = pd.read_excel(write_path, sheet_name='verifiability_extraction')
            #     verification_data = pd.read_excel(write_path, sheet_name='verifiability_verification')
            #     raw_verification_data = pd.read_excel(raw_data_path,sheet_name='verifiability')
            #     final_verification_data = []

            #     # for each row in raw_verification_data, check if the score in the extraction is "no", then put the score and rationale
            #     # from the extraction, otherwise, put the score and rationale from the verification
            #     for i in range(raw_verification_data.shape[0]):
            #         id = raw_verification_data.iloc[i]['id']
            #         extraction_row = extraction_data[extraction_data['id']==id]

            #         if extraction_row.shape[0] == 0:
            #             continue
            #         extraction_row = extraction_row.iloc[0]

            #         if extraction_row['chatgpt_verifiability_extraction_definitions_incontext_learning_score'] == 'no':
            #             ## chagne the value to 'X' if the score is 'no'
            #             extraction_row['chatgpt_verifiability_extraction_definitions_incontext_learning_score'] = 'X'

            #             extraction_row['chatgpt_verifiability_definitions_incontext_learning_score'] = extraction_row['chatgpt_verifiability_extraction_definitions_incontext_learning_score']
            #             extraction_row['chatgpt_verifiability_definitions_incontext_learning_rationale'] = extraction_row['chatgpt_verifiability_extraction_definitions_incontext_learning_rationale']
            #             ## remove the old columns
            #             extraction_row.drop(['chatgpt_verifiability_extraction_definitions_incontext_learning_score', 'chatgpt_verifiability_extraction_definitions_incontext_learning_rationale'], inplace=True)
            #             final_verification_data.append(extraction_row)
            #         else:
            #             verification_row = verification_data[verification_data['id']==id]
            #             if verification_row.shape[0] == 0:
            #                 continue
            #             verification_row = verification_row.iloc[0]
            #             verification_row['chatgpt_verifiability_definitions_incontext_learning_score'] = verification_row['chatgpt_verifiability_verification_definitions_incontext_learning_score']
            #             verification_row['chatgpt_verifiability_definitions_incontext_learning_rationale'] = verification_row['chatgpt_verifiability_verification_definitions_incontext_learning_rationale']
            #             ## remove the old columns
                        
            #             final_verification_data.append(verification_row)
                        
            #     final_verification_data = pd.DataFrame(final_verification_data)
            #     with pd.ExcelWriter(write_path, mode='a', engine='openpyxl', if_sheet_exists='replace') as writer:
            #         final_verification_data.to_excel(writer, sheet_name='verifiability', index=False)


        else:
            print(f"Batch file for {batch_data['aspect']} and {batch_data['prompt_type']} is still running")



Batch file for verifiability_extraction and definitions_incontext_learning has been completed
number of errors for verifiability and definitions_incontext_learning is 0. The number of rows processed is 144
Batch file for verifiability_verification and definitions_incontext_learning has been completed
can't process Rationale: The review point suggests conducting an ablation study on the visDial dataset to further support the proposed visual reference resolution model's effectiveness. It specifically mentions interest in the performance of ATT(+H) as shown in Figure 4 left, and questions the result if the model did not consider relevant attention retrieval from the attention memory. This comment is somewhat verifiable because it provides a specific suggestion for an ablation study, which is a common method for validating model components. However, it lacks detailed reasoning or references to support why this particular ablation study would be beneficial or necessary, leaving the authors 