In [42]:
import pandas as pd
from openai import OpenAI, AsyncOpenAI
import os
import dotenv
dotenv.load_dotenv()
from prompt import *
# OpenAI client. The API key is looked up in the `review_evaluation_mbzuai`
# environment variable; None is passed when the variable is not set.
client = OpenAI(api_key=os.getenv("review_evaluation_mbzuai"))

# Model identifier sent with the chat-completion requests built below.
model_name = "gpt-4o"



In [45]:
import json 
import pandas as pd
# Aspects on which each review point is scored.
aspects = [
    'actionability',
    'grounding_specificity',
    'verifiability',
    'helpfulness',
]

# sheet_name=None loads every sheet of the workbook into a dict keyed by sheet
# name; chatgpt_batch_inf looks the examples up by aspect.
all_incontext_examples = pd.read_excel(
    '/home/abdelrahman.sadallah/mbzuai/review_rewrite/chatgpt/test_data/in_context_examples.xlsx',
    sheet_name=None,
)


In [46]:
import random
def get_prompt(review_point,aspect,prompt_type, in_context_examples):
    """Build the user prompt for scoring one review point on one aspect.

    Args:
        review_point: Text of the review point to evaluate.
        aspect: Aspect name. Used as a key into ASPECTS and as the prefix of
            the ``{aspect}_label`` column of ``in_context_examples``.
        prompt_type: One of 'definitions', 'definitions_examples',
            'definitions_incontext_learning' or 'chain_of_thoughts'.
        in_context_examples: DataFrame with 'review_point' and
            '{aspect}_label' columns, plus 'rationale' (for
            'definitions_incontext_learning') or 'chain_of_thoughts' (for
            'chain_of_thoughts'). Not read for the two 'definitions*' types.

    Returns:
        The formatted prompt string, or '' when ``prompt_type`` is not one of
        the values listed above.
    """
    if prompt_type == 'definitions':
        return BASE_PROMPT.format(review_point=review_point,aspect=aspect,aspect_description=ASPECTS[f'{aspect}_no_examples'])

    if prompt_type == 'definitions_examples':
        return BASE_PROMPT.format(review_point=review_point,aspect=aspect,aspect_description=ASPECTS[aspect])

    if prompt_type not in ('definitions_incontext_learning', 'chain_of_thoughts'):
        return ''

    label_column = f'{aspect}_label'
    if prompt_type == 'definitions_incontext_learning':
        explanation_column = 'rationale'
    else:
        explanation_column = 'chain_of_thoughts'

    # Exclude the point being scored up front so it can never be shown as one
    # of its own examples. Filtering first (instead of resampling until a
    # different row comes up) cannot loop forever when a label's only examples
    # are the review point itself.
    candidates = in_context_examples[in_context_examples['review_point'] != review_point]

    examples_str = []
    ## one random example per distinct label
    for label in in_context_examples[label_column].unique():
        pool = candidates[candidates[label_column] == label]
        if pool.empty:
            # Nothing usable for this label (every example is the review point
            # itself, or the label is NaN and matches no row): skip it.
            continue
        row = pool.sample(1).iloc[0]

        examples_str.append(f'''
Review Point: {row['review_point']}
rationale: {row[explanation_column]}
score: {row[label_column]}
''')

    ## shuffle so the examples do not always appear in label order
    random.shuffle(examples_str)
    examples = '\n'.join(examples_str)

    return BASE_PROMPT_EXAMPLES.format(review_point=review_point,aspect=aspect,aspect_description=ASPECTS[aspect],examples=examples)

# Normal Chatgpt Inference

In [47]:
from tqdm import tqdm
import json
# Prompt variants understood by get_prompt (in addition to 'chain_of_thoughts').
prompt_types = [
    'definitions',
    'definitions_examples',
    'definitions_incontext_learning',
]

def chatgpt_inf(test_data, prompt_types, save_path):
    """Score every review point with one synchronous chat-completion call per prompt type.

    Args:
        test_data: Mapping of aspect name to a dict with two entries:
            'examples' (in-context examples handed to get_prompt) and 'data'
            (DataFrame with a 'review_point' column).
        prompt_types: Iterable of prompt-type names accepted by get_prompt.
        save_path: Directory that receives one ``{aspect}_results.csv`` per
            aspect.

    For each row and prompt type, the columns
    ``chatgpt_{aspect}_{prompt_type}_score`` and
    ``chatgpt_{aspect}_{prompt_type}_rationale`` are added; both are 'NA' when
    the request or the parsing of its reply fails.
    """
    for aspect in test_data.keys():
        responses = []
        fails = 0
        examples = test_data[aspect]['examples']
        data = test_data[aspect]['data']
        for idx, row in tqdm(data.iterrows(), total=data.shape[0]):
            result_row = row.copy()
            review_point = row['review_point']
            for prompt_type in prompt_types:
                # get_prompt's parameter is named `in_context_examples`;
                # passing `examples=` raises TypeError on every call.
                prompt = get_prompt(review_point=review_point,aspect=aspect,prompt_type=prompt_type, in_context_examples=examples)

                ## call the model; on any failure log it and record 'NA'
                try:
                    completion = client.chat.completions.create(
                        response_format={ "type": "json_object" },
                        model=model_name,
                        messages=[{"role": "user", "content": prompt}],
                    )
                    # NOTE: .lower() lower-cases the whole reply, so the stored
                    # rationale text is lower-cased as well as the JSON keys.
                    response = json.loads(completion.choices[0].message.content.lower())
                    score = response['score']
                    rationale = response['rationale']
                except Exception as e:
                    print(f"Failed for {idx} and {aspect} with error {e}")
                    fails += 1
                    score = 'NA'
                    rationale = 'NA'

                result_row[f'chatgpt_{aspect}_{prompt_type}_score'] = score
                result_row[f'chatgpt_{aspect}_{prompt_type}_rationale'] = rationale
            responses.append(result_row)

        if fails:
            print(f"{fails} request(s) failed for {aspect}")

        responses = pd.DataFrame(responses)
        responses.to_csv(f"{save_path}/{aspect}_results.csv", index=False)


# Batch Inference

In [48]:

def chatgpt_batch_inf(test_data,aspect, prompt_type, save_path, temp=0):
    """Build, upload and submit an OpenAI batch job for one aspect.

    Args:
        test_data: DataFrame with 'review_point' and 'id' columns; 'id'
            becomes the request's ``custom_id``.
        aspect: Aspect name; selects the sheet of ``all_incontext_examples``
            and names the files written under ``batch_data/``.
        prompt_type: Prompt-type name forwarded to get_prompt.
        save_path: Currently unused; kept so existing callers keep working.
        temp: Sampling temperature sent with every request.

    Side effects:
        Writes ``batch_data/{aspect}_batch_input.jsonl`` (the requests) and
        ``batch_data/{aspect}_batch_input_meta_data.json`` (batch id, aspect,
        prompt type, uploaded file id, input path). Both names depend on the
        aspect only, so submitting another prompt type for the same aspect
        overwrites them.
    """
    in_context_examples = all_incontext_examples[aspect]

    lines = []
    for _, row in test_data.iterrows():
        prompt = get_prompt(review_point=row['review_point'],aspect=aspect,prompt_type=prompt_type, in_context_examples=in_context_examples)
        lines.append({
            "custom_id": f"{row['id']}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": model_name,
                "response_format": { "type": "json_object" },
                "temperature": temp,
                "messages": [
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": prompt},
                ],
            },
        })

    if not lines:
        # Nothing to submit; avoids the IndexError on lines[0] and the upload
        # of an empty batch file.
        print(f"No rows to submit for {aspect} and {prompt_type}; skipping batch creation")
        return

    print(f'sample of the prompts is {lines[0]}')

    ### Write batch input file
    os.makedirs("batch_data", exist_ok=True)
    batch_file_path = f"batch_data/{aspect}_batch_input.jsonl"
    with open(batch_file_path, 'w') as f:
        for line in lines:
            json.dump(line, f)
            f.write('\n')

    ### upload the batch file (the handle is closed once the upload returns)
    with open(batch_file_path, "rb") as batch_file:
        batch_input_file = client.files.create(file=batch_file, purpose="batch")

    ### create the batch request
    batch_input_file_id = batch_input_file.id
    batch_data = client.batches.create(
        input_file_id=batch_input_file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
            # Report the model actually requested rather than a fixed name.
            "description": f"batch file for  {aspect} model {model_name}, temperature {temp}"
        })

    batch_metadata = {
        "batch_id": batch_data.id,
        "aspect": aspect,
        "prompt_type": prompt_type,
        "batch_input_file_id": batch_input_file_id,
        "batch_file_path": batch_file_path
    }

    with open(f"batch_data/{aspect}_batch_input_meta_data.json", 'w') as f:
        json.dump(batch_metadata, f, indent=4)

    print(f"Batch file for {aspect} and {prompt_type} has been created and uploaded successfully")


## Run the batch inference and save the metadata of each submitted batch

In [49]:
# Aspects to submit; trim this list to re-run a subset.
aspects = [
    'actionability',
    'grounding_specificity',
    'verifiability',
    'helpfulness',
]

# Prompt variant used for every aspect. get_prompt also accepts 'definitions',
# 'definitions_examples' and 'definitions_incontext_learning'.
prompt_type = 'chain_of_thoughts'

for aspect in aspects:
    test_data_path = f"test_data/{aspect}_test_data.csv"
    test_data = pd.read_csv(test_data_path)

    # The batch requests are keyed by an 'id' column. When the CSV lacks one,
    # number the rows from 1 and write the file back so the same ids are
    # available when the results are joined later.
    if 'id' not in test_data.columns:
        test_data = test_data.assign(id=range(1, len(test_data) + 1))
        test_data.to_csv(test_data_path, index=False)

    chatgpt_batch_inf(
        test_data=test_data,
        aspect=aspect,
        prompt_type=prompt_type,
        temp=0.0,
        save_path='results',
    )



sample of the prompts is {'custom_id': '1', 'method': 'POST', 'url': '/v1/chat/completions', 'body': {'model': 'gpt-4o', 'response_format': {'type': 'json_object'}, 'temperature': 0.0, 'messages': [{'role': 'system', 'content': '\nYou are an expert in evaluating peer review comments with respect to different aspects.\n'}, {'role': 'user', 'content': '\nThese aspects are aimed to maximize the utilization of the review comments for the authors. The primary purpose of the review is to help/guide authors in improving their drafts. Keep this in mind while evaluating the review point. Whenever you encounter a borderline case, think: “Will this review point help authors improve their draft?”. There is no correlation between the aspect score and the length of the review point.\n\nPlease evaluate the review point based on the aspect description provided above. generate the output as a json object with the following format:\n    "rationale": RATIONALE\n    "score": SCORE_VALUE,\n    \nactionabil

In [50]:
def retrive_batch_and_save_results(batch_data):
    """Download the output file of a batch job to ``batch_output/``.

    Args:
        batch_data: Dict with at least the keys 'aspect' and 'batch_id'.

    Raises:
        ValueError: If the batch has no output file to download.

    Side effects:
        Writes the raw response text, followed by a newline, to
        ``batch_output/{aspect}_chatgpt_output.jsonl``.
    """
    aspect = batch_data['aspect']
    batch_id = batch_data['batch_id']

    output_file_id = client.batches.retrieve(batch_id).output_file_id
    if output_file_id is None:
        # Fail with a message that names the batch instead of passing None on
        # to the file-download call.
        raise ValueError(f"Batch {batch_id} for {aspect} has no output file")

    chatgpt_response = client.files.content(output_file_id)

    os.makedirs("batch_output", exist_ok=True)
    file_path = f"batch_output/{aspect}_chatgpt_output.jsonl"
    # Explicit UTF-8 so the file does not depend on the platform's default
    # encoding when it is read back.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(chatgpt_response.text + '\n')


In [51]:
import re
import pandas as pd
def _read_jsonl(path):
    """Read a JSON-lines file into a list of dicts, skipping blank lines."""
    with open(path, encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def save_results(batch_data):
    """Join the batch responses back onto the test data and write a results CSV.

    Args:
        batch_data: Dict with the keys 'aspect' and 'prompt_type'.

    Reads ``test_data/{aspect}_test_data.csv``,
    ``batch_output/{aspect}_chatgpt_output.jsonl`` and
    ``batch_data/{aspect}_batch_input.jsonl``. Rows are matched on the string
    form of the CSV 'id' column against each request's ``custom_id``.

    For every test row with a parseable answer, the columns
    ``chatgpt_{aspect}_{prompt_type}_score``,
    ``chatgpt_{aspect}_{prompt_type}_rationale`` and ``prompt`` are added.
    Rows with no response, or whose answer is not JSON with 'score' and
    'rationale' keys, are reported and left out. The result is written to
    ``outputs/{aspect}_results.csv``.
    """
    aspect = batch_data['aspect']
    prompt_type = batch_data['prompt_type']
    score_key = f'chatgpt_{aspect}_{prompt_type}_score'
    rationale_key = f'chatgpt_{aspect}_{prompt_type}_rationale'

    raw_data_df = pd.read_csv(f"test_data/{aspect}_test_data.csv")

    # Index both JSONL files by custom_id once instead of scanning a frame for
    # every row. Parsing with json keeps custom_id exactly as it was written,
    # so matching does not depend on any dtype inference.
    responses_by_id = {
        str(record['custom_id']): record
        for record in _read_jsonl(f"batch_output/{aspect}_chatgpt_output.jsonl")
    }
    prompts_by_id = {
        str(record['custom_id']): record['body']['messages'][1]['content']
        for record in _read_jsonl(f"batch_data/{aspect}_batch_input.jsonl")
    }

    final_rows = []
    for _, raw_row in raw_data_df.iterrows():
        custom_id = str(raw_row['id'])

        response = responses_by_id.get(custom_id)
        if response is None:
            # A request without a response row must not abort the whole join.
            print(f"No response found for id {custom_id}.")
            continue

        ## try to load the answer as a json object
        try:
            answer = response['response']['body']['choices'][0]['message']['content']
            json_obj = json.loads(answer)
            chat_gpt_score = json_obj['score']
            chat_gpt_rationale = json_obj['rationale']
        except (KeyError, IndexError, TypeError, ValueError):
            # ValueError covers json.JSONDecodeError; the other types cover a
            # response body that lacks the expected structure or keys.
            print("No valid JSON found.")
            continue

        row = raw_row.copy()
        row[score_key] = chat_gpt_score
        row[rationale_key] = chat_gpt_rationale
        row['prompt'] = prompts_by_id.get(custom_id)
        final_rows.append(row)

    final_df = pd.DataFrame(final_rows)
    os.makedirs('outputs', exist_ok=True)
    final_df.to_csv(f'outputs/{aspect}_results.csv', index=False)


### Run this cell every once in a while: it checks whether each batch job has completed and, if so, retrieves the results and saves them


In [52]:
import os
import json
# Poll every batch recorded under batch_data/ and collect the finished ones.
for file in os.listdir('batch_data'):
    # Metadata files are written as "{aspect}_batch_input_meta_data.json".
    if not file.endswith('batch_input_meta_data.json'):
        continue

    # `with` closes the metadata file as soon as it has been parsed.
    with open(f'batch_data/{file}') as meta_file:
        batch_data = json.load(meta_file)

    if client.batches.retrieve(batch_data['batch_id']).status == 'completed':
        print(f"Batch file for {batch_data['aspect']} and {batch_data['prompt_type']} has been completed")

        # Download the raw output first, then join it onto the test data.
        retrive_batch_and_save_results(batch_data)
        save_results(batch_data)
    else:
        # Any status other than 'completed' is reported as still running.
        print(f"Batch file for {batch_data['aspect']} and {batch_data['prompt_type']} is still running")



Batch file for grounding_specificity and chain_of_thoughts is still running
Batch file for helpfulness and chain_of_thoughts is still running
Batch file for actionability and chain_of_thoughts is still running
Batch file for verifiability and chain_of_thoughts is still running
