In [50]:
import pandas as pd
from openai import OpenAI, AsyncOpenAI
import os
import dotenv
dotenv.load_dotenv()
from prompt import *
# Synchronous OpenAI client. The API key is read from the environment variable
# "review_evaluation_mbzuai"; os.environ.get returns None when it is unset.
client = OpenAI(api_key=os.environ.get("review_evaluation_mbzuai"))
# Model identifier passed as `model` in the requests built further down.
model_name = 'gpt-4o'



In [51]:
# Hand-written few-shot examples, one triple-quoted text block per aspect.
# Every block is a sequence of "Review Point / score / rationale" entries.
# Keys are '<aspect>_examples'; the text is runtime prompt material, so it is
# kept verbatim (including its spelling).
IN_CONTEXT_EXAMPLES = { 

# Actionability: one example for each score from 1 to 5.
'actionability_examples': '''
Review Point:
- Relies on supplemental space to contain the paper. The paper is not truly independent given this problem (esp. S3.1 reference to Sup. Fig. 6) and again later as noted with the model comparison and other details of the span vs. sentence investigation.
score: 1
rationale: This is not actionable at all. after reading the comment, the authors still have no idea what should be done.

Review Point:
- The main weaknesses of the paper are the experiments, which is understandable for a short paper but I'd still expect it to be stronger. First, the setting is only on extremely low-resource regime, which is not the only case we want to use data augmentation in real-world applications. Also, sentence classification is an easier task. I feel like the proposed augmentation method has potential to be used on more NLP tasks, which was unfortunately not shown.
score: 2
rationale: This is barely actionable. The comment mentions that there's a weakness in the experiments, but doesn't specify what the weakness is. 

Review Point:
4) You perform "on par or better" (l.791). There seems to be a general cognitive bias among NLP researchers to map instances where they perform worse to "on par" and all the rest to "better". I think this wording should be corrected, but otherwise I'm fine with the experimental results.
score: 3
rationale: Corrected how? this action is not concrete.

Review Point:
It will be nice to see some examples of the system on actual texts (vs. other components & models).
score: 4
rationale: The action is concrete, but it is not stated directly.

Review Point:
- Also, since the dataset is artificially created, the dataset itself might have a lot of noise. For instance, the collected "pristine" set of tweets may not be pristine enough and might instead contain misinformation as well as out-of-context images. I would have liked to see more analysis around the quality of the collected dataset and the amount of noise it potentially has.
score: 5
rationale: There is a concrete, and emplicit action.
''',
# Grounding / specificity: one example for each score from 1 to 5.
'grounding_specificity_examples': '''
Review Point:
The paper does not dig into the theory profs and show the convergence properties of the proposed algorithm.
score: 1
rationale: The comment is not gorunded at all, as it does not mention what the proposed algorithm is, or where it was mentioned.

Review Point:
Is it necessary to treat concept map extraction as a separate task? On the one hand, many generic summarization systems build a similar knowledge graph and then generate summaries accordingly. On the other hand, with the increase of the node number, the concept map becomes growing hard to distinguish. Thus, the general summaries should be more readable.
score: 2
rationale: It's weakly grounded and not specific. The comment mentions that the paper does not treat concept map extraction as a separate task, but does not mention where this is mentioned in the paper. It also does not mention what needs to be addressed.

Review Point:
- You mention that you only select 10 answers from all correct answers, why do you do this? Does this affect the underestimation of the performances?
score: 3
rationale: The comment is not well grounded, but the authors can guess which part does this comment refer to, and it's specific.

Review Point:
The description of HIERENC is unclear. From what I understand, each input (h_i) to the temporal network is the average of the representations of all instantiations of context filled by every possible entity in the vocabulary. This does not seem to be a good idea since presumably only one of those instantiations is correct. This would most likely introduce a lot of noise.
score: 4
rationale: The comment is grounded, as it mentions the part that needs addressing, however, it is not specific enough.

Review Point:
- Table 4 needs a little more clarification, what splits are used for obtaining the ATIS numbers?
score: 5
rationale: The comment is fully grounded, as it mentions the specific table, and also mentions what needs addressing in it. 
''',

# Verifiability: scores 1 to 5 plus an 'X' example for a comment with no claim.
'verifiability_examples' : '''
Review Point:
It's a bit unclear how the frame similarity factors and attributes similarity factors are selected.
score: 1
rationale: The claim is "a bit unclear", and it is not verifiable at all. 

Review Point:
It will be nice to see some examples of the system on actual texts (vs. other components & models).
score: 2
rationale: The claim is "It will be nice to see some examples", and it has limited verification.

Review Point:
Table 4 needs a little more clarification, what splits are used for obtaining the ATIS numbers?
score: 3
rationale: The claim is "needs a little more clarification" .The vefirication is not extensive enough. 

Review Point:
The evaluation on the word analogy task seems a bit unfair given that the semantic relations are explicitly encoded by the sememes, as the authors themselves point out (more details below).
score: 4
rationale: The claim here is "task seems a bit unfair", and it has been mostly verified. 

Review Point:
- The explanations for features in Section 3.2 are somewhat intertwined and thus confusing. The section would be more coherently organized with more separate paragraphs dedicated to each of lexical features and sentence-level features.
score: 5
rationale: The claim is "are somewhat intertwined and thus confusing", and it is fully verifiable.

Review Point:
The paper does not dig into the theory profs and show the convergence properties of the proposed algorithm.
score: X
rationale: There is no claim in this comment, as it only mentions somehtiing that is missing in the paper, without expressing any sunjective opinion.
''',

# Helpfulness: one example for each score from 1 to 5.
'helpfulness_examples' : '''

Review Point:
Using integrated gradients to measure the attribution has been studied in existing papers. The paper also proposes post-processing steps to filter out the “false-positive” neurons, however, the paper doesn’t show how important these post-processing steps are. I think an ablation study may be needed.
score: 1
rationale: The comment is not helpful at all, as it does not suggest what the authors should do to improve the paper.

Review Point:
Not so useful information: While I appreciate the fleshing out of the assumptions, I find that dedicating a whole section of the paper plus experimental results is a lot of space.
score: 2
rationale: The comment is barely helpful, as the action is implicit and vague.

Review Point:
The evaluation on the word analogy task seems a bit unfair given that the semantic relations are explicitly encoded by the sememes, as the authors themselves point out (more details below).
score: 3
rationale: The comment is somewhat helpful, as it suggests what the authors should do to improve the paper, but it is not concrete, and vague.

Review Point:
Some items in Table 2 and Table 3 have Spaces between accuracy and standard deviation, and some items don't, which affects beauty.
score: 4
rationale: The comment is mostly helpful, as it states a concrete and explicit action.

Review Point:
781 "both tasks": antecedent missing The references should be checked for format, e.g. Grice, Sorower et al for capitalization, the verbnet reference for bibliographic details.    
score: 5
rationale: The comment is fully helpful, as it tells the authors exactly what they should do.
'''
}


In [52]:

# Aspects evaluated by this notebook; each gets its own entry in `test_data`.
aspects = [ 'actionability', 'grounding_specificity', 'verifiability', 'helpfulness']

# Maps aspect -> {'data': DataFrame of review points, 'examples': few-shot text}.
test_data = {}

for aspect in aspects:
    # Every aspect currently reads the same shared random sample (a fresh
    # DataFrame per aspect). The per-aspect file is kept here for reference:
    # data = pd.read_csv(f"test_data/{aspect}_test_data.csv")
    data = pd.read_csv(f"test_data/random_test_data.csv")
    examples = IN_CONTEXT_EXAMPLES[f'{aspect}_examples']
    test_data[aspect] = dict(data=data, examples=examples)


In [53]:
import json 
import pandas as pd

# Load the in-context examples workbook. With sheet_name=None pandas returns a
# dict mapping each sheet name to its DataFrame; the code below indexes it by
# aspect name.
# NOTE(review): this is an absolute, machine-specific path, so the cell will not
# run elsewhere. Presumably the workbook sits next to the relative "test_data/"
# folder used by the other cells - confirm before making it relative.
all_incontext_examples = pd.read_excel('/home/abdelrahman.sadallah/mbzuai/review_rewrite/chatgpt/test_data/in_context_examples.xlsx', sheet_name=None)


In [54]:
# Display the column names of the 'actionability' sheet as a schema check.
all_incontext_examples['actionability'].columns

Index(['focused_review', 'actionability', 'review_point',
       'actionability_label', 'rationale'],
      dtype='object')

In [55]:
import random
def get_prompt(review_point,aspect,prompt_type, in_context_examples):
    """Build the user prompt that asks the model to score one review point.

    Parameters
    ----------
    review_point : str
        Text of the review point to be scored.
    aspect : str
        Aspect name; used as a key into ``ASPECTS`` and as the prefix of the
        ``'{aspect}_label'`` column of ``in_context_examples``.
    prompt_type : str
        * ``'definitions'`` - ``BASE_PROMPT`` with ``ASPECTS[f'{aspect}_no_examples']``.
        * ``'definitions_examples'`` - ``BASE_PROMPT`` with ``ASPECTS[aspect]``.
        * ``'definitions_incontext_learning'`` - ``BASE_PROMPT_EXAMPLES`` with
          ``ASPECTS[aspect]`` plus one randomly sampled example per label.
    in_context_examples : pandas.DataFrame
        Only read for ``'definitions_incontext_learning'``. Must provide the
        columns ``'review_point'``, ``'{aspect}_label'`` and ``'rationale'``.

    Returns
    -------
    str
        The formatted prompt, or ``''`` when ``prompt_type`` is not recognised.

    Raises
    ------
    KeyError
        If ``in_context_examples`` lacks one of the required columns.
    """
    if prompt_type == 'definitions':
        return BASE_PROMPT.format(review_point=review_point,aspect=aspect,aspect_description=ASPECTS[f'{aspect}_no_examples'])
    if prompt_type == 'definitions_examples':
        return BASE_PROMPT.format(review_point=review_point,aspect=aspect,aspect_description=ASPECTS[aspect])
    if prompt_type != 'definitions_incontext_learning':
        return ''

    label_col = f'{aspect}_label'
    # Fail early with a message that names the sheet problem, instead of a bare
    # KeyError raised from deep inside the sampling loop.
    missing = [col for col in (label_col, 'review_point', 'rationale')
               if col not in in_context_examples.columns]
    if missing:
        raise KeyError(f"in-context examples for '{aspect}' are missing column(s): {missing}")

    # Never show the model the review point it is about to score.
    not_current = in_context_examples['review_point'] != review_point

    examples_str = []
    # One random example per label value.
    for label in in_context_examples[label_col].unique():
        candidates = in_context_examples[(in_context_examples[label_col] == label) & not_current]
        if candidates.empty:
            # The only examples for this label are the current review point (or
            # the label is NaN). Skipping avoids an endless resampling loop.
            continue
        row = candidates.sample(1).iloc[0]
        examples_str.append(f'''
Review Point: {row['review_point']}
rationale: {row['rationale']}
score: {row[label_col]}
''')

    # Shuffle so the examples are not always presented in label order.
    random.shuffle(examples_str)
    examples = '\n'.join(examples_str)

    return BASE_PROMPT_EXAMPLES.format(review_point=review_point,aspect=aspect,aspect_description=ASPECTS[aspect],examples=examples)

# Normal ChatGPT Inference

In [56]:
from tqdm import tqdm
import json
# The three prompt variants that get_prompt knows how to build.
prompt_types = ['definitions', 'definitions_examples', 'definitions_incontext_learning']

def chatgpt_inf(test_data, prompt_types, save_path):
    """Score every review point synchronously, one chat completion per prompt.

    Parameters
    ----------
    test_data : dict
        Maps aspect -> {'data': DataFrame with a 'review_point' column,
        'examples': in-context examples for that aspect}.
    prompt_types : iterable of str
        Prompt variants to evaluate for every row (see ``get_prompt``).
    save_path : str
        Directory that receives one ``{aspect}_results.csv`` per aspect; it is
        created when missing.

    Each output row is the input row plus the columns
    ``chatgpt_{aspect}_{prompt_type}_score`` and ``..._rationale``. Both are set
    to ``'NA'`` when the request or the parsing of its answer fails.
    """
    os.makedirs(save_path, exist_ok=True)

    for aspect in test_data.keys():
        responses = []
        fails = 0
        data = test_data[aspect]['data']

        examples = test_data[aspect]['examples']
        if not isinstance(examples, pd.DataFrame):
            # get_prompt samples rows from a DataFrame; plain-text examples
            # cannot be sampled, so use the workbook sheet for this aspect when
            # one exists (same source as chatgpt_batch_inf).
            examples = all_incontext_examples.get(aspect, examples)

        for idx, row in tqdm(data.iterrows(), total=data.shape[0]):
            review_point = row['review_point']
            for prompt_type in prompt_types:
                # get_prompt's parameter is `in_context_examples`; passing
                # `examples=` raised TypeError on every call.
                prompt = get_prompt(review_point=review_point, aspect=aspect,
                                    prompt_type=prompt_type, in_context_examples=examples)

                # Call the model and report the error message if the call fails.
                try:
                    completion = client.chat.completions.create(
                        response_format={"type": "json_object"},
                        model=model_name,
                        messages=[{"role": "user", "content": prompt}],
                    )
                    # NOTE(review): .lower() normalises the JSON keys but also
                    # lower-cases the values (rationale text, an 'X' score).
                    response = json.loads(completion.choices[0].message.content.lower())
                    score = response['score']
                    rationale = response['rationale']
                except Exception as e:
                    print(f"Failed for {idx} and {aspect} with error {e}")
                    fails += 1
                    score = 'NA'
                    rationale = 'NA'

                # `row` is a per-iteration copy from iterrows(), so adding
                # columns here does not touch `data`.
                row[f'chatgpt_{aspect}_{prompt_type}_score'] = score
                row[f'chatgpt_{aspect}_{prompt_type}_rationale'] = rationale
            responses.append(row)

        if fails:
            print(f"{aspect}: {fails} request(s) failed and were recorded as 'NA'")
        responses = pd.DataFrame(responses)
        responses.to_csv(f"{save_path}/{aspect}_results.csv", index=False)


# Batch Inference

In [57]:

def chatgpt_batch_inf(test_data,aspect, prompt_type, save_path):
    """Create and submit an OpenAI batch job that scores every row of ``test_data``.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Needs the columns 'review_point' and 'id'; 'id' becomes the request's
        ``custom_id``.
    aspect : str
        Aspect to score; also selects the sheet of ``all_incontext_examples``.
    prompt_type : str
        Prompt variant forwarded to ``get_prompt``.
    save_path : str
        Accepted for interface compatibility; not used by this function.

    Side effects: writes ``batch_data/{aspect}_batch_input.jsonl``, uploads it,
    creates the batch, and writes ``batch_data/{aspect}_batch_input_meta_data.json``
    with the ids needed to fetch the results later.
    """
    in_context_examples = all_incontext_examples[aspect]
    temperature = 0.1

    lines = []
    for _, row in test_data.iterrows():
        prompt = get_prompt(review_point=row['review_point'], aspect=aspect,
                            prompt_type=prompt_type, in_context_examples=in_context_examples)
        lines.append({
            "custom_id": f"{row['id']}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": model_name,
                "temperature": temperature,
                "messages": [
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": prompt},
                ],
            },
        })

    # Write the batch input file, one JSON request per line.
    os.makedirs("batch_data", exist_ok=True)
    batch_file_path = f"batch_data/{aspect}_batch_input.jsonl"
    with open(batch_file_path, 'w') as f:
        for request in lines:
            json.dump(request, f)
            f.write('\n')

    # Upload the batch file; the context manager closes the handle afterwards.
    with open(batch_file_path, "rb") as batch_file:
        batch_input_file = client.files.create(file=batch_file, purpose="batch")

    # Create the batch request.
    batch_input_file_id = batch_input_file.id
    batch_data = client.batches.create(
        input_file_id=batch_input_file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
            # Built from the values actually sent, so it cannot drift from them.
            "description": f"batch file for  {aspect} model {model_name}, temperature {temperature}"
        })

    batch_metadata = {
        "batch_id": batch_data.id,
        "aspect": aspect,
        "prompt_type": prompt_type,
        "batch_input_file_id": batch_input_file_id,
        "batch_file_path": batch_file_path
    }
    with open(f"batch_data/{aspect}_batch_input_meta_data.json", 'w') as f:
        json.dump(batch_metadata, f, indent=4)

    print(f"Batch file for {aspect} and {prompt_type} has been created and uploaded successfully")


## This will run the batch inference, and save the meta-data for the batch

In [58]:
# aspects = ['actionability', 'grounding_specificity', 'verifiability', 'helpfulness']
# Aspects to submit in this run.
aspects = [ 'grounding_specificity', 'verifiability', 'helpfulness']


for aspect in aspects:
    aspect_csv_path = f"test_data/{aspect}_test_data.csv"
    # Use a dedicated name: rebinding `test_data` here would replace the
    # aspect -> {'data', 'examples'} dict that chatgpt_inf expects.
    aspect_test_df = pd.read_csv(aspect_csv_path)

    # Make sure every row has a stable id and persist it, so that the batch
    # output (keyed by custom_id) can be joined back to the rows later.
    if 'id' not in aspect_test_df.columns:
        aspect_test_df['id'] = range(1, len(aspect_test_df) + 1)
        aspect_test_df.to_csv(aspect_csv_path, index=False)

    chatgpt_batch_inf(test_data=aspect_test_df,aspect=aspect, prompt_type='definitions_incontext_learning', save_path='results')



KeyError: 'rationale'

In [48]:
def retrive_batch_and_save_results(batch_data):
    """Download the output file of a batch to ``batch_output/{aspect}_chatgpt_output.jsonl``.

    Parameters
    ----------
    batch_data : dict
        Batch metadata; the keys 'aspect' and 'batch_id' are read.

    Raises
    ------
    ValueError
        If the batch has no output file to download.
    """
    aspect = batch_data['aspect']
    batch_id = batch_data['batch_id']

    output_file_id = client.batches.retrieve(batch_id).output_file_id
    if output_file_id is None:
        # Without this guard the download fails with an unrelated client error.
        raise ValueError(f"Batch {batch_id} for {aspect} has no output file to download")

    chatgpt_response = client.files.content(output_file_id)

    os.makedirs("batch_output", exist_ok=True)
    file_path = f"batch_output/{aspect}_chatgpt_output.jsonl"
    with open(file_path, 'w') as file:
        file.write(chatgpt_response.text + '\n')


In [49]:
import re

def _parse_model_answer(answer):
    """Return the dict encoded in a model reply, or None if it holds no JSON object."""
    if not isinstance(answer, str):
        return None
    fenced = re.search(r'```json\n(.*?)\n```', answer, re.DOTALL)
    # Prefer the fenced block; otherwise try the whole reply as plain JSON.
    payload = fenced.group(1) if fenced else answer.strip()
    try:
        parsed = json.loads(payload)
    except json.JSONDecodeError:
        return None
    return parsed if isinstance(parsed, dict) else None


def save_results(batch_data):
    """Join downloaded batch answers with the test rows and write the results CSV.

    Parameters
    ----------
    batch_data : dict
        Batch metadata; the keys 'aspect' and 'prompt_type' are read.

    Reads ``batch_output/{aspect}_chatgpt_output.jsonl`` and
    ``test_data/{aspect}_test_data.csv`` and writes
    ``outputs/{aspect}_results.csv``. That file holds the input rows that
    received a parsable answer, extended with
    ``chatgpt_{aspect}_{prompt_type}_score`` and ``..._rationale``. Rows without
    a usable answer are reported on stdout and left out.
    """
    aspect = batch_data['aspect']
    prompt_type = batch_data['prompt_type']

    # The batch output is JSON Lines: one JSON object per non-empty line. Each
    # record echoes the request's custom_id, which was built from the row id.
    records_by_id = {}
    with open(f"batch_output/{aspect}_chatgpt_output.jsonl") as output_file:
        for line in output_file:
            line = line.strip()
            if not line:
                continue
            record = json.loads(line)
            records_by_id[str(record.get('custom_id'))] = record

    raw_data_df = pd.read_csv(f"test_data/{aspect}_test_data.csv")

    final_rows = []
    for _, raw_row in raw_data_df.iterrows():
        # Format the id exactly as it was formatted when the request was built.
        row_id = f"{raw_row['id']}"
        record = records_by_id.get(row_id)
        if record is None:
            print(f"No ChatGPT response found for id {row_id}.")
            continue

        try:
            answer = record['response']['body']['choices'][0]['message']['content']
        except (KeyError, IndexError, TypeError):
            # Failed requests carry no usable response body.
            print(f"Malformed or failed response for id {row_id}.")
            continue

        json_obj = _parse_model_answer(answer)
        if json_obj is None or 'score' not in json_obj or 'rationale' not in json_obj:
            print("No valid JSON found.")
            continue

        result = raw_row.to_dict()
        result[f'chatgpt_{aspect}_{prompt_type}_score'] = json_obj['score']
        result[f'chatgpt_{aspect}_{prompt_type}_rationale'] = json_obj['rationale']
        final_rows.append(result)

    os.makedirs('outputs', exist_ok=True)
    # One file per aspect, so aspects no longer overwrite each other's results.
    pd.DataFrame(final_rows).to_csv(f'outputs/{aspect}_results.csv', index=False)


### Run this every once in a while: it checks whether each batch has completed and, if so, retrieves the results and saves them


In [None]:
import os

# Check every submitted batch (one metadata file each) and collect the results
# of those that have finished.
for file in os.listdir('batch_data'):
    if 'batch_input_meta_data.json' not in file:
        continue

    # Context manager so the metadata file handle is closed after reading.
    with open(f'batch_data/{file}') as meta_file:
        batch_data = json.load(meta_file)

    status = client.batches.retrieve(batch_data['batch_id']).status
    if status == 'completed':
        print(f"Batch file for {batch_data['aspect']} and {batch_data['prompt_type']} has been completed")
        retrive_batch_and_save_results(batch_data)
        save_results(batch_data)
    else:
        # Report the state so an unfinished or failed batch is not skipped silently.
        print(f"Batch file for {batch_data['aspect']} and {batch_data['prompt_type']} is not completed yet (status: {status})")

