In [10]:
import pandas as pd
from openai import OpenAI, AsyncOpenAI
import os
import dotenv
dotenv.load_dotenv()
from prompt import *
client = OpenAI(api_key=os.environ.get("review_evaluation_mbzuai"))
model_name = 'gpt-4o'

# data = pd.read_excel('test_data/aspects_test_data.xlsx', sheet_name='verifiability')
data = pd.read_excel('test_data/gold_human_annotations.xlsx', sheet_name='verifiability')

## delete columns chatgpt_verifiability_definitions_incontext_learning_score','chatgpt_verifiability_definitions_incontext_learning_rationale

if 'chatgpt_verifiability_definitions_incontext_learning_score' in data.columns:
    data = data.drop(columns=['chatgpt_verifiability_definitions_incontext_learning_score','chatgpt_verifiability_definitions_incontext_learning_rationale'])
# data = data[:5]

In [11]:
data.columns

Index(['review_point', 'paper_id', 'venue', 'focused_review', 'actionability',
       'actionability_label', 'actionability_label_type', 'batch',
       'grounding_specificity', 'grounding_specificity_label',
       'grounding_specificity_label_type', 'verifiability',
       'verifiability_label', 'verifiability_label_type', 'helpfulness',
       'helpfulness_label', 'helpfulness_label_type', 'professional_tone',
       'professional_tone_label', 'professional_tone_label_type',
       'valid_point', 'valid_point_label', 'valid_point_label_type', 'id'],
      dtype='object')

In [12]:
EXTRACT_CLAIM = '''
Check if there is any claims in the text, or it just includes normal statements.

**Opinion & Claims**
- Subjective statements. For example, an opinion or a stand that the reviewer takes (like a disagreement with an experimental choice).
- Any suggestions or requests for changes. For example, stating that something is worth discussing, should be removed, or added.
- Any comments judging some parts of the paper. For example, stating something is hard to read, not detailed enough, or comments about how good or bad some section of the paper is.
- Any deductions or inferred observations that go beyond just stating facts or results from the paper.
- Generally, any phrases where the reviewer should provide evidence to back up their claim and help the authors understand it better. This can be direct or indirect:
    - Ex: “Important methods like X are not discussed”. We can infer that the reviewer suggests that method X should be discussed. Hence, the reviewer should state why this method should be discussed.

**Normal Statements**
Definition: The comment does not contain any claim, opinion, or suggestion and consists of only factual, descriptive statements that do not require any justification.
Clear and precise reasoning or explanation.
- References to external works/data, when applicable, are specific and relevant.
- Common-sense arguments are logically unassailable.
- Indicating that something exists, or missing without indicating that it should be removed or included.
- General statements about the paper, that don’t include an opinion.
- Objective and factual statements that don’t need any kind of verification.
- Asking for clarifications and general questions.
- Logical statements, or things that can be inferred directly.
- We treat positive claims as normal sentences, as they are of little use to the authors to improve their paper.
    - Example: This paper is well written, and the experimentation methods are well designed.

Generate a rationale and use it to output the score. The score should be "Yes" if there are claims, and "No" if there are no claims.
{examples}
Review Point:
{review_point}    

'''

CLAIM_VERIFICATION = '''
This aspect is aimed to maximize the utilization of the review comments for the authors. The primary purpose of the review is to help/guide authors in improving their drafts. Keep this in mind while evaluating the review point. Whenever you encounter a borderline case, think: “Will this review point help authors improve their draft?”. There is no correlation between the aspect score and the length of the review point.
Evaluate the review point based on the aspect description provided next.


Verifiability:
This aspect measures how well the claim in the text is verified. Evaluate how well the reviewer justifies or proves this claim by providing logical reasoning, using common sense or providing references. The claims' justification or validation can come before or after the claim. Claims don’t need to be stated directly; they can also be inferred.
**Verification**
- The claim is verified by providing logical reasoning.
- The claim is verified through common sense knowledge in the field. For example, referring to certain commonly used practices or standards.
- The claim is verified by providing external references.

Verifiability is rated on a scale from 1-5. We will now provide a definition for each.
1: Unverifiable
Definition: The comment contains a claim without any supporting evidence or justification.
Examples
The results fall behind previous work, and the reasons for this should be investigated.
For many of the datasets tested, the improvement over other approaches or even the general adversarial approach is marginal.
While the language has been improved, there are still some awkward phrases. I suggest the authors have the paper reviewed by a native English speaker.
2: Borderline Verifiable
Definition: The comment provides some support for its claim, but it is insufficient, vague, or not fully articulated. The authors will struggle to follow the justification.
Examples
This method shouldn’t achieve good results. If I remember correctly, I have read a paper that tried to do the same thing, but it didn’t work for them.
It is also unclear whether this momentum term could be a confounding factor in the comparison between PAL and SLS, as the vanilla version of SLS is just a stochastic line search applied to SGD without momentum.
In the experiments, the transfer tasks are too artificial. “At the pretraining stage, we train the models with examples from two classes (“bird" vs. “frog") for CIFAR-10 and four classes (0, 1, 2, and 3) for MNIST”.
3: Somewhat Verifiable
Definition: The comment provides support for its claim, but one or more key elements are missing, such as specific examples, detailed explanations, or supporting references. It requires significant effort from the authors to follow the justification.
Examples
The evaluative framework appears somewhat limited in scope. With considerations restricted to merely three Question-Answering tasks and two language models.
The nature of the contribution with respect to ECE_sweep is not clearly described in the text. Concretely, this amounts to a way to choose the number of bins using data
The approximation error is defined as the gap between the objective values, which is somehow ambiguous unless one has seen the values in the table.
4: Mostly Verifiable
Definition: The comment’s claim is sufficiently supported but has minor gaps. The reviewer could provide a more detailed explanation or reference to support their claims.
Examples
The statistical analysis appears incorrect because the p-values reported for the t-tests do not align with standard thresholds for significance.
The two used datasets are very related, where the input sequence is cocktail party speech, with one outputting the audio of each stream and the other producing the ASR output of each stream
As the paper states in the intro, double Q-learning was developed to address the overestimation problem of Q-learning. However, this cannot really be seen directly from the results in the paper. The explanation given in the paper suggests that double Q learning resolves the overestimation problem by achieving a fast convergence rate.
5: Fully Verifiable
Definition: The claim is thoroughly supported by explicit, sufficient, and robust evidence. This can be done by:
Clear and precise reasoning or explanation.
    - References to external works/data, when applicable, are specific and relevant.
    - Common-sense arguments are logically unassailable.
Examples
The landscape results in parameter space looks very surprising because it has no assumptions on the generator and discriminator architecture except for enough representation. This looks surprising to me because usually, this kind of global optimization result for neural networks needs strong assumptions on the architecture.
The first weakness of this work is that the wish list presented in the Introduction is a bit wider than the real techniques proposed by this work because the key difference of this work lies in the dynamic prior. The three properties were mentioned and basically solved by previous work like reference [21] and [27].
The paper’s main idea of mixing transfer-based and query-based attacks is not novel. There have already been multiple papers based on this idea [9, 19]. This paper simply proposes to combine the best transfer-based attack (TIMI) and one of the best L2 query-based attacks (SimBA), which results in SimBA++, which is the main gain over the previous approaches reported in the paper.

Generate a rationale and use it to output the score. 
{examples}

Review Point:
{review_point}
'''

EXTRACT_CLAIM = """

Claim Extraction

Objective:
Determine whether the given text contains claims or merely consists of factual statements.

Opinion & Claims

Subjective statements, including opinions or disagreements with experimental choices.

Suggestions or requests for changes (e.g., indicating something should be removed, added, or discussed).

Judgments about sections of the paper (e.g., stating that something is unclear, lacks detail, or is well-written).

Deductions or inferred observations beyond stating mere facts.

Any statement where evidence or justification is required to support the claim.

Normal Statements

Definition: A statement that does not contain an opinion, claim, or suggestion but consists solely of factual, descriptive content that requires no justification.

Indicating existence or absence of something without suggesting changes.

General statements about the paper that do not express an opinion.

Objective, factual statements that do not require verification.

Requests for clarification or general questions.

Logical statements or directly inferable information.

Positive claims (e.g., "The paper is well written") are considered neutral as they do not help authors improve their work.

Scoring Criteria:

Yes: If the text contains claims, opinions, or suggestions.

No: If the text consists solely of normal statements.

{examples}
Review Point:
{review_point}
"""



CLAIM_VERIFICATION ="""
Objective
The primary goal of this evaluation is to maximize the utility of review comments for authors. The purpose of a review is to help and guide authors in improving their drafts. Keep this in mind while assessing review points. When encountering a borderline case, ask: “Will this review point help authors improve their draft?” There is no correlation between the aspect score and the length of the review point.

Evaluation Aspect: Verifiability

Definition: This aspect measures how well a claim in the text is verified. Assess how well the reviewer justifies or proves the claim by providing logical reasoning, using common sense, or referencing external sources. Justifications or validations can appear before or after the claim. Claims may be explicitly stated or inferred.

Verification Methods:

Logical reasoning supports the claim.

Common sense knowledge in the field verifies the claim (e.g., referencing established practices or standards).

External references substantiate the claim.

Scoring Criteria (1-5 Scale)

1 - Unverifiable

Definition: The comment contains a claim without any supporting evidence or justification.
Examples:

"The results fall behind previous work, and the reasons for this should be investigated."

"For many datasets tested, the improvement over other approaches or even the general adversarial approach is marginal."

"While the language has been improved, there are still some awkward phrases. I suggest the authors have the paper reviewed by a native English speaker."

2 - Borderline Verifiable

Definition: The comment provides some support for its claim, but the justification is vague, insufficient, or not fully articulated. Authors may struggle to follow the reasoning.
Examples:

"This method shouldn’t achieve good results. If I remember correctly, I have read a paper that tried to do the same thing, but it didn’t work for them."

"It is also unclear whether this momentum term could be a confounding factor in the comparison between PAL and SLS, as the vanilla version of SLS is just a stochastic line search applied to SGD without momentum."

"In the experiments, the transfer tasks are too artificial. At the pretraining stage, we train the models with examples from two classes ('bird' vs. 'frog') for CIFAR-10 and four classes (0, 1, 2, and 3) for MNIST."

3 - Somewhat Verifiable

Definition: The comment provides support for its claim, but key elements are missing, such as specific examples, detailed explanations, or supporting references. Authors must make a significant effort to follow the justification.
Examples:

"The evaluative framework appears somewhat limited in scope, with considerations restricted to merely three Question-Answering tasks and two language models."

"The nature of the contribution with respect to ECE_sweep is not clearly described in the text. Concretely, this amounts to a way to choose the number of bins using data."

"The approximation error is defined as the gap between the objective values, which is ambiguous unless one has seen the values in the table."

4 - Mostly Verifiable

Definition: The comment’s claim is sufficiently supported but has minor gaps. The reviewer could provide a more detailed explanation or reference.
Examples:

"The statistical analysis appears incorrect because the p-values reported for the t-tests do not align with standard thresholds for significance."

"The two used datasets are very related, where the input sequence is cocktail party speech, with one outputting the audio of each stream and the other producing the ASR output of each stream."

"As the paper states in the intro, double Q-learning was developed to address the overestimation problem of Q-learning. However, this cannot really be seen directly from the results in the paper. The explanation given suggests that double Q-learning resolves the overestimation problem by achieving a fast convergence rate."

5 - Fully Verifiable

Definition: The claim is thoroughly supported by explicit, sufficient, and robust evidence. This can be achieved through:

Clear and precise reasoning or explanation.

Specific and relevant references to external works or data.

Logical and unassailable common-sense arguments.
Examples:

"The landscape results in parameter space look very surprising because they have no assumptions on the generator and discriminator architecture except for sufficient representation. This is surprising because such global optimization results for neural networks usually require strong assumptions on the architecture."

"The first weakness of this work is that the wish list presented in the Introduction is broader than the actual techniques proposed. The key difference of this work lies in the dynamic prior, while previous work such as references [21] and [27] had already addressed the three properties mentioned."

"The paper’s main idea of mixing transfer-based and query-based attacks is not novel. Several papers [9, 19] have already explored this concept. This work simply combines the best transfer-based attack (TIMI) and one of the best L2 query-based attacks (SimBA) to create SimBA++, which is the main gain over previous approaches."

Instructions:

Use the scoring scale to evaluate the verifiability of each review point.

Focus on how well-supported the claims are rather than the length of the comment.

Consider whether the review point meaningfully helps the authors improve their draft.

Generate a rationale and use it to output the score. 
{examples}

Review Point:
{review_point}

"""

all_incontext_examples = pd.read_excel('/home/abdelrahman.sadallah/mbzuai/review_rewrite/chatgpt/test_data/in_context_examples.xlsx', sheet_name=None)


In [13]:
all_incontext_examples.keys()

dict_keys(['actionability', 'grounding_specificity', 'verifiability', 'claim_extraction', 'helpfulness'])

In [14]:
import random
def get_prompt(review_point,aspect,prompt_type, in_context_examples):
    prompt = ''
    if prompt_type == 'extract_claim':
        in_context_examples = in_context_examples['claim_extraction']
        examples = ''
        examples_str = []
        ## choose two random samples with the label "X" that are not the same as the current review point
        for _ in range(1):
            while True:
                x_sample = in_context_examples[in_context_examples[f'{aspect}_label'] == 'No'].sample(1).iloc[0]
                if x_sample['review_point'] != review_point:
                    break
            ## change the score to No
            x_sample[f'{aspect}_label'] = 'No'
            examples_str.append(f'''
Review Point: {x_sample['review_point']}
rationale: {x_sample['rationale']}
score: {x_sample[f'{aspect}_label']}
''')
        ## choose two random samples with any of the other labels that are not the same as the current review point
        for _ in range(1):
            while True:
                non_x_samples = in_context_examples[in_context_examples[f'{aspect}_label'] != 'No']
                non_x_sample = non_x_samples.sample(1).iloc[0]
                if non_x_sample['review_point'] != review_point:
                    break
            non_x_sample[f'{aspect}_label'] = 'Yes'
            examples_str.append(f'''
Review Point: {non_x_sample['review_point']}
rationale: {non_x_sample['rationale']}
score: {non_x_sample[f'{aspect}_label']}
''')



        ## shuffle the list 
        random.shuffle(examples_str)
        examples = '\n'.join(examples_str)
        prompt = EXTRACT_CLAIM.format(review_point=review_point,examples=examples)

    elif prompt_type == 'claim_verification':
        in_context_examples = in_context_examples[aspect]
        examples = ''
        examples_str = []
        ##3 group examples by the label and choose a random example from each group
        unique_labels = in_context_examples[f'{aspect}_label'].unique()
        ## delete lebel X
        unique_labels = [label for label in unique_labels if label != 'X']
        for label in unique_labels:
            for _ in range(1):
                ## keep sampling a line till it is not the same as the currrent review point
                while True:
                    row = in_context_examples[in_context_examples[f'{aspect}_label']==label].sample(1)
                    row = row.iloc[0]
                    if row['review_point'] != review_point:
                        break
                
                score = row[f'{aspect}_label']
                rationale = row['rationale'] 


                examples_str.append(f'''
    Review Point: {row['review_point']}
    rationale: {rationale}
    score: {score}
    ''')
        ## shuffle the list 
        random.shuffle(examples_str)
        examples = '\n'.join(examples_str)

        prompt = CLAIM_VERIFICATION.format(review_point=review_point,examples=examples)


    return prompt

In [15]:
from tqdm import tqdm
import json
prompt_types = ['extract_claim','claim_verification']

def chatgpt_inf(data, prompt_type, save_path):
    output_df = []
## iterate over the df 
    for idx ,row in tqdm(data.iterrows(), total=data.shape[0]):
        fails = 0
        review_point = row['review_point']
        prompt = get_prompt(review_point=review_point,aspect='verifiability',prompt_type=prompt_type, in_context_examples=all_incontext_examples)

        # print(f"Prompt: {prompt}")
        # break

        try:
            clue_message = {"role": "user", "content": prompt}
            completion = client.chat.completions.create(
            # response_format={ "type": "json_object" },
            temperature=0.0,
            model=model_name,
            messages=[
                clue_message
            ]
            )
            response = str(completion.choices[0].message.content.lower())

            # if prompt_type == 'extract_claim':
            #     assert response in ['yes','no']
            #     cur_row = row.copy()
            #     cur_row[f'chatgpt_verifiability_definitions_incontext_learning_score'] = response
            #     output_df.append(cur_row)
            # elif prompt_type == 'claim_verification':
            rationale, score = response.split('score:')
            rationale = rationale.split('rationale:')[1]
            if rationale and score:
                score = score.strip()
                rationale = rationale.strip()


        except Exception as e:      
            print(response)
            print(f"Failed for {idx} with error {e}")
            fails += 1
            score = 'NA'
            rationale = 'NA'
            
        cur_row = row.copy()
        cur_row[f'chatgpt_verifiability_definitions_incontext_learning_score'] = score
        cur_row[f'chatgpt_verifiability_{prompt_type}_rationale'] = rationale
        output_df.append(cur_row)

    output_df = pd.DataFrame(output_df)
    return output_df



In [16]:
##### Claim extraction
output_df = chatgpt_inf(data, 'extract_claim', 'verifiability_claim_extraction_chatgpt.xlsx')

## update the data with the new scores, only add the column named chatgpt_verifiability_definitions_incontext_learning_score
data = data.merge(output_df[['review_point','chatgpt_verifiability_definitions_incontext_learning_score']], on='review_point', how='left')

## only keep the rows that have a yes in the chatgpt_verifiability_definitions_incontext_learning_score
verification_data = data[data['chatgpt_verifiability_definitions_incontext_learning_score']=='yes']

verification_data = chatgpt_inf(verification_data, 'claim_verification', 'verifiability_claim_verification_chatgpt.xlsx')





  0%|          | 0/144 [00:00<?, ?it/s]

 31%|███       | 44/144 [01:15<03:07,  1.88s/it]

Failed for 43 with error list index out of range


 33%|███▎      | 47/144 [01:19<02:33,  1.58s/it]

Failed for 46 with error not enough values to unpack (expected 2, got 1)


 36%|███▌      | 52/144 [01:28<02:25,  1.58s/it]

Failed for 51 with error list index out of range


 38%|███▊      | 54/144 [01:31<02:21,  1.58s/it]

Failed for 53 with error list index out of range


 46%|████▌     | 66/144 [01:50<02:25,  1.87s/it]

Failed for 65 with error list index out of range


 50%|█████     | 72/144 [02:02<02:30,  2.09s/it]

Failed for 71 with error list index out of range


 51%|█████▏    | 74/144 [02:05<02:07,  1.82s/it]

Failed for 73 with error list index out of range


 66%|██████▌   | 95/144 [02:40<01:13,  1.50s/it]

Failed for 94 with error list index out of range


 69%|██████▉   | 100/144 [02:48<01:10,  1.59s/it]

Failed for 99 with error list index out of range


 91%|█████████ | 131/144 [03:47<00:27,  2.12s/it]

Failed for 130 with error list index out of range


 92%|█████████▏| 133/144 [03:51<00:22,  2.00s/it]

Failed for 132 with error list index out of range


 95%|█████████▌| 137/144 [03:57<00:12,  1.81s/it]

Failed for 136 with error list index out of range


100%|██████████| 144/144 [04:09<00:00,  1.73s/it]
 59%|█████▊    | 71/121 [03:20<02:41,  3.22s/it]

Failed for 86 with error not enough values to unpack (expected 2, got 1)


 68%|██████▊   | 82/121 [03:48<01:44,  2.67s/it]

Failed for 98 with error not enough values to unpack (expected 2, got 1)


 89%|████████▉ | 108/121 [05:06<00:36,  2.84s/it]

Failed for 125 with error not enough values to unpack (expected 2, got 1)


100%|██████████| 121/121 [05:45<00:00,  2.86s/it]


In [17]:
temp_df = []
for idx, row in data.iterrows():
    cur_row = row.copy()
    if row['chatgpt_verifiability_definitions_incontext_learning_score'] == 'yes':
        ## get the new score and rationale
        new_row = verification_data[verification_data['review_point']==row['review_point']].iloc[0]
        score = new_row['chatgpt_verifiability_definitions_incontext_learning_score']
        rationale = new_row['chatgpt_verifiability_claim_verification_rationale']
        cur_row['chatgpt_verifiability_definitions_incontext_learning_score'] = score
        cur_row['chatgpt_verifiability_claim_verification_rationale'] = rationale
    temp_df.append(cur_row)
data = pd.DataFrame(temp_df)


In [18]:
# replace the vlue no with X in  chatgpt_verifiability_definitions_incontext_learning_score
data['chatgpt_verifiability_definitions_incontext_learning_score'] = data['chatgpt_verifiability_definitions_incontext_learning_score'].replace('no','X')
## convert the types of values to str
data['chatgpt_verifiability_definitions_incontext_learning_score'] = data['chatgpt_verifiability_definitions_incontext_learning_score'].astype(str)

data.to_excel('two_step_verifiability_all_gold_chatgpt_output_min_examples.xlsx', index=False)