In [642]:
import json
import os
import numpy as np
from scipy.stats import ttest_1samp
from typing import List, Dict, Any, Callable, Union
import pandas as pd

In [643]:
def open_json(file_path: str) -> Dict:
    """Load and return the JSON document stored at ``file_path``.

    The file is opened in text mode with the platform default encoding and
    is closed again before the parsed content is returned.
    """
    with open(file_path, 'r') as json_file:
        parsed = json.load(json_file)
    return parsed


def by_procedure(p_values: List[float], q: float) -> List[int]:
    """Select hypotheses to reject with the BY (harmonic-corrected) step-up rule.

    The p-values are sorted in ascending order and the p-value at rank ``i``
    (1-based) is compared with ``(i / m) * (q / H_m)``, where ``m`` is the
    number of p-values and ``H_m = 1 + 1/2 + ... + 1/m``. Every hypothesis up
    to and including the largest rank that meets its threshold is rejected,
    even if some smaller rank did not meet its own threshold.

    Args:
        p_values: One p-value per hypothesis.
        q: Target false discovery rate.

    Returns:
        Positions (into ``p_values``) of the rejected hypotheses, as plain
        Python ints ordered by ascending p-value. Empty when nothing is
        rejected or when ``p_values`` is empty.
    """
    p_values = np.asarray(p_values, dtype=float)
    m = len(p_values)
    if m == 0:
        # Nothing to test; also avoids dividing q by an empty harmonic sum (0.0).
        return []

    sorted_indices = np.argsort(p_values)
    sorted_pvals = p_values[sorted_indices]

    # Harmonic sum H_m = 1 + 1/2 + ... + 1/m
    ranks = np.arange(1, m + 1)
    H_m = np.sum(1.0 / ranks)
    # BY threshold for each rank i
    by_thresholds = (ranks / m) * (q / H_m)

    # Step-up: only the largest passing rank matters. NaN p-values compare
    # False and therefore never pass.
    passing_ranks = np.flatnonzero(sorted_pvals <= by_thresholds)
    if passing_ranks.size == 0:
        return []
    max_i = int(passing_ranks[-1])

    # .tolist() yields native ints, matching the List[int] annotation.
    return sorted_indices[:max_i + 1].tolist()


def accuracy(pred: Any, annotations: List[Any]) -> float:
    """Return the fraction of ``annotations`` that compare equal to ``pred``."""
    agreement_flags = []
    for reference in annotations:
        agreement_flags.append(pred == reference)
    return float(np.mean(agreement_flags))


def neg_rmse(pred: Union[int, float], annotations: List[Union[int, float]]) -> float:
    """Return minus the root-mean-square error between ``pred`` and ``annotations``.

    The sign is flipped so that a larger value means closer agreement.
    """
    squared_errors = [(pred - reference) ** 2 for reference in annotations]
    rmse = float(np.sqrt(np.mean(squared_errors)))
    return -1 * rmse


def sim(pred: str, annotations: List[str], similarity_func: Callable) -> float:
    """Average ``similarity_func(pred, annotation)`` over all ``annotations``."""
    similarities = []
    for reference in annotations:
        similarities.append(similarity_func(pred, reference))
    return float(np.mean(similarities))


def ttest(indicators, epsilon: float) -> float:
    """One-sample, one-sided t-test p-value for ``indicators`` against ``epsilon``.

    Uses ``alternative='less'``, so a small p-value is evidence that the mean
    of ``indicators`` is below ``epsilon``.
    """
    test_result = ttest_1samp(indicators, epsilon, alternative='less')
    return test_result.pvalue


def alt_test(llm_annotations: Dict[Union[int, str], Any],
             humans_annotations: Dict[Union[int, str], Dict[Union[int, str], Any]],
             scoring_function: Union[str, Callable] = 'accuracy',
             epsilon: float = 0.2,
             q_fdr: float = 0.05,
             min_humans_per_instance: int = 2,
             min_instances_per_human: int = 30):
    """Compare an LLM annotator against each human annotator in turn.

    For every human annotator that has enough usable instances, that
    annotator is left out and, instance by instance, both the left-out
    human's label and the LLM's label are scored against the remaining
    humans' labels with ``scoring_function``. Per instance the function
    records whether the LLM scored at least as well as the human and vice
    versa (a tie counts for both). The per-instance difference
    ``human_indicator - llm_indicator`` is then tested with ``ttest`` against
    ``epsilon``, and the resulting p-values are passed to ``by_procedure``
    with level ``q_fdr``.

    Args:
        llm_annotations: ``instance id -> LLM label``.
        humans_annotations: ``annotator id -> {instance id -> label}``.
        scoring_function: ``'accuracy'``, ``'neg_rmse'``, or a callable
            ``f(prediction, list_of_reference_labels) -> score`` where a
            higher score means better alignment.
        epsilon: Value the mean indicator difference is tested against.
        q_fdr: Level handed to ``by_procedure``.
        min_humans_per_instance: Instances annotated by fewer humans than
            this are ignored.
        min_instances_per_human: Annotators with fewer usable instances than
            this are skipped.

    Returns:
        ``(winning_rate, advantage_prob)``: the fraction of evaluated
        annotators whose hypothesis was rejected by ``by_procedure``, and the
        mean over evaluated annotators of the share of instances on which the
        LLM scored at least as well as that annotator.

    Raises:
        ValueError: If ``scoring_function`` is an unknown name, or if no
            annotator has at least ``min_instances_per_human`` usable
            instances (so neither statistic is defined).
    """
    # Resolve a metric name to its implementation; callables are used as given.
    if isinstance(scoring_function, str):
        if scoring_function == 'accuracy':
            scoring_function = accuracy
        elif scoring_function == 'neg_rmse':
            scoring_function = neg_rmse
        else:
            raise ValueError("Unknown scoring function")

    # i_set: annotator -> instances they labelled; h_set: instance -> annotators
    i_set, h_set = {}, {}
    for h, anns in humans_annotations.items():
        i_set[h] = list(anns.keys())
        for i in anns:
            h_set.setdefault(i, []).append(h)

    # Keep instances that have enough human annotators AND an LLM label.
    # The two drop reasons are counted separately so each message is accurate.
    enough_humans = {i for i in h_set if len(h_set[i]) >= min_humans_per_instance}
    instances_to_keep = {i for i in enough_humans if i in llm_annotations}
    n_too_few_humans = len(h_set) - len(enough_humans)
    n_missing_llm = len(enough_humans) - len(instances_to_keep)
    if n_too_few_humans:
        print(f"Dropped {n_too_few_humans} instances with less than {min_humans_per_instance} annotators.")
    if n_missing_llm:
        print(f"Dropped {n_missing_llm} instances without an LLM annotation.")
    i_set = {h: [i for i in i_set[h] if i in instances_to_keep] for h in i_set}
    h_set = {i: h_set[i] for i in h_set if i in instances_to_keep}

    p_values, advantage_probs, humans = [], [], []
    for excluded_h in humans_annotations:
        # Already restricted to kept instances, all of which have an LLM label.
        instances = i_set[excluded_h]
        if len(instances) < min_instances_per_human:
            print(f"Skipping annotator {excluded_h} with only {len(instances)} instances < {min_instances_per_human}.")
            continue

        llm_indicators = []
        excluded_indicators = []
        for i in instances:
            human_ann = humans_annotations[excluded_h][i]
            llm_ann = llm_annotations[i]
            remaining_anns = [humans_annotations[h][i] for h in h_set[i] if h != excluded_h]
            human_score = scoring_function(human_ann, remaining_anns)
            llm_score = scoring_function(llm_ann, remaining_anns)
            # Ties set both indicators, so they cancel out in the difference.
            llm_indicators.append(1 if llm_score >= human_score else 0)
            excluded_indicators.append(1 if human_score >= llm_score else 0)

        diff_indicators = [exc_ind - llm_ind for exc_ind, llm_ind in zip(excluded_indicators, llm_indicators)]
        p_values.append(ttest(diff_indicators, epsilon))
        advantage_probs.append(float(np.mean(llm_indicators)))
        humans.append(excluded_h)

    if not humans:
        # Previously this surfaced as a bare ZeroDivisionError further down.
        raise ValueError(
            "No annotator has at least "
            f"{min_instances_per_human} usable instances; cannot run the test."
        )

    rejected_indices = by_procedure(p_values, q_fdr)
    advantage_prob = float(np.mean(advantage_probs))
    winning_rate = len(rejected_indices) / len(humans)
    return winning_rate, advantage_prob

In [644]:
annotations_folder = "/home/abdelrahman.sadallah/mbzuai/review_rewrite/chatgpt/outputs"
# Metric name passed to alt_test for each aspect. The keys double as the
# sheet names read from the ChatGPT results workbook below.
datasets_scoring_functions = {
    'actionability': 'accuracy',
    'grounding_specificity': 'accuracy',
    'helpfulness': 'accuracy',
    'verifiability': 'accuracy',
}
# Epsilon passed to alt_test for each aspect.
datasets_epsilons = {
    'actionability': 0.5,
    'grounding_specificity': 0.5,
    'helpfulness': 0.5,
    'verifiability': 0.5,
}

all_human_annotations = pd.read_csv('/home/abdelrahman.sadallah/mbzuai/review_rewrite/data/human_annotation_gathered/all_human_annotations_processed.csv')

# Open the workbook once and pull every aspect sheet in a single call;
# with a list for sheet_name, read_excel returns {sheet name: DataFrame}.
chat_gpt_results_path = os.path.join(annotations_folder, "main_data_batch_random_results.xlsx")
chat_gpt_sheets = pd.read_excel(chat_gpt_results_path, sheet_name=list(datasets_scoring_functions.keys()))

chat_gpt_annotations = {}
# The loop is kept so that `aspect` stays bound to the last key afterwards,
# as it was before.
for aspect in datasets_scoring_functions.keys():
    chat_gpt_annotations[aspect] = chat_gpt_sheets[aspect]



In [645]:
# Name the sheet explicitly instead of relying on whatever value a previous
# loop left in `aspect`.
chat_gpt_annotations['verifiability'].columns

Index(['review_point', 'paper_id', 'venue', 'focused_review', 'batch', 'id',
       'chatgpt_verifiability_extraction_score',
       'chatgpt_verifiability_extraction_rationale', 'prompt',
       'chatgpt_verifiability_score', 'chatgpt_verifiability_rationale'],
      dtype='object')

In [646]:
import ast
from utils import annotators_unique_id_batch_id_map_inv
import json
import random 
# Per aspect: {'llm_annotations': {instance id -> label},
#              'humans_annotations': {annotator id -> {instance id -> label}}}
alt_test_data = {}

for aspect in datasets_scoring_functions.keys():

    # chat_gpt_annotations[aspect] = chat_gpt_annotations[aspect].sample(n= 170)

    ## take only rows from all_human_annotations that have the same review_point as the chat_gpt_annotations,
    ## then reorder the ChatGPT rows so that row i of both frames refers to the same review_point
    aspect_human_annotaion = all_human_annotations[all_human_annotations['review_point'].isin(chat_gpt_annotations[aspect]['review_point'])].reset_index()
    chat_gpt_annotations[aspect] = chat_gpt_annotations[aspect].set_index('review_point').reindex(aspect_human_annotaion['review_point']).reset_index()


    ######## FILTER GOLD LABELS ########
    # ## remove rows that have gold in the label type
    # aspect_human_annotaion = aspect_human_annotaion[~aspect_human_annotaion[f'{aspect}_label_type'].isin(['gold'])]
    # ## take only rows from chat_gpt_annotations that have the same review_point as the human_annotations
    # chat_gpt_annotations[aspect] = chat_gpt_annotations[aspect][chat_gpt_annotations[aspect]['review_point'].isin(aspect_human_annotaion['review_point'])].reset_index()
    # aspect_human_annotaion = aspect_human_annotaion.set_index('review_point').reindex(chat_gpt_annotations[aspect]['review_point']).reset_index()


    aspect_chat_gpt_alt_labels = {}
    aspect_human_alt_labels = {}

    for i,row in aspect_human_annotaion.iterrows():
        review_point = row['review_point']
        # Guard against the two frames having drifted out of alignment.
        assert review_point == chat_gpt_annotations[aspect]['review_point'][i]

        chatgpt_label = chat_gpt_annotations[aspect][f'chatgpt_{aspect}_score'][i]

        # The cell holds a dict written out as text. Parse it as a Python
        # literal first, so values containing apostrophes or double quotes
        # survive; fall back to the quote-swapping JSON parse for anything
        # literal_eval does not accept (e.g. JSON true/false/null).
        raw_annotation = row[aspect]
        try:
            label_annotators_dict = ast.literal_eval(raw_annotation)
        except (ValueError, SyntaxError):
            label_annotators_dict = json.loads(raw_annotation.replace("'", "\""))
        human_labels = label_annotators_dict['labels']
        human_annotators = label_annotators_dict['annotators']

        # Non-numeric ChatGPT outputs are mapped to label 0.
        chatgpt_label =  0 if chatgpt_label in ['X','x', 'no claim'] else chatgpt_label
        chatgpt_label = int(chatgpt_label)


        aspect_chat_gpt_alt_labels[f'instance_{i}'] = chatgpt_label
        # cnt is the 1-based position of the annotator within this instance;
        # it is only consumed by the "high disagreement" experiment below.
        for cnt, (label, annotator) in enumerate(zip(human_labels, human_annotators), start=1):
            human_id = annotators_unique_id_batch_id_map_inv[annotator]
            if human_id  not in aspect_human_alt_labels.keys():
                aspect_human_alt_labels[human_id] = {}

            # 'X' is mapped to label 0, like the ChatGPT side above.
            label = 0 if label == 'X' else label
            label = int(label)


            ### High disagreement between annotators ###
            # label = cnt
            ###### Random labels === Weak annotators #####
            # label = random.randint(1,5)

            #### Very strong annotators ####
            # label = 1


            aspect_human_alt_labels[human_id][f'instance_{i}'] = label

    alt_test_data[aspect] = {
        'llm_annotations': aspect_chat_gpt_alt_labels,
        'humans_annotations': aspect_human_alt_labels
    }
                



In [647]:
# alt_test_data['actionability']['humans_annotations']

In [648]:
# Run the test for every aspect and report pass/fail per LLM.
for aspect, aspect_inputs in alt_test_data.items():
    epsilon = datasets_epsilons[aspect]
    metric = datasets_scoring_functions[aspect]
    print(f"Computing {aspect}")
    humans_annotations = aspect_inputs['humans_annotations']
    # Keyed by model name so that more LLMs can be compared in the same pass.
    llm_candidates = {'gpt-4o': aspect_inputs['llm_annotations']}
    for llm_name, llm_annotations in llm_candidates.items():
        wr, ap = alt_test(llm_annotations, humans_annotations, metric, epsilon=epsilon)
        print(f"{aspect} {llm_name} [{'PASSED' if wr >= 0.5 else 'FAILED'}]:\tWinning Rate={wr:.2f}\tAdvantage Probability={ap:.2f}")
    print()

Computing actionability
Dropped 1 instances with less than 2 annotators.
actionability gpt-4o [PASSED]:	Winning Rate=1.00	Advantage Probability=0.72

Computing grounding_specificity
Dropped 1 instances with less than 2 annotators.
grounding_specificity gpt-4o [PASSED]:	Winning Rate=1.00	Advantage Probability=0.79

Computing helpfulness
Dropped 1 instances with less than 2 annotators.
helpfulness gpt-4o [PASSED]:	Winning Rate=1.00	Advantage Probability=0.76

Computing verifiability
Dropped 1 instances with less than 2 annotators.
verifiability gpt-4o [PASSED]:	Winning Rate=1.00	Advantage Probability=0.71



In [649]:
# all_human_annotations = pd.read_csv('/home/abdelrahman.sadallah/mbzuai/review_rewrite/data/human_annotation_gathered/all_human_annotations_processed.csv')

# aspects = [ 'actionability', 'grounding_specificity','verifiability', 'helpfulness']

# ## randomly select 200 samples from the rows that have "batch" > 4
# all_human_annotations = all_human_annotations[all_human_annotations ['batch'] > 4]
# all_human_annotations = all_human_annotations.sample(200)
# all_human_annotations = all_human_annotations.reset_index()
# ## save this to excel file with each aspect in a different sheet
# with pd.ExcelWriter('/home/abdelrahman.sadallah/mbzuai/review_rewrite/chatgpt/main_data/batches/batch_random.xlsx', engine='openpyxl') as writer:
#     for aspect in aspects:
#         all_human_annotations.to_excel(writer, sheet_name=aspect, index=False)

In [650]:
# aspects = [ 'actionability', 'grounding_specificity','verifiability', 'helpfulness']
# for aspect in aspects:
  
#     data = pd.read_excel('/home/abdelrahman.sadallah/mbzuai/review_rewrite/chatgpt/main_data/batches/batch_random.xlsx', sheet_name=aspect)
#     print(len(data))
#     print(data[f'{aspect}_label_type'].unique())