In [118]:
import json
import os
import numpy as np
from scipy.stats import ttest_1samp
from typing import List, Dict, Any, Callable, Union
import pandas as pd

In [119]:
def open_json(file_path: str) -> Dict:
    """Load and return the JSON document stored at ``file_path``.

    The file is decoded as UTF-8 explicitly, so the result does not depend on
    the locale's default encoding.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON content (annotated as a dict by this function).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def by_procedure(p_values: List[float], q: float) -> List[int]:
    """Apply the Benjamini-Yekutieli (BY) step-up procedure to a list of p-values.

    The p-values are sorted ascending and the i-th smallest (1-based rank i) is
    compared with the threshold ``(i / m) * (q / H_m)``, where ``m`` is the
    number of p-values and ``H_m = 1 + 1/2 + ... + 1/m``. Every hypothesis up
    to the largest rank that meets its threshold is rejected, including
    lower-ranked ones that missed their own threshold.

    Args:
        p_values: P-values, one per hypothesis.
        q: Target false discovery rate.

    Returns:
        Indices into ``p_values`` of the rejected hypotheses as plain Python
        ints, ordered by ascending p-value. Empty when nothing is rejected or
        when ``p_values`` is empty.
    """
    p_values = np.array(p_values, dtype=float)
    m = len(p_values)
    if m == 0:
        # Nothing to test; also avoids dividing by a zero harmonic sum below.
        return []

    sorted_indices = np.argsort(p_values)
    sorted_pvals = p_values[sorted_indices]

    ranks = np.arange(1, m + 1)
    # Harmonic sum H_m = 1 + 1/2 + ... + 1/m
    H_m = np.sum(1.0 / ranks)
    # BY threshold for each rank i
    by_thresholds = (ranks / m) * (q / H_m)

    # Positions (in sorted order) whose p-value meets its own threshold.
    passing = np.flatnonzero(sorted_pvals <= by_thresholds)
    if passing.size == 0:
        return []

    # Step-up rule: reject everything up to the last passing rank.
    max_i = int(passing[-1])
    return sorted_indices[:max_i + 1].tolist()


def accuracy(pred: Any, annotations: List[Any]) -> float:
    """Return the fraction of ``annotations`` that compare equal to ``pred``.

    Args:
        pred: The label being scored.
        annotations: Reference labels to compare against.

    Returns:
        Mean of the per-annotation equality checks, as a float.
    """
    matches = []
    for reference in annotations:
        matches.append(pred == reference)
    return float(np.mean(matches))


def neg_rmse(pred: Union[int, float], annotations: List[Union[int, float]]) -> float:
    """Return the negated root-mean-square error of ``pred`` against ``annotations``.

    The sign is flipped so that a larger value means a closer match.

    Args:
        pred: The numeric value being scored.
        annotations: Reference numeric values.

    Returns:
        ``-sqrt(mean((pred - ann) ** 2))`` over all annotations, as a float.
    """
    squared_errors = []
    for reference in annotations:
        squared_errors.append((pred - reference) ** 2)
    rmse = float(np.sqrt(np.mean(squared_errors)))
    return -1 * rmse


def sim(pred: str, annotations: List[str], similarity_func: Callable) -> float:
    """Return the mean similarity between ``pred`` and each annotation.

    Args:
        pred: The text being scored.
        annotations: Reference texts.
        similarity_func: Callable invoked as ``similarity_func(pred, ann)``
            for every annotation; its results are averaged.

    Returns:
        The mean of the similarity values, as a float.
    """
    similarities = []
    for reference in annotations:
        similarities.append(similarity_func(pred, reference))
    return float(np.mean(similarities))


def ttest(indicators, epsilon: float) -> float:
    """Return the p-value of a one-sided, one-sample t-test.

    Tests the alternative that the mean of ``indicators`` is less than
    ``epsilon``.

    Args:
        indicators: Sample of observations.
        epsilon: Hypothesized population mean to compare against.

    Returns:
        The p-value reported by ``scipy.stats.ttest_1samp``.
    """
    result = ttest_1samp(indicators, popmean=epsilon, alternative='less')
    return result.pvalue


def alt_test(llm_annotations: Dict[Union[int, str], Any],
             humans_annotations: Dict[Union[int, str], Dict[Union[int, str], Any]],
             scoring_function: Union[str, Callable] = 'accuracy',
             epsilon: float = 0.2,
             q_fdr: float = 0.05,
             min_humans_per_instance: int = 2,
             min_instances_per_human: int = 30):
    """Compare an LLM annotator against each human annotator in turn.

    Each eligible human is excluded once. On every instance they annotated,
    both the LLM's label and the excluded human's label are scored against the
    remaining humans' labels. Per instance, an indicator records whether the
    LLM scored at least as well as the human, and vice versa (ties count for
    both). A one-sided t-test then checks whether the mean of
    ``human_indicator - llm_indicator`` is below ``epsilon``. The p-values are
    corrected with the Benjamini-Yekutieli procedure.

    Args:
        llm_annotations: Mapping ``instance_id -> LLM annotation``.
        humans_annotations: Mapping ``human_id -> {instance_id -> annotation}``.
        scoring_function: ``'accuracy'``, ``'neg_rmse'``, or a callable
            ``f(pred, annotations) -> score`` where higher is better.
        epsilon: Value the mean indicator difference is tested against.
        q_fdr: False discovery rate passed to the BY procedure.
        min_humans_per_instance: Instances with fewer human annotations are
            dropped.
        min_instances_per_human: Annotators with fewer usable instances are
            skipped as excluded annotators.

    Returns:
        ``(winning_rate, advantage_prob)``: the fraction of tested annotators
        whose null hypothesis was rejected, and the mean over tested
        annotators of the share of instances where the LLM scored at least as
        well as that annotator.

    Raises:
        ValueError: If ``scoring_function`` is an unknown name, or if no
            annotator has at least ``min_instances_per_human`` usable
            instances, so neither statistic can be computed.
    """
    # prepare alignment scoring function (callables are used as given)
    if isinstance(scoring_function, str):
        if scoring_function == 'accuracy':
            scoring_function = accuracy
        elif scoring_function == 'neg_rmse':
            scoring_function = neg_rmse
        else:
            raise ValueError("Unknown scoring function")

    # prepare sets - i_set has humans as keys, h_set has instances as keys
    i_set, h_set = {}, {}
    for h, anns in humans_annotations.items():
        i_set[h] = list(anns.keys())
        for i in anns:
            h_set.setdefault(i, []).append(h)

    # keep only instances with enough human annotators AND an LLM annotation
    instances_to_keep = {i for i in h_set if len(h_set[i]) >= min_humans_per_instance and i in llm_annotations}
    if len(instances_to_keep) < len(h_set):
        print(f"Dropped {len(h_set) - len(instances_to_keep)} instances with less than "
              f"{min_humans_per_instance} annotators or without an LLM annotation.")
    i_set = {h: [i for i in i_set[h] if i in instances_to_keep] for h in i_set}
    h_set = {i: h_set[i] for i in h_set if i in instances_to_keep}

    p_values, advantage_probs, humans = [], [], []
    for excluded_h in humans_annotations:
        llm_indicators = []
        excluded_indicators = []
        instances = [i for i in i_set[excluded_h] if i in llm_annotations]
        if len(instances) < min_instances_per_human:
            print(f"Skipping annotator {excluded_h} with only {len(instances)} instances < {min_instances_per_human}.")
            continue

        for i in instances:
            human_ann = humans_annotations[excluded_h][i]
            llm_ann = llm_annotations[i]
            # h_set is not filtered by min_instances_per_human, so annotators
            # skipped above still serve as references here.
            remaining_anns = [humans_annotations[h][i] for h in h_set[i] if h != excluded_h]
            human_score = scoring_function(human_ann, remaining_anns)
            llm_score = scoring_function(llm_ann, remaining_anns)
            # ties count as a win for both sides
            llm_indicators.append(1 if llm_score >= human_score else 0)
            excluded_indicators.append(1 if human_score >= llm_score else 0)

        diff_indicators = [exc_ind - llm_ind for exc_ind, llm_ind in zip(excluded_indicators, llm_indicators)]
        p_values.append(ttest(diff_indicators, epsilon))
        advantage_probs.append(float(np.mean(llm_indicators)))
        humans.append(excluded_h)

    if not humans:
        # Without this guard the winning-rate division below fails with an
        # uninformative ZeroDivisionError.
        raise ValueError(
            f"No annotator has at least {min_instances_per_human} usable instances; "
            "cannot compute the winning rate or advantage probability."
        )

    rejected_indices = by_procedure(p_values, q_fdr)
    advantage_prob = float(np.mean(advantage_probs))
    winning_rate = len(rejected_indices) / len(humans)
    return winning_rate, advantage_prob

In [120]:
# Root of the project checkout. Defaults to the original author's location;
# set the REVIEW_REWRITE_ROOT environment variable to run the notebook elsewhere.
project_root = os.environ.get("REVIEW_REWRITE_ROOT", "/home/abdelrahman.sadallah/mbzuai/review_rewrite")

# Folder holding one "<aspect>_random_samples_results.csv" file per aspect.
annotations_folder = os.path.join(project_root, "chatgpt", "outputs")
# CSV with the gathered human annotations.
human_annotations_path = os.path.join(
    project_root, "data", "human_annotation_gathered", "all_human_annotations_processed.csv"
)

# Scoring function name passed to alt_test for each aspect.
datasets_scoring_functions = {
    'actionability': 'accuracy',
    'grounding_specificity': 'accuracy',
    'helpfulness': 'accuracy',
    'verifiability': 'accuracy',
}
# Epsilon passed to alt_test for each aspect.
datasets_epsilons = {
    'actionability': 0.3,
    'grounding_specificity': 0.3,
    'helpfulness': 0.3,
    'verifiability': 0.3,
}

all_human_annotations = pd.read_csv(human_annotations_path)

# Load the per-aspect result file, keyed by aspect name.
chat_gpt_annotations = {}
for aspect in datasets_scoring_functions.keys():
    aspect_raw_annotations = pd.read_csv(os.path.join(annotations_folder, f"{aspect}_random_samples_results.csv"))
    chat_gpt_annotations[aspect] = aspect_raw_annotations



In [121]:
# Inspect the columns of one result file. The aspect is named explicitly rather
# than read from a loop variable left over from an earlier cell;
# 'verifiability' is the last key of datasets_scoring_functions.
chat_gpt_annotations['verifiability'].columns

Index(['review_point', 'paper_id', 'venue', 'focused_review', 'actionability',
       'actionability_label', 'actionability_label_type', 'batch',
       'grounding_specificity', 'grounding_specificity_label',
       'grounding_specificity_label_type', 'verifiability',
       'verifiability_label', 'verifiability_label_type', 'helpfulness',
       'helpfulness_label', 'helpfulness_label_type', 'professional_tone',
       'professional_tone_label', 'professional_tone_label_type',
       'valid_point', 'valid_point_label', 'valid_point_label_type',
       'chatgpt_verifiability_definitions_score',
       'chatgpt_verifiability_definitions_rationale'],
      dtype='object')

In [122]:
import ast

# For each aspect, build the two mappings that alt_test consumes:
#   'llm_annotations':    instance id -> ChatGPT score
#   'humans_annotations': annotator id -> {instance id -> human label}
alt_test_data = {}

for aspect in datasets_scoring_functions:
    # Keep only the human-annotated rows whose review_point also appears in
    # this aspect's ChatGPT results.
    shared_rows = all_human_annotations['review_point'].isin(chat_gpt_annotations[aspect]['review_point'])
    aspect_human_annotaion = all_human_annotations[shared_rows].reset_index()

    # Reorder the ChatGPT rows so that row i matches human row i.
    # Note: this overwrites chat_gpt_annotations[aspect] in place.
    chat_gpt_annotations[aspect] = (
        chat_gpt_annotations[aspect]
        .set_index('review_point')
        .reindex(aspect_human_annotaion['review_point'])
        .reset_index()
    )

    aspect_chat_gpt_alt_labels = {}
    aspect_human_alt_labels = {}
    score_column = f'chatgpt_{aspect}_definitions_score'

    for i, row in aspect_human_annotaion.iterrows():
        review_point = row['review_point']
        # Guard against the two frames getting out of step.
        assert review_point == chat_gpt_annotations[aspect]['review_point'][i]

        chatgpt_label = chat_gpt_annotations[aspect][score_column][i]
        # The human labels are stored as the string form of a Python literal.
        human_labels = ast.literal_eval(row[aspect])

        instance_id = f'instance_{i}'
        aspect_chat_gpt_alt_labels[instance_id] = chatgpt_label
        # NOTE(review): annotators are identified by their position in the
        # list - presumably position j is the same person on every row; confirm.
        for j, human_label in enumerate(human_labels):
            human_id = f'annotator_{j}'
            aspect_human_alt_labels.setdefault(human_id, {})[instance_id] = human_label

    alt_test_data[aspect] = {
        'llm_annotations': aspect_chat_gpt_alt_labels,
        'humans_annotations': aspect_human_alt_labels
    }
                



In [123]:
# Run the alt-test for every aspect and report whether the LLM passes.
for aspect in alt_test_data:
    epsilon = datasets_epsilons[aspect]
    metric = datasets_scoring_functions[aspect]
    print(f"Computing {aspect}")
    humans_annotations = alt_test_data[aspect]['humans_annotations']
    llm_annotations = alt_test_data[aspect]['llm_annotations']
    # Keyed by model name so that further LLMs can be added to the comparison.
    annotations_by_llm = {'gpt-4o': llm_annotations}
    for llm_name, candidate_annotations in annotations_by_llm.items():
        wr, ap = alt_test(candidate_annotations, humans_annotations, metric, epsilon=epsilon)
        # The LLM passes when it wins against at least half of the tested annotators.
        verdict = 'PASSED' if wr >= 0.5 else 'FAILED'
        print(f"{aspect} {llm_name} [{verdict}]:\tWinning Rate={wr:.2f}\tAdvantage Probability={ap:.2f}")
    print()

Computing actionability
Skipping annotator annotator_4 with only 11 instances < 30.
Skipping annotator annotator_5 with only 11 instances < 30.
Skipping annotator annotator_6 with only 11 instances < 30.
Skipping annotator annotator_7 with only 11 instances < 30.
actionability gpt-4o [FAILED]:	Winning Rate=0.00	Advantage Probability=0.33

Computing grounding_specificity
Skipping annotator annotator_4 with only 11 instances < 30.
Skipping annotator annotator_5 with only 11 instances < 30.
Skipping annotator annotator_6 with only 11 instances < 30.
Skipping annotator annotator_7 with only 11 instances < 30.
grounding_specificity gpt-4o [FAILED]:	Winning Rate=0.00	Advantage Probability=0.32

Computing helpfulness
Skipping annotator annotator_4 with only 11 instances < 30.
Skipping annotator annotator_5 with only 11 instances < 30.
Skipping annotator annotator_6 with only 11 instances < 30.
Skipping annotator annotator_7 with only 11 instances < 30.
helpfulness gpt-4o [FAILED]:	Winning Rat