## Analyze human annotation results

In [5]:
import pandas as pd 
import numpy as np 
from pathlib import Path 
from scipy.stats import ttest_rel, ttest_ind, spearmanr

In [2]:
df = pd.read_csv("../data/ecqa/secondbest/gpt4/with_nle_annotated.csv")
print(df.shape)
df.columns

(1500, 41)


Index(['HITId', 'HITTypeId', 'Title', 'Description', 'Keywords', 'Reward',
       'CreationTime', 'MaxAssignments', 'RequesterAnnotation',
       'AssignmentDurationInSeconds', 'AutoApprovalDelayInSeconds',
       'Expiration', 'NumberOfSimilarHITs', 'LifetimeInSeconds',
       'AssignmentId', 'WorkerId', 'AssignmentStatus', 'AcceptTime',
       'SubmitTime', 'AutoApprovalTime', 'ApprovalTime', 'RejectionTime',
       'RequesterFeedback', 'WorkTimeInSeconds', 'LifetimeApprovalRate',
       'Last30DaysApprovalRate', 'Last7DaysApprovalRate', 'Input.question',
       'Input.choice_A', 'Input.choice_B', 'Input.choice_C', 'Input.choice_D',
       'Input.choice_E', 'Input.answer', 'Input.explanation',
       'Answer.convincingness_after', 'Answer.convincingness_before',
       'Answer.correctness', 'Answer.fluency', 'Approve', 'Reject'],
      dtype='object')

In [3]:
def p_to_stars(pval, bonferroni=1):
    if pval < 0.001/bonferroni:
        return "***"
    elif pval < 0.01/bonferroni:
        return "**"
    elif pval < 0.05/bonferroni:
        return "*"
    return ""


def print_reports(df, bonferroni=3):
    print("Number of questions:", len(df.HITId.unique()))
    
    # Average score of each question.
    print("Average scores of each question:")
    scores = ["Answer.convincingness_before", "Answer.convincingness_after", "Answer.fluency", "Answer.correctness"]
    df_mean = df[["HITId"] + scores].groupby("HITId").mean()
    
    for sc in scores:
        print("\t{}:\t mean {:.4f}, sd {:.4f}".format(sc, df_mean[sc].mean(), df_mean[sc].std()))

    # T test
    print("t_test: convincingness before vs after", end="\t")
    s, pval = ttest_rel(df_mean["Answer.convincingness_before"], df_mean["Answer.convincingness_after"])
    sign = "<" if s < 0 else ">"
    stars = p_to_stars(pval, bonferroni=bonferroni)
    print("{} {}".format(sign, stars))

print_reports(df)


Number of questions: 500
Average scores of each question:
	Answer.convincingness_before:	 mean 2.9560, sd 0.9866
	Answer.convincingness_after:	 mean 3.5267, sd 0.9261
	Answer.fluency:	 mean 4.8520, sd 0.3459
	Answer.correctness:	 mean 4.6800, sd 0.5210
t_test: convincingness before vs after	< ***


### Proxy evaluation results

Up till now, the mixtral-8x7B results seem a bit weird. Vicuna-33B and WizardLM-70B show results that are `< ***`.

In [4]:
explainers = ["gpt4", "chat", "claude"]
proxy_models = ["mixtral-8x7B", "vicuna-33B", "wizardlm-70B"]
tasks = [
    "ecqa/secondbest",
    "nli/entail_to_neutral",
    "nli/contra_to_neutral"
]
for explainer in explainers:
    for task in tasks:
        for proxy in proxy_models:
            print(f"\nexplainer={explainer}, task={task}, proxy={proxy}")
            
            df = pd.read_csv(f"../data/{task}/{explainer}/with_nle_scored_by_{proxy}.csv")\
                .rename(columns={"convincingness_before": "Answer.convincingness_before",
                        "convincingness_after": "Answer.convincingness_after",
                        "fluency": "Answer.fluency",
                        "correctness": "Answer.correctness"})
            df["HITId"] = list(range(len(df)))  # Add this column so the print_reports(), which averages by HITId, can work
            print_reports(df)


explainer=gpt4, task=ecqa/secondbest, proxy=mixtral-8x7B
Number of questions: 500
Average scores of each question:
	Answer.convincingness_before:	 mean 2.6080, sd 0.9840
	Answer.convincingness_after:	 mean 2.5880, sd 0.8097
	Answer.fluency:	 mean 1.9520, sd 0.9998
	Answer.correctness:	 mean 2.9760, sd 0.2180
t_test: convincingness before vs after	> 

explainer=gpt4, task=ecqa/secondbest, proxy=vicuna-33B
Number of questions: 500
Average scores of each question:
	Answer.convincingness_before:	 mean 1.6400, sd 1.2302
	Answer.convincingness_after:	 mean 3.0120, sd 0.1546
	Answer.fluency:	 mean 1.3040, sd 0.7407
	Answer.correctness:	 mean 1.3880, sd 0.9139
t_test: convincingness before vs after	< ***

explainer=gpt4, task=ecqa/secondbest, proxy=wizardlm-70B
Number of questions: 500
Average scores of each question:
	Answer.convincingness_before:	 mean 3.3520, sd 1.2029
	Answer.convincingness_after:	 mean 3.7000, sd 0.9549
	Answer.fluency:	 mean 3.0840, sd 0.4211
	Answer.correctness:	 mean 

### Proxy evaluator vs humans

In [13]:
def evaluator_vs_human():
    dataset_task = "ecqa/secondbest"
    explainer = "gpt4"
    evaluators = ["mixtral-8x7B", "vicuna-33B", "wizardlm-70B"]

    scores = {
        "Answer.convincingness_before": "convincingness_before", 
        "Answer.convincingness_after": "convincingness_after", 
        "Answer.fluency": "fluency", 
        "Answer.correctness": "correctness"
    }
    human_results_raw = pd.read_csv(f"../data/{dataset_task}/{explainer}/with_nle_annotated.csv").rename(columns=scores)
    
    selected_columns = ["HITId"] + list(scores.values())
    df_mean = human_results_raw[selected_columns].groupby("HITId").mean()
    
    for evaluator in evaluators:
        eval_results = pd.read_csv(f"../data/{dataset_task}/{explainer}/with_nle_scored_by_{evaluator}.csv")

        print ("human vs {}".format(evaluator))
        for sc in scores.values():
            corr, corr_p = spearmanr(df_mean[sc], eval_results[sc])
            print("\tscore: {} corr={:.4f} {}".format(sc, corr, p_to_stars(corr_p)))

evaluator_vs_human()

human vs mixtral-8x7B
	score: convincingness_before corr=0.0777 
	score: convincingness_after corr=-0.0149 
	score: fluency corr=-0.0313 
	score: correctness corr=0.0023 
human vs vicuna-33B
	score: convincingness_before corr=-0.0199 
	score: convincingness_after corr=0.0556 
	score: fluency corr=0.0083 
	score: correctness corr=0.0184 
human vs wizardlm-70B
	score: convincingness_before corr=0.0221 
	score: convincingness_after corr=0.0399 
	score: fluency corr=0.0691 
	score: correctness corr=0.0371 
