## Analyze human annotation results

In [1]:
import pandas as pd 
import numpy as np 
from pathlib import Path 
import scipy 

In [2]:
# Load the "noS" annotation CSV; later cells read this frame through the name `df`.
df = pd.read_csv("../data/ECQA_SecondBest_method_noS_500_annotated.csv")
# Quick sanity check of the (rows, columns) count.
print(df.shape)
# Bare trailing expression so the notebook renders the column index as the cell output.
df.columns

(1500, 42)


Index(['HITId', 'HITTypeId', 'Title', 'Description', 'Keywords', 'Reward',
       'CreationTime', 'MaxAssignments', 'RequesterAnnotation',
       'AssignmentDurationInSeconds', 'AutoApprovalDelayInSeconds',
       'Expiration', 'NumberOfSimilarHITs', 'LifetimeInSeconds',
       'AssignmentId', 'WorkerId', 'AssignmentStatus', 'AcceptTime',
       'SubmitTime', 'AutoApprovalTime', 'ApprovalTime', 'RejectionTime',
       'RequesterFeedback', 'WorkTimeInSeconds', 'LifetimeApprovalRate',
       'Last30DaysApprovalRate', 'Last7DaysApprovalRate', 'Input.ECQA_Number',
       'Input.question', 'Input.choice_A', 'Input.choice_B', 'Input.choice_C',
       'Input.choice_D', 'Input.choice_E', 'Input.answer', 'Input.explanation',
       'Answer.convincingness_after', 'Answer.convincingness_before',
       'Answer.correctness', 'Answer.fluency', 'Approve', 'Reject'],
      dtype='object')

In [3]:
def p_to_stars(pval, bonferroni=1):
    """Translate a p-value into a significance marker.

    The cutoffs 0.001, 0.01 and 0.05 are each divided by ``bonferroni``
    before being compared with ``pval``. The marker of the strictest cutoff
    that ``pval`` lies strictly below is returned ("***", "**" or "*"); if
    ``pval`` is below none of them the result is the empty string.
    """
    # Ordered strictest-first so the first match is the strongest marker.
    levels = ((0.001, "***"), (0.01, "**"), (0.05, "*"))
    for cutoff, marker in levels:
        if pval < cutoff / bonferroni:
            return marker
    return ""


def print_reports(df, bonferroni=3):
    """Print per-question score summaries and a paired t-test on convincingness.

    Rows are grouped by ``HITId`` and the four ``Answer.*`` score columns are
    averaged within each group. The mean and standard deviation of those
    per-question averages are printed, followed by the direction ("<" or ">")
    and significance stars of a paired t-test comparing convincingness
    before vs. after.

    Args:
        df: DataFrame containing a ``HITId`` column plus the columns
            ``Answer.convincingness_before``, ``Answer.convincingness_after``,
            ``Answer.fluency`` and ``Answer.correctness``.
        bonferroni: Divisor applied to the significance thresholds inside
            ``p_to_stars``.

    Returns:
        None. Everything is written to stdout.
    """
    # Import the submodule explicitly: the file only does ``import scipy``,
    # which is not guaranteed to expose ``scipy.stats`` as an attribute on
    # every SciPy version.
    from scipy import stats

    print("Number of questions:", len(df["HITId"].unique()))

    # Average score of each question (one row per HITId after the groupby).
    print("Average scores of each question:")
    scores = [
        "Answer.convincingness_before",
        "Answer.convincingness_after",
        "Answer.fluency",
        "Answer.correctness",
    ]
    df_mean = df[["HITId"] + scores].groupby("HITId").mean()

    for sc in scores:
        print(f"\t{sc}:\t mean {df_mean[sc].mean():.4f}, sd {df_mean[sc].std():.4f}")

    # Paired t-test on the per-question means; only the sign of the statistic
    # and the star rating of the p-value are reported.
    print("t_test: convincingness before vs after", end="\t")
    s, pval = stats.ttest_rel(
        df_mean["Answer.convincingness_before"],
        df_mean["Answer.convincingness_after"],
    )
    # A negative statistic means the "before" mean is lower than the "after" mean.
    sign = "<" if s < 0 else ">"
    stars = p_to_stars(pval, bonferroni=bonferroni)
    print(f"{sign} {stars}")

# Report on the "noS" annotation frame loaded into `df` earlier, using the default bonferroni=3.
print_reports(df)


Number of questions: 500
Average scores of each question:
	Answer.convincingness_before:	 mean 2.9413, sd 1.0942
	Answer.convincingness_after:	 mean 3.6080, sd 1.0156
	Answer.fluency:	 mean 4.8613, sd 0.3355
	Answer.correctness:	 mean 4.5240, sd 0.5895
t_test: convincingness before vs after	< ***


In [4]:
# Same report for the "S" annotation CSV. This rebinds `df`, so the "noS" frame
# is no longer reachable under that name after this cell runs.
df = pd.read_csv("../data/ECQA_SecondBest_method_S_500_annotated.csv")
print_reports(df)

Number of questions: 500
Average scores of each question:
	Answer.convincingness_before:	 mean 2.9560, sd 0.9866
	Answer.convincingness_after:	 mean 3.5267, sd 0.9261
	Answer.fluency:	 mean 4.8520, sd 0.3459
	Answer.correctness:	 mean 4.6800, sd 0.5210
t_test: convincingness before vs after	< ***


## Proxy evaluation results

In [5]:
# Run the same report over every (model, task) CSV found under ../data/<model>/.
# NOTE(review): the saved output for mixtral-8x7B is identical for the noS and S
# tasks -- confirm the two CSVs are not copies of the same file.
models = ["mixtral-8x7B", "vicuna-33B"]
tasks = [
    "ECQA_SecondBest_method_noS_500",
    "ECQA_SecondBest_method_S_500",
    "NLI_entail_to_neutral",
]
for model in models:
    for task in tasks:
        print(f"\nmodel={model}, task={task}")
        df = pd.read_csv(f"../data/{model}/{task}.csv")
        # Bring the score columns in line with the "Answer.*" names that
        # print_reports looks up.
        df = df.rename(
            columns={
                "convincingness_before": "Answer.convincingness_before",
                "convincingness_after": "Answer.convincingness_after",
                "fluency": "Answer.fluency",
                "correctness": "Answer.correctness",
            }
        )
        # Give every row its own HITId so the groupby inside print_reports
        # keeps each row as a separate group.
        df = df.assign(HITId=range(len(df)))
        print_reports(df)


model=mixtral-8x7B, task=ECQA_SecondBest_method_noS_500
Number of questions: 500
Average scores of each question:
	Answer.convincingness_before:	 mean 2.6080, sd 0.9840
	Answer.convincingness_after:	 mean 2.5880, sd 0.8097
	Answer.fluency:	 mean 1.9520, sd 0.9998
	Answer.correctness:	 mean 2.9760, sd 0.2180
t_test: convincingness before vs after	> 

model=mixtral-8x7B, task=ECQA_SecondBest_method_S_500
Number of questions: 500
Average scores of each question:
	Answer.convincingness_before:	 mean 2.6080, sd 0.9840
	Answer.convincingness_after:	 mean 2.5880, sd 0.8097
	Answer.fluency:	 mean 1.9520, sd 0.9998
	Answer.correctness:	 mean 2.9760, sd 0.2180
t_test: convincingness before vs after	> 

model=mixtral-8x7B, task=NLI_entail_to_neutral
Number of questions: 300
Average scores of each question:
	Answer.convincingness_before:	 mean 2.7667, sd 0.7028
	Answer.convincingness_after:	 mean 2.9933, sd 0.1155
	Answer.fluency:	 mean 1.7800, sd 0.9771
	Answer.correctness:	 mean 2.9467, sd 0.32