In [4]:
import os

import pandas as pd
from scipy import stats
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score,cohen_kappa_score

# aspects = [ 'actionability', 'grounding_specificity', 'verifiability', 'helpfulness']
aspects = [ 'actionability']
types = ['definitions', 'definitions_examples', 'definitions_incontext_learning']

results = {}
## check if theres is file for each aspect
for aspect in aspects:
    try:
        results[aspect] = pd.read_csv(f'outputs/{aspect}_results.csv')
        print(f'File for {aspect} found')
    except:
        print(f'No file for {aspect}')

File for actionability found


In [5]:
def get_stats(pred, gold,aspect):

    if aspect != 'verifiability':
        f1 = f1_score(pred,gold, average="micro")
        kappa = cohen_kappa_score(pred,gold)
        kappa_linear = cohen_kappa_score(pred,gold, weights='linear')
        kappa_quadratic = cohen_kappa_score(pred,gold, weights='quadratic')
        spearman = stats.spearmanr(pred, gold)
        return f1, kappa, kappa_linear, kappa_quadratic, spearman

    else:
        new_pred = []
        new_gold = []
        new_pred_X = []
        new_gold_X = []
        for x,y in zip(pred,gold):
            ## map values to X
            if x in ['X','x', 'NO CLAIM']: x = 'X'
            if y in ['X','x', 'NO CLAIM']: y = 'X'

            # if one of the values is X, then add it to a differnt list
            if x == 'X' or y == 'X':
                x = 0 if x == 'X' else 1
                y = 0 if y == 'X' else 1
                new_pred_X.append(x)
                new_gold_X.append(y)
            else:
                new_pred.append(x)
                new_gold.append(y)
        
        gold = new_gold
        pred = new_pred
        f1 = f1_score(pred,gold, average="micro")
        kappa = cohen_kappa_score(pred,gold)
        kappa_linear = cohen_kappa_score(pred,gold, weights='linear')
        kappa_quadratic = cohen_kappa_score(pred,gold, weights='quadratic')
        spearman = stats.spearmanr(pred, gold)
        f1_X = f1_score(new_pred_X,new_gold_X, average="micro")
        return f1, kappa, kappa_linear, kappa_quadratic, spearman, f1_X
            
    

In [6]:

with open('results/chatgpt_results.txt', 'w') as f:
    for aspect, df in results.items():
        f.write(f'Agreement Statistics for {aspect}\n')
        f.write(f' Total number of samples: {len(df)}\n')

        for type in ['definitions_incontext_learning']:
            gold_labels = []
            chatgpt_labels = []
            for index, row in df.iterrows():
                gold_label = str(int(float(row[f'{aspect}_label']))) if row[f'{aspect}_label'] not in ['X','x', 'NO CLAIM', 'no claim'] else 'X'
                chatgpt_label = str(int(float(row[f'chatgpt_{aspect}_{type}_score']))) if row[f'chatgpt_{aspect}_{type}_score'] not in ['X','x', 'NO CLAIM','no claim'] else 'X'
                gold_labels.append(gold_label)
                chatgpt_labels.append(chatgpt_label)

            f.write(f' Agreement Statistics for {type}\n')
            ## for verifiability we have one mroe measure
            if aspect == 'verifiability':
                f1, kappa, kappa_linear, kappa_quadratic, spearman, f1_X = get_stats(gold=gold_labels, pred=chatgpt_labels, aspect=aspect)
            else:
                f1, kappa, kappa_linear, kappa_quadratic, spearman = get_stats(gold=gold_labels, pred=chatgpt_labels, aspect=aspect)

            f.write(f' F1 Score: {f1:.2f}\n')
            f.write(f' Kappa Score: {kappa:.2f}\n')
            f.write(f' Linear Kappa Score: {kappa_linear:.2f}\n')
            f.write(f' Quadratic Kappa Score: {kappa_quadratic:.2f}\n')
            f.write(f' Spearman Correlation: {spearman.correlation:.2f}\n')
            if aspect == 'verifiability':
                f.write(f' F1 Score for X: {f1_X:.2f}\n')
            
            f.write('-' * 50 + '\n')
                
        f.write('=' * 50 + '\n')
        
