In [1]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.metrics import cohen_kappa_score
import csv
import pandas as pd


In [2]:
def get_pool(paths):
    """Load a tab-separated judgment file into a ``{key: score}`` mapping.

    Every non-blank line must hold exactly four tab-separated fields:
    ``turn_id``, a field that is ignored, ``passage_id`` and an integer
    ``score``. The mapping key is ``turn_id + '****' + passage_id``.

    Args:
        paths: Path of the single file to read (the plural name is kept for
            backward compatibility).

    Returns:
        A ``defaultdict(int)``: looking up an unknown key yields ``0`` and
        inserts it. When a key occurs more than once, the last score wins.

    Raises:
        ValueError: If a non-blank line does not have exactly four
            tab-separated fields, or its score is not an integer.
    """
    pool = defaultdict(int)

    with open(paths, 'r') as f:
        # Stream the file line by line instead of loading it all up front.
        for line_no, line in enumerate(f, start=1):
            # Whitespace-only lines (typically a trailing empty line) carry no
            # judgment; skip them rather than crash on the unpack below.
            if not line.strip():
                continue

            fields = line.rstrip('\r\n').split('\t')
            if len(fields) != 4:
                raise ValueError(
                    '{}:{}: expected 4 tab-separated fields, got {}'.format(
                        paths, line_no, len(fields)))

            turn_id, _, passage_id, score = fields
            pool[turn_id + '****' + passage_id] = int(score.strip())

    return pool

In [3]:
def report_kappa(path_gpt, path_nist):
    """Compute Cohen's kappa between two judgment files on their shared keys.

    Both files are loaded with ``get_pool``. Only keys present in both pools
    are compared, visited in the order they appear in ``path_gpt``.

    Args:
        path_gpt: Path of the first judgment file (model labels).
        path_nist: Path of the second judgment file (reference labels).

    Returns:
        A ``(kappa_binary, kappa_graded)`` tuple. ``kappa_graded`` uses the
        raw integer scores; ``kappa_binary`` uses the scores collapsed to
        ``0`` (score < 2) or ``1`` (score >= 2).
    """
    gpt_pool = get_pool(path_gpt)
    nist_pool = get_pool(path_nist)

    # Pair up the two scores for every key judged in both files. The
    # membership test runs before the lookup, so the defaultdict never
    # inserts a default entry here.
    paired = [(score, nist_pool[key])
              for key, score in gpt_pool.items()
              if key in nist_pool]
    gpt_label = [gpt_score for gpt_score, _ in paired]
    nist_label = [nist_score for _, nist_score in paired]

    kappa_graded = cohen_kappa_score(gpt_label, nist_label)

    # Collapse the graded scale to binary: scores of 2 and above become 1.
    gpt_binary = [int(score >= 2) for score in gpt_label]
    nist_binary = [int(score >= 2) for score in nist_label]

    kappa_binary = cohen_kappa_score(nist_binary, gpt_binary)

    return kappa_binary, kappa_graded

In [4]:

def get_test_subset(file_path="splitted_data.txt"):
    """Return the ``turn_id****passage_id`` keys of the rows marked ``test``.

    The file is read as tab-separated values with eight columns per row, in
    this order: turn_id, user_utterance, response, passage_id, passage_txt,
    score, ptkb and the split marker. All values are kept as strings.

    Args:
        file_path: Path of the tab-separated split file. Defaults to
            ``"splitted_data.txt"``, the path this function always used.

    Returns:
        A list of ``turn_id + '****' + passage_id`` strings, one per row whose
        split marker equals ``'test'``, in file order.
    """
    # 'lable' (sic) is only a local column name; it never leaves this function.
    columns = ['turn_id', 'user_utterance', 'response', 'passage_id',
               'passage_txt', 'score', 'ptkb', 'lable']

    # newline='' is what the csv module documents for files handed to a reader.
    with open(file_path, 'r', newline='') as file:
        reader = csv.reader(file, delimiter='\t')
        # The reader is lazy, so the frame must be built while the file is open.
        df = pd.DataFrame(reader, columns=columns)

    test_set = df[df['lable'] == 'test']

    # Walk only the two needed columns instead of materialising every row.
    return [turn_id + '****' + passage_id
            for turn_id, passage_id in zip(test_set['turn_id'],
                                           test_set['passage_id'])]

In [5]:
def report_kappa_on_subset(path_gpt, path_nist):
    """Compute Cohen's kappa between two judgment files on the test subset.

    Both files are loaded with ``get_pool``. A key is compared only when it is
    listed by ``get_test_subset()`` and is present in both pools; keys are
    visited in the order they appear in ``path_gpt``.

    Args:
        path_gpt: Path of the first judgment file (model labels).
        path_nist: Path of the second judgment file (reference labels).

    Returns:
        A ``(kappa_binary, kappa_graded)`` tuple. ``kappa_graded`` uses the
        raw integer scores; ``kappa_binary`` uses the scores collapsed to
        ``0`` (score < 2) or ``1`` (score >= 2).
    """
    gpt_pool = get_pool(path_gpt)
    nist_pool = get_pool(path_nist)

    # A set makes each membership test O(1); the helper returns a list.
    subset = set(get_test_subset())

    # Require the key in the reference pool as well, as report_kappa does.
    # The pools are defaultdicts, so indexing a missing key would otherwise
    # silently invent a reference score of 0 and skew the agreement.
    paired = [(score, nist_pool[key])
              for key, score in gpt_pool.items()
              if key in subset and key in nist_pool]
    gpt_label = [gpt_score for gpt_score, _ in paired]
    nist_label = [nist_score for _, nist_score in paired]

    kappa_graded = cohen_kappa_score(gpt_label, nist_label)

    # Collapse the graded scale to binary: scores of 2 and above become 1.
    gpt_binary = [int(score >= 2) for score in gpt_label]
    nist_binary = [int(score >= 2) for score in nist_label]

    kappa_binary = cohen_kappa_score(nist_binary, gpt_binary)

    return kappa_binary, kappa_graded

In [None]:

# Judgment file every model output below is compared against.
path_nist = 'pools/human_qrels_tab'

# Output-file stem (under outputs/) -> row label printed in the table.
models_main_name = {
    'gpt3.5-one-shot-pool-V2': 'one-shot',
    'gpt3.5-one-shot-pool-V2-temp0': 'one-shot (tmp=0)',
    'gpt3.5-two-shot-pool': 'two-shot',
    'gpt3.5-two-shot-pool-V2-temp0': 'two-shot (tmp=0)',
    'gpt3.5-zero-shot-pool': 'zero-shot',
    'gpt3.5-zero-shot-paul-pool-temp0-': 'zero-shot (tmp=0)',
}

all_models_names = list(models_main_name.keys())

# One LaTeX table row: label, binary kappa, graded kappa (three decimals each).
row_template = '{} & {:.3f} & {:.3f} \\\\ \\midrule'

for model_name in all_models_names:
    path_gpt = 'outputs/{}.txt'.format(model_name)
    kappa_binary, kappa_graded = report_kappa(path_gpt, path_nist)
    print(row_template.format(models_main_name[model_name],
                              kappa_binary,
                              kappa_graded))



In [None]:
# Judgment file every model output below is compared against.
path_nist = 'pools/human_qrels_tab'

# Output-file stem (under outputs/) -> row label printed in the table.
models_main_name = {
    'gpt3.5-one-shot-pool-V2': 'one-shot',
    'gpt3.5-one-shot-pool-V2-temp0': 'one-shot (tmp=0)',
    'gpt3.5-two-shot-pool': 'two-shot',
    'gpt3.5-two-shot-pool-V2-temp0': 'two-shot (tmp=0)',
    'gpt3.5-zero-shot-pool': 'zero-shot',
    'gpt3.5-zero-shot-paul-pool-temp0-': 'zero-shot (tmp=0)',
    'Llama-3-FT-pool': 'Llama-3 FT',
    'Llama-3-inst-FT-pool': 'Llama-3-inst FT',
    'Llama-3-zero-pool': 'Llama-3 zero-shot',
    'Llama-3-inst-zero-pool': 'Llama-3-inst zero-shot',
}

all_models_names = list(models_main_name.keys())

# One LaTeX table row: label, binary kappa, graded kappa (three decimals each).
row_template = '{} & {:.3f} & {:.3f} \\\\ \\midrule'

# Same report as the full-pool one, restricted to the test subset.
for model_name in all_models_names:
    path_gpt = 'outputs/{}.txt'.format(model_name)
    kappa_binary, kappa_graded = report_kappa_on_subset(path_gpt, path_nist)
    print(row_template.format(models_main_name[model_name],
                              kappa_binary,
                              kappa_graded))

