In [1]:
from glob import glob

import krippendorff
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.stats import kendalltau
from sklearn.metrics import cohen_kappa_score, classification_report, ConfusionMatrixDisplay
import farrow_and_ball as fb

In [2]:
# plt set default parameters
plt.rcParams.update(plt.rcParamsDefault)
#set plt log level to info
plt.set_loglevel('INFO')


def plot_confusion_matrix(y_true, y_pred, labels, cmap=None, colorbar=False):
    if cmap is None:
        cmap = fb.build_colormap(fb.BaseColorPalette.PINKS, continuous=True, reverse=False)
    fig, ax = plt.subplots(figsize=(2, 2), dpi=300)
    disp = ConfusionMatrixDisplay.from_predictions(y_true, y_pred, labels=labels, cmap=cmap, ax=ax, colorbar=colorbar)
    return disp.plot(ax=ax, cmap=cmap, colorbar=colorbar)


def judgements_evaluation_report(eval_df, true_col='true', predicted_col='predicted'):
    print(classification_report(eval_df[true_col], eval_df[predicted_col], labels=[0, 1, 2, 3]))
    _k = krippendorff.alpha(eval_df[[true_col, predicted_col]].T, level_of_measurement='ordinal',
                            value_domain=[0, 1, 2, 3])
    _kappa = cohen_kappa_score(eval_df[true_col], eval_df[predicted_col])
    _pearson = np.corrcoef(eval_df[true_col], eval_df[predicted_col])[0, 1]
    _kendall = kendalltau(eval_df[true_col], eval_df[predicted_col]).correlation
    _mae = np.abs(eval_df[true_col] - eval_df[predicted_col]).mean()
    return _k, _kappa, _pearson, _kendall, _mae


def eval_predictions(predictions_df, model_name, true_col='true', predicted_col='predicted', labels=(0, 1, 2, 3)):
    _k, _kappa, _pearson, _kendall, _mae = judgements_evaluation_report(predictions_df, true_col=true_col,
                                                                        predicted_col=predicted_col)
    print(f"Krippendorff's alpha for {model_name} on the validation set: {_k:.3f}")
    print(f"Cohen's kappa for {model_name} on the validation set: {_kappa:.3f}")
    print(f"Pearson correlation for {model_name} on the validation set: {_pearson:.3f}")
    print(f"Kendall's tau for {model_name} on the validation set: {_kendall:.3f}")
    print(f"Mean Absolute Error (MAE) for {model_name} on the validation set: {_mae:.3f}")

    plot_confusion_matrix(predictions_df[true_col], predictions_df[predicted_col], labels=labels)
    plt.title(model_name, fontsize='small')
    plt.savefig(f'plots/{model_name}_cm.pdf', bbox_inches='tight', dpi=300)
    plt.show()

In [3]:
# load validation df
val_qrel_df = pd.read_table('val_set_qrel.tsv').drop(columns=['Q0'])
val_qrel_df

Unnamed: 0,qid,docid,relevance
0,q26,p1668,0
1,q26,p3564,0
2,q26,p148,0
3,q26,p10159,1
4,q26,p4522,0
...,...,...,...
1726,q48,p11513,0
1727,q48,p9676,0
1728,q48,p688,0
1729,q48,p8574,3


In [4]:
def read_predictions_df(file_path, val_qrel_df, rsplit=False) -> pd.DataFrame:
    raw_output_df = pd.concat([pd.read_table(f) for f in glob(file_path)])
    if rsplit:
        _temp_df = raw_output_df['prediction'].str.rsplit(',', n=1, expand=True)
        raw_output_df = raw_output_df.assign(
            predicted=_temp_df[1].str.split(':', n=1, expand=True)[1].str.strip().str.strip(
                '"}').astype(int),
            explanation=_temp_df[0].str.split(':', n=1, expand=True)[1].str.strip().str.strip(
                '"}'))
    else:
        _temp_df = raw_output_df['prediction'].str.split(',', n=1, expand=True)
        raw_output_df = raw_output_df.assign(
            predicted=_temp_df[0].str.split(':', n=1, expand=True)[1].str.strip().astype(int),
            explanation=_temp_df[1].str.split(':', n=1, expand=True)[1].str.strip().str.strip(
                '"}'))
    raw_output_df = raw_output_df.join(val_qrel_df.set_index(['qid', 'docid']), on=['qid', 'docid'])
    return raw_output_df.drop(columns=['prediction'])

In [5]:
file_path = 'llm_raw_output/raw_output_rel_p-mult-1_Meta-Llama-3-70B-Instruct*'
raw_output_df = pd.concat([pd.read_table(f) for f in glob(file_path)])
for x in raw_output_df.iloc[0]:
    print(x)

q48
['p10319', 'p4495', 'p4314']
Here is my evaluation:
     The order of the passages is: doc-2, doc-1, doc-3.
     The reason for this order is that doc-2 directly answers the query, providing evidence that older adults can gain strength by training once per week. Doc-1 is also related to the query, but it doesn't directly answer it, instead comparing the effects of different training frequencies. Doc-3 is less relevant, as it discusses the effectiveness of different training frequencies, but doesn't specifically address the query.

     {"pid": "doc-2", "explanation": "Directly answers the query", "relevance": 3}
     {"pid": "doc-1", "explanation": "Related to the query, but doesn't directly answer it", "relevance": 2}
     {"pid": "doc-3", "explanation": "Less relevant, discusses training frequencies in general", "relevance": 1}


In [13]:
_temp_df = raw_output_df['prediction'].str.split('{').str[1:].apply(''.join)
_record_d = []
failed = []
for (qid, docid), text in zip(raw_output_df[['qid', 'docid']].itertuples(index=False), _temp_df):
    _records = text.split('\n')
    docid = [i.strip() for i in docid[1:-1].replace("'", "").split(',')]
    map_docid = {f'doc-{i}': d for i, d in enumerate(docid, 1)}
    for _record in _records:
        if not _record:
            continue
        try:
            _, pid, explanation, relevance = _record.split(':')
            pid = pid.rsplit(',')[0].strip().strip('\"')
            explanation = explanation.rsplit(',', maxsplit=1)[0].strip().strip('\"')
            relevance = relevance.strip().strip('}')
            _record_d.append(dict(qid=qid, docid=map_docid.get(pid), explanation=explanation, predicted=relevance))
        except Exception as e:
            # print(e)
            failed.append(_record)
            continue

parsed_df = pd.DataFrame(_record_d)
parsed_df

Unnamed: 0,qid,docid,explanation,predicted
0,q48,p4495,Directly answers the query,3
1,q48,p10319,"Related to the query, but doesn't directly ans...",2
2,q48,p4314,"Less relevant, discusses training frequencies ...",1
3,q48,p10234,Directly answers the query,3
4,q48,p7954,Related to muscle gain,2
...,...,...,...,...
1686,q48,p3541,"Discusses strength training for older adults, ...",2
1687,q48,p9676,"Discusses workout strategies in general, witho...",0
1688,q48,p6475,Directly answers the query,3
1689,q48,p688,Provides a general guideline for strength trai...,2


In [14]:
parsed_df = parsed_df.join(val_qrel_df.set_index(['qid', 'docid']), on=['qid', 'docid'])
parsed_df['predicted'] = parsed_df['predicted'].replace('', '0').astype(int)
parsed_df

Unnamed: 0,qid,docid,explanation,predicted,relevance
0,q48,p4495,Directly answers the query,3,3
1,q48,p10319,"Related to the query, but doesn't directly ans...",2,3
2,q48,p4314,"Less relevant, discusses training frequencies ...",1,0
3,q48,p10234,Directly answers the query,3,0
4,q48,p7954,Related to muscle gain,2,0
...,...,...,...,...,...
1686,q48,p3541,"Discusses strength training for older adults, ...",2,0
1687,q48,p9676,"Discusses workout strategies in general, witho...",0,0
1688,q48,p6475,Directly answers the query,3,3
1689,q48,p688,Provides a general guideline for strength trai...,2,0


In [15]:
for i in failed[:5]:
    print(i)

print(_record.split(':'))
# list(map(lambda s: s.split(':')[1].strip().strip('\"').strip('}'), _record.split(',')))

     "pid": "doc-3", "explanation": "List of colleges and universities in the US", "re
"pid": "doc-3", "explanation": "Explains a scenario where flying low can be useful, i.e., to avoid radar coverage in mountainous areas", "re
    "pid": "doc-3", "explanation": " passage explains why helicopter pilots fly low, mentioning the need for awareness of the terrain and obstacles", "relevance
     "pid": "doc-3", "explanation": "The passage mentions the altitude at which the plane was flying, which is somewhat
     "pid": "doc-3", "explanation": "Discusses drones, underwater vehicles, and other unmanned systems, which is not
['     "pid"', ' "doc-2", "explanation"', ' "Focuses on a specific exercise", "relevance"', ' 1}']


In [16]:
parsed_df.loc[parsed_df['predicted'] == '']

Unnamed: 0,qid,docid,explanation,predicted,relevance


In [18]:
eval_predictions(parsed_df, true_col='relevance', predicted_col='predicted')

TypeError: eval_predictions() missing 1 required positional argument: 'model_name'

In [None]:
'q27' 'p9565' 'p9333'