In [None]:
import json
import logging
from copy import deepcopy
from datetime import datetime
from functools import partial
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv
from tqdm.auto import tqdm
from IPython.display import clear_output

from bellek.utils import set_seed

# Load environment variables from a local .env file, if one exists (python-dotenv).
load_dotenv()

# Register tqdm's progress-aware `progress_apply` helpers on pandas objects.
tqdm.pandas()
# Render floats with thousands separators and three decimal places.
pd.options.display.float_format = "{:,.3f}".format

# Fix the random seed; presumably for reproducibility (see bellek.utils.set_seed — TODO confirm what it seeds).
set_seed(89)

In [None]:
import textwrap

def format_paragraph(paragraph):
    """Render a paragraph record as ``Paragraph <idx> - <paragraph_text>``.

    Args:
        paragraph: Mapping that provides the ``'idx'`` and ``'paragraph_text'`` keys.

    Returns:
        The single-line, human-readable representation of the paragraph.
    """
    index = paragraph['idx']
    text = paragraph['paragraph_text']
    return "Paragraph {} - {}".format(index, text)

def _print_decomposition(steps):
    """Print each sub-question (one tab) followed by its answer (two tabs)."""
    for step in steps:
        sub_question = step['question']
        sub_answer = step['answer']
        print(f"\t{sub_question}")
        print(f"\t\t{sub_answer}")


def _print_supporting_paragraphs(paragraphs, width):
    """Print every paragraph flagged ``is_supporting``, wrapped to ``width`` columns."""
    for paragraph in paragraphs:
        if not paragraph['is_supporting']:
            continue
        for line in textwrap.wrap(format_paragraph(paragraph), width=width):
            print(line)
        # Blank line separates consecutive paragraphs.
        print()


def present_row(row, width=120):
    """Print a side-by-side report of the reference and the prediction for one example.

    The report has three sections: the question decomposition, the answer and
    the supporting context. Each of the two-sided sections lists the reference
    first and the prediction second, under ``##`` sub-headings.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing ``id``,
            ``fuzzy_match``, ``question``, ``question_decomposition``,
            ``question_decomposition_original``, ``answers``,
            ``predicted_answer``, ``paragraphs`` and ``paragraphs_original``.
        width: Maximum line width used when wrapping paragraph text.

    Returns:
        None. The report is written to standard output.
    """
    outcome = "Success" if row['fuzzy_match'] else "Fail"
    print(row['id'])
    print(f"Outcome: {outcome}")
    print()

    print("# Question decomposition")
    print("## Reference")
    print(row['question'])
    _print_decomposition(row['question_decomposition_original'])
    print()
    print("## Prediction")
    _print_decomposition(row['question_decomposition'])
    print()

    print("# Answer")
    print(f"Reference: {row['answers']}")
    print(f"Prediction: {row['predicted_answer']}")
    print()

    # Sub-headings use `##` so the hierarchy matches the decomposition section above.
    print("# Context")
    print("## Reference")
    _print_supporting_paragraphs(row['paragraphs_original'], width)

    print("## Prediction")
    _print_supporting_paragraphs(row['paragraphs'], width)

In [None]:
def partition_by_success(dataf):
    """Split a results frame into successful and failed rows and print a summary.

    Two lines are printed: the success/fail counts, then the success/fail
    fractions of the whole frame formatted with two decimals. For an empty
    frame both fractions are reported as 0.00 instead of raising
    ``ZeroDivisionError``.

    Args:
        dataf: DataFrame with a boolean ``fuzzy_match`` column.

    Returns:
        Tuple ``(success_dataf, fail_dataf)`` holding the rows where
        ``fuzzy_match`` is true and false, respectively.
    """
    is_success = dataf['fuzzy_match']
    success_dataf = dataf.loc[is_success]
    fail_dataf = dataf.loc[~is_success]

    total = len(dataf)
    # Guard the ratio computation so an empty frame does not divide by zero.
    success_ratio = len(success_dataf) / total if total else 0.0
    fail_ratio = len(fail_dataf) / total if total else 0.0

    print(len(success_dataf), len(fail_dataf))
    print(f"{success_ratio:.2f}", f"{fail_ratio:.2f}")
    return success_dataf, fail_dataf

In [None]:
# Results file to inspect (JSON Lines); presumably the predictions of the evaluated method.
res_df = pd.read_json('our-method-kgqa-results-20240908-134424.jsonl', lines=True)
# Evaluation dataset (JSON Lines) that supplies the reference paragraphs and decomposition.
original_df = pd.read_json("../../data/generated/musique-evaluation/dataset.jsonl", orient="records", lines=True)
# Join on `id`; columns coming from the dataset side that collide get the `_original` suffix.
df = res_df.merge(
    original_df[['id', 'paragraphs', 'question_decomposition']],
    on="id",
    suffixes=("", "_original"),
)
df.head()

In [None]:
# Split the merged results by `fuzzy_match`; the helper also prints the counts and ratios.
success_df, fail_df = partition_by_success(df)

In [None]:
# Preview one failed example; change `i` to inspect a different row of `fail_df`.
i = 0
row = fail_df.iloc[i]
present_row(row)

In [None]:
# Maps example id -> free-text failure reason typed in during manual annotation.
# NOTE: re-running this cell discards every reason collected so far.
failure_reasons = {}

In [None]:
# Manually annotate the failed examples in `id` order.
# `sort_values` returns a new frame, so the sorted frame itself must be iterated;
# sorting only inside `len(...)` would leave the visiting order unsorted.
ordered_fail_df = fail_df.sort_values('id')
for i in range(len(ordered_fail_df)):
    row = ordered_fail_df.iloc[i]
    # `row_id` rather than `id`, to avoid shadowing the builtin in the kernel namespace.
    row_id = row['id']
    # Skip examples annotated earlier so the cell can be re-run to resume.
    if row_id in failure_reasons:
        continue
    present_row(row)
    reason = input("Reason: ")
    # Typing "quit" (any casing) stops the session without recording a reason.
    if reason.lower() == 'quit':
        break
    failure_reasons[row_id] = reason
    # Clear the previous report only once the next one is ready to be drawn.
    clear_output(wait=True)

In [None]:
# Display the reasons collected so far.
failure_reasons

In [None]:
# Persist the collected annotations as pretty-printed JSON (two-space indent).
with open('kgqa-inspect-failure-reasons.json', mode='w') as f:
    f.write(json.dumps(failure_reasons, indent=2))