In [1]:
import random
import textwrap
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from ipywidgets import HBox, widgets


In [2]:
import tiktoken

# Tokenizer that tiktoken associates with the "gpt-4o" model; shared by count_tokens.
encoder = tiktoken.encoding_for_model("gpt-4o")


def count_tokens(text: str) -> int:
    """Return the number of tokens ``encoder`` produces for ``text``."""
    token_ids = encoder.encode(text)
    return len(token_ids)

In [None]:
# Load the results file; with lines=True every line is parsed as one JSON record.
results_path = 'results/results-hated-fore.jsonl'
df = pd.read_json(results_path, lines=True)
print(df.shape[0])
df.head()

In [None]:
# Peek at the first hop of the first record to see how a hop is structured.
row = df.iloc[0]
hops = row['hops']
hops[0]

In [5]:
def _first_hop_generation(hops):
    """Return the generated text stored under the first hop's QA result."""
    return hops[0]['qa_result']['generation']


# Derived columns: the first hop's generation and its length in tokens.
df['generation'] = df['hops'].apply(_first_hop_generation)
df['gen_token_count'] = df['generation'].apply(count_tokens)

In [None]:
# A row counts as a success when its F1 is strictly greater than 0.5.
success_mask = df['f1'].gt(0.5)
success_df = df[success_mask]
fail_df = df[~success_mask]

for label, count in (
    ("Fail:", len(fail_df)),
    ("Success:", len(success_df)),
    ("Total:", len(df)),
):
    print(label, count)

mean_f1 = df['f1'].mean()
print("Mean F1:", f"{mean_f1:.3f}")

In [None]:
# Flag generations containing the literal marker 'answer:' (compared in lower case).
lowered_generations = df['generation'].str.lower()
answered_mask = lowered_generations.str.contains('answer:')

# Count and fraction of rows without the marker.
n_unanswered = len(df.loc[~answered_mask])
n_unanswered, f"{n_unanswered / len(df):.3f}"

In [None]:
# Histogram of generation lengths, measured in tokens
token_counts = df['gen_token_count']
ax = token_counts.hist(bins=50)

# Summary statistics to mark on the plot
min_val = token_counts.min()
median_val = token_counts.median()
max_val = token_counts.max()

# One dashed vertical line per statistic: (x position, colour, legend label)
markers = [
    (min_val, 'r', f'Min: {min_val}'),
    (median_val, 'g', f'Median: {median_val}'),
    (max_val, 'b', f'Max: {max_val}'),
]
for position, color, label in markers:
    plt.axvline(position, color=color, linestyle='dashed', linewidth=1, label=label)

# Legend built from the line labels, then render the figure
plt.legend()
plt.show()

In [9]:
def fixedwidth(text, width=80):
    """Wrap ``text`` to at most ``width`` columns while keeping its line breaks.

    Every existing line of ``text`` is wrapped on its own. Passing the whole
    string to ``textwrap.wrap`` with ``replace_whitespace=False`` would treat
    embedded newlines as ordinary characters, so the text following a newline
    would be broken too early and come out ragged.

    Args:
        text: The string to wrap; may contain newlines.
        width: Maximum length of each output line. Defaults to 80.

    Returns:
        The wrapped lines joined with newlines. Blank lines in the input are
        preserved; an empty input yields an empty string.
    """
    wrapped = []
    for line in text.splitlines():
        # textwrap.wrap returns [] for an empty or whitespace-only line; keep
        # such a line as "" so paragraph breaks survive.
        wrapped.extend(
            textwrap.wrap(line, width=width, replace_whitespace=False) or [""]
        )
    return "\n".join(wrapped)

def format_row(row):
    """Render one result row as a plain-text report.

    The report contains, in order: an id / hop-count header, the wrapped
    context of the first hop, the question, the reference and predicted
    answers, the raw generation, the EM and F1 scores and, when available,
    a wrapped analysis section.

    Args:
        row: A mapping-like record (e.g. a DataFrame row) providing the keys
            ``id``, ``n_hops``, ``hops``, ``reference_answers``,
            ``predicted_answer``, ``exact_match`` and ``f1``. Only the first
            entry of ``hops`` is read (its ``context``, ``question`` and
            ``qa_result['generation']``). ``analysis`` is optional.

    Returns:
        The report as a single newline-joined string.
    """
    first_hop = row['hops'][0]
    context = first_hop['context']
    question = first_hop['question']
    generation = first_hop['qa_result']['generation']

    heavy_rule = "=" * 80
    light_rule = "-" * 80

    output = [
        f"{row['id']} - {row['n_hops']} hops",
        heavy_rule,
        fixedwidth(context),
        heavy_rule,
        "Q: " + question,
        "Reference Answers: " + str(row['reference_answers']),
        # Formatted rather than concatenated so a missing prediction
        # (None / NaN) is shown instead of raising TypeError.
        f"Predicted Answer: {row['predicted_answer']}",
        light_rule,
        "Generation",
        light_rule,
        generation,
        light_rule,
        "# Scores",
        "EM: {:.3f}".format(row['exact_match']),
        "F1: {:.3f}".format(row['f1']),
    ]

    # The analysis key may be present but hold a non-string placeholder such
    # as NaN; only text can be wrapped, so anything else is skipped.
    analysis = row['analysis'] if 'analysis' in row else None
    if isinstance(analysis, str):
        output.append("")
        output.append("# Analysis")
        output.append(fixedwidth(analysis))
    return "\n".join(output)

def present_row(row):
    """Print the text report that ``format_row`` builds for ``row``."""
    report = format_row(row)
    print(report)


def create_browse_app(df):
    """Display a small widget UI for paging through the rows of ``df``.

    The UI is an index box flanked by "Previous" and "Next" buttons. The
    report for the selected row (printed by ``present_row``) is rendered
    below it and refreshed whenever the index changes.

    The index box is bounded to ``0 .. len(df) - 1``, so a typed value cannot
    select a position outside the frame (which would raise IndexError) or a
    negative position (which would silently count from the end). For an
    empty frame a short notice is printed and no UI is built.

    Args:
        df: DataFrame whose rows ``present_row`` can render.
    """
    n_rows = len(df)
    if n_rows == 0:
        # Nothing to page through; df.iloc[0] would raise IndexError.
        print("No rows to browse.")
        return

    last_index = n_rows - 1

    def browse_failed(i=0):
        # Positional lookup, so it works regardless of the frame's index labels.
        present_row(df.iloc[i])

    index = widgets.BoundedIntText(
        value=0, min=0, max=last_index, description='Index:'
    )
    left_button = widgets.Button(description='Previous')
    right_button = widgets.Button(description='Next')

    def on_left_button_clicked(b):
        if index.value > 0:
            index.value -= 1

    def on_right_button_clicked(b):
        if index.value < last_index:
            index.value += 1

    left_button.on_click(on_left_button_clicked)
    right_button.on_click(on_right_button_clicked)

    ui = HBox([left_button, index, right_button])
    # Re-runs browse_failed with the current index value on every change.
    out = widgets.interactive_output(browse_failed, {'i': index})

    display(ui, out)


In [None]:
# Browse the failed rows in ascending F1 order (lowest scores first).
inspect_df = fail_df.sort_values(by='f1')
create_browse_app(df=inspect_df)

In [None]:
# Browse the rows whose generation lacks the 'answer:' marker.
unanswered_df = df[~answered_mask]
create_browse_app(df=unanswered_df)

In [None]:
# Browse the rows that contain the 'answer:' marker but are not successes.
answered_but_failed = answered_mask & ~success_mask
answered_failed_df = df[answered_but_failed]
create_browse_app(df=answered_failed_df)