In [9]:
import random
import textwrap
from pathlib import Path

import pandas as pd
from ipywidgets import HBox, widgets

from bellem.musique.eval import compute_scores


In [2]:
# Read the results file as JSON Lines (one JSON record per line) into a DataFrame.
df = pd.read_json('deepseek-70b-best-results.jsonl', lines=True)
# Show how many records were loaded, then preview the first rows.
print(len(df))
df.head()

2417


Unnamed: 0,id,n_hops,answer,hops,predicted_answer,reference_answers,exact_match,f1,fuzzy_match
0,2hop__460946_294723,2,Miquette Giraudy,[{'question': 'Who is the spouse of the Green ...,Miquette Giraudy,[Miquette Giraudy],1,1.0,1
1,2hop__252311_366220,2,Mike Medavoy,[{'question': 'Who founded the company that di...,Mike Medavoy,[Mike Medavoy],1,1.0,1
2,2hop__701895_752697,2,Municipality of Nuevo Laredo,[{'question': 'What administrative territorial...,Municipality of Nuevo Laredo,[Tamaulipas],0,0.0,0
3,2hop__259228_793698,2,"Cologne, Germany",[{'question': 'Where is Ulrich Walter's employ...,"Cologne, Germany",[Cologne],0,0.666667,0
4,2hop__481349_302087,2,Bombardier Inc.,[{'question': 'Which company owns the manufact...,Bombardier Inc.,"[Bombardier Inc., Bombardier]",1,1.0,1


In [3]:
# Copy the generation text stored under the first entry of `hops`
# (hops[0]['qa_result']['generation']) into its own column.
df['generation'] = df['hops'].apply(lambda x: x[0]['qa_result']['generation'])

In [20]:
def evaluate_row(row):
    """Score a single result row.

    Passes the row's ``predicted_answer`` and ``reference_answers`` entries to
    ``compute_scores`` and returns whatever that call returns.
    """
    return compute_scores(row['predicted_answer'], row['reference_answers'])
    
def report_performance(dataf):
    """Re-score every row and summarise the scores per hop count.

    NOTE: the ``exact_match`` and ``f1`` columns of ``dataf`` are overwritten
    in place with the freshly computed values.

    Returns a frame indexed by ``n_hops`` holding the mean and the count of
    ``exact_match`` and ``f1``.
    """
    metrics = ['exact_match', 'f1']
    row_scores = dataf.apply(evaluate_row, axis=1)
    for metric in metrics:
        # One value per row, in row order, taken from that row's score record.
        dataf[metric] = [scores[metric] for scores in row_scores]
    per_hop = dataf.groupby('n_hops')[metrics]
    return per_hop.agg(['mean', 'count'])

In [21]:
# Score the stored predictions; this also overwrites df's exact_match/f1 columns in place.
report_performance(df)

Unnamed: 0_level_0,exact_match,exact_match,f1,f1
Unnamed: 0_level_1,mean,count,mean,count
n_hops,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2,0.599042,1252,0.750479,1252
3,0.555263,760,0.692036,760
4,0.503704,405,0.597067,405


In [22]:
def parse_answer(generation):
    """Extract the final answer string from a raw model generation.

    If the text contains the marker ``'Answer:'``, everything after the LAST
    occurrence is returned (stripped). Using the last occurrence means a
    marker mentioned earlier in the text (e.g. inside intermediate reasoning)
    does not cut off or replace the final answer.

    Without a marker, the last line of the text is returned (stripped).
    An empty generation yields ``''`` instead of raising ``IndexError``.

    Args:
        generation: The full generated text (str).

    Returns:
        The parsed answer as a stripped string.
    """
    _, marker, tail = generation.rpartition('Answer:')
    if marker:
        return tail.strip()

    lines = generation.splitlines()
    # ''.splitlines() is [], so guard before indexing the last line.
    return lines[-1].strip() if lines else ''

In [23]:
# Work on a copy so the re-parsed predictions below do not overwrite those in df.
df_new = df.copy()

In [24]:
# Re-extract the answer from each raw generation, then score the re-parsed answers.
df_new['predicted_answer'] = df_new['generation'].apply(parse_answer)
report_performance(df_new)

Unnamed: 0_level_0,exact_match,exact_match,f1,f1
Unnamed: 0_level_1,mean,count,mean,count
n_hops,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2,0.601438,1252,0.753711,1252
3,0.555263,760,0.692475,760
4,0.503704,405,0.597067,405


## Inspect

In [16]:
def fixedwidth(text, width=80):
    """Wrap ``text`` to at most ``width`` columns, keeping its own line breaks.

    Each existing line is wrapped separately. Wrapping the whole text in one
    call with ``replace_whitespace=False`` lets an embedded newline sit in the
    middle of a "line" as far as ``textwrap`` is concerned, so the text before
    the newline is counted towards the width of the text after it and lines
    get broken too early.

    Args:
        text: The text to wrap; may contain newlines.
        width: Maximum line length (default 80, the previous fixed value).

    Returns:
        The wrapped text as a single newline-joined string.
    """
    return "\n".join(
        textwrap.fill(line, width=width, replace_whitespace=False)
        for line in text.splitlines()
    )

def format_row(row):
    """Build a plain-text report for one result row.

    The report contains the row id and hop count, the wrapped context and the
    question of the first hop, the reference and predicted answers, the raw
    generation of the first hop, and the EM/F1 scores. If the row has an
    ``analysis`` entry, it is appended (wrapped) as a final section.

    Returns the report as one newline-joined string.
    """
    first_hop = row['hops'][0]
    context = first_hop['context']
    question = first_hop['question']
    generation = first_hop['qa_result']['generation']

    heavy_rule = "=" * 80
    light_rule = "-" * 80

    lines = [
        f"{row['id']} - {row['n_hops']} hops",
        heavy_rule,
        fixedwidth(context),
        heavy_rule,
        "Q: " + question,
        "Reference Answers: " + str(row['reference_answers']),
        "Predicted Answer: " + row['predicted_answer'],
        light_rule,
        "Generation",
        light_rule,
        generation,
        light_rule,
        "# Scores",
        f"EM: {row['exact_match']:.3f}",
        f"F1: {row['f1']:.3f}",
    ]

    if 'analysis' in row:
        lines += ["", "# Analysis", fixedwidth(row['analysis'])]

    return "\n".join(lines)

def present_row(row):
    """Print the text report that ``format_row`` builds for ``row``."""
    report = format_row(row)
    print(report)


def create_browse_app(df):
    """Display a small widget UI for paging through the rows of ``df``.

    The UI is an integer index box flanked by Previous/Next buttons. Whenever
    the index changes, the row at that position (``df.iloc[i]``) is rendered
    with ``present_row`` into an output area below the controls.

    Args:
        df: The frame whose rows are browsed by position.
    """
    def browse_failed(i=0):
        # The buttons clamp the index, but a value typed into the box is not
        # clamped. Reject it here so an out-of-range value shows a message
        # instead of an IndexError, and a negative value does not silently
        # wrap around to the end of the frame.
        if not 0 <= i < len(df):
            if len(df) == 0:
                print("No rows to display.")
            else:
                print(f"Index {i} is out of range (valid: 0 to {len(df) - 1}).")
            return
        present_row(df.iloc[i])

    index = widgets.IntText(value=0, description='Index:')
    left_button = widgets.Button(description='Previous')
    right_button = widgets.Button(description='Next')

    def on_left_button_clicked(b):
        # Stop at the first row.
        if index.value > 0:
            index.value -= 1

    def on_right_button_clicked(b):
        # Stop at the last row.
        if index.value < len(df) - 1:
            index.value += 1

    left_button.on_click(on_left_button_clicked)
    right_button.on_click(on_right_button_clicked)

    ui = HBox([left_button, index, right_button])
    # Re-run browse_failed with the box's current value whenever it changes.
    out = widgets.interactive_output(browse_failed, {'i': index})

    display(ui, out)


In [25]:
# True where the lower-cased generation contains 'answer:' (i.e. a case-insensitive match).
answered_mask = df['generation'].str.lower().str.contains('answer:')
# Number of generations without the marker, and their share of all rows.
len(df.loc[~answered_mask]), f"{len(df.loc[~answered_mask]) / len(df):.3f}"

(17, '0.007')

In [26]:
# Browse only the rows whose generation does not contain the 'answer:' marker.
unanswered_df = df.loc[~answered_mask]
create_browse_app(unanswered_df)

HBox(children=(Button(description='Previous', style=ButtonStyle()), IntText(value=0, description='Index:'), Bu…

Output()

In [29]:
# Look up a single example by its id in the re-parsed frame.
df_new[df_new['id']=='2hop__215896_460425']

Unnamed: 0,id,n_hops,answer,hops,predicted_answer,reference_answers,exact_match,f1,fuzzy_match,generation
690,2hop__215896_460425,2,,[{'question': 'What is the location of formati...,"Fort Lee, New Jersey",[Fort Lee],0,0.666667,0,"<think>\nOkay, I need to figure out where the ..."


In [30]:
# Browse every row of the loaded results.
create_browse_app(df)

HBox(children=(Button(description='Previous', style=ButtonStyle()), IntText(value=0, description='Index:'), Bu…

Output()