In [37]:
import json
import pandas as pd

def jprint(obj):
    """Pretty-print ``obj`` to stdout as JSON with a two-space indent."""
    rendered = json.dumps(obj, indent=2)
    print(rendered)


In [38]:
# Registry of prediction runs to compare. Each entry describes one JSONL file:
#   "file"      - path to the predictions file (opened by load_predictions)
#   "model"     - label copied into the `model` column of the loaded frame
#   "retriever" - label copied into the `retriever` column
#   "top-k"     - value copied into the `top-k` column
#                 (presumably the number of retrieved documents - TODO confirm)
# The three labels are later used together as the grouping key for the score table.
records = [
    {
        "file": "../outputs/predictions-ragent-Qwen2.5-1.5B-Instruct-musique-grpo.jsonl",
        "model": "Qwen2.5-1.5B-ragent-grpo-musique",
        "retriever": "bm25",
        "top-k": 2,
    },
    {
        "file": "../outputs/musique-predictions-ragent-qwen-2.5-1.5B-Instruct.jsonl",
        "model": "Qwen2.5-1.5B-Instruct",
        "retriever": "hybrid",
        "top-k": 2,
    },
    {
        "file": "../outputs/ragent/meta-llama/Llama-3.1-8B-Instruct/predictions-musique-mini-hybrid.jsonl",
        "model": "Llama-3.1-8B-Instruct",
        "retriever": "hybrid",
        "top-k": 2,
    },
    {
        "file": "../outputs/ragent/Qwen/Qwen2.5-7B-Instruct/predictions-musique-mini-hybrid.jsonl",
        "model": "Qwen2.5-7B-Instruct",
        "retriever": "hybrid",
        "top-k": 2,
    },
    {
        "file": "../outputs/ragent/bdsaglam/Qwen2.5-1.5B-Instruct-ragent-musique/predictions-musique-mini-hybrid-2.jsonl",
        "model": "Qwen2.5-1.5B-ragent-grpo-musique",
        "retriever": "hybrid",
        "top-k": 2,
    },
    {
        "file": "../outputs/ragent/bdsaglam/Qwen2.5-1.5B-Instruct-ragent-musique/predictions-musique-mini-semantic-1.jsonl",
        "model": "Qwen2.5-1.5B-ragent-grpo-musique",
        "retriever": "semantic",
        "top-k": 1,
    },
    {
        "file": "../outputs/ragent/bdsaglam/Qwen2.5-1.5B-Instruct-ragent-musique/predictions-musique-mini-semantic-2.jsonl",
        "model": "Qwen2.5-1.5B-ragent-grpo-musique",
        "retriever": "semantic",
        "top-k": 2,
    },
]

In [39]:
def load_predictions(record):
    """Load one JSONL predictions file into a DataFrame tagged with run metadata.

    Args:
        record: Mapping with the keys ``'file'`` (path to a JSONL file holding
            one JSON object per line), ``'model'``, ``'retriever'`` and
            ``'top-k'``.

    Returns:
        A ``pd.DataFrame`` with one row per non-blank line of the file, plus
        constant ``model``, ``retriever`` and ``top-k`` columns copied from
        ``record``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If ``record`` lacks one of the expected keys.
    """
    # Decode as UTF-8 explicitly so non-ASCII text is read the same way
    # regardless of the platform's default locale encoding.
    with open(record['file'], encoding='utf-8') as f:
        # Blank lines (e.g. an extra trailing newline) are not JSON documents;
        # skip them instead of letting json.loads raise.
        rows = [json.loads(line) for line in f if line.strip()]
    df = pd.DataFrame(rows)
    # Broadcast the run metadata onto every row so frames from different runs
    # can be concatenated and grouped later.
    for key in ('model', 'retriever', 'top-k'):
        df[key] = record[key]
    return df


def load_all_predictions(records):
    """Load every run described in ``records`` and stack them into one frame.

    Each element of ``records`` is passed to ``load_predictions``; the
    resulting frames are concatenated in the same order. Row indices of the
    individual frames are kept as-is by the concatenation.
    """
    frames = list(map(load_predictions, records))
    return pd.concat(frames)

In [40]:
from verifiers.rubrics.utils import get_last_answer
from verifiers.metrics.musique import exact_match, f1

def preprocess(df):
    """Add derived and scoring columns to ``df`` and return it.

    The frame is modified in place; the same object is returned. Columns added:

    * ``n_hops``: ``len`` of each ``supporting_titles`` value.
    * ``predicted_answer``: ``get_last_answer`` applied to each ``trajectory``.
    * ``exact_match`` / ``f1``: the respective metric called per row with
      ``(predicted_answer, answers)``.
    """
    def score_rows(metric):
        # Row-wise evaluation of one metric against the reference answers.
        return df.apply(
            lambda row: metric(row['predicted_answer'], row['answers']),
            axis=1,
        )

    df['n_hops'] = df['supporting_titles'].apply(len)
    df['predicted_answer'] = df['trajectory'].apply(get_last_answer)
    for column, metric in (('exact_match', exact_match), ('f1', f1)):
        df[column] = score_rows(metric)
    return df

In [45]:
# Load every registered run, then derive the answer and score columns.
df = load_all_predictions(records).pipe(preprocess)
df.head(3)

Unnamed: 0,answer,n_hops,prompt,docs,answers,supporting_titles,trajectory,model,retriever,top-k,id,predicted_answer,exact_match,f1
0,Vito Corleone,2,[{'content': 'Answer the question based on the...,"[{'id': 0, 'is_supporting': False, 'text': '# ...","[Vito Corleone, Vito Andolini, Vito Andolini C...","[The Good Shepherd (film), The Godfather Part II]",[{'content': 'Answer the question based on the...,Qwen2.5-1.5B-ragent-grpo-musique,bm25,2,,Robert De Niro,0,0.0
1,Rohana Wijeweera,2,[{'content': 'Answer the question based on the...,"[{'id': 0, 'is_supporting': False, 'text': '# ...",[Rohana Wijeweera],"[Rohana Wijeweera, Dimuthu Bandara Abayakoon]",[{'content': 'Answer the question based on the...,Qwen2.5-1.5B-ragent-grpo-musique,bm25,2,,Janatha Vimukthi Peramuna,0,0.0
2,406,2,[{'content': 'Answer the question based on the...,"[{'id': 0, 'is_supporting': False, 'text': '# ...",[406],"[Rössen culture, Galicia (Spain)]",[{'content': 'Answer the question based on the...,Qwen2.5-1.5B-ragent-grpo-musique,bm25,2,,1932,0,0.0


In [42]:
# Per-run sample count and mean of each metric, keyed by (model, retriever, top-k).
scores_df = (
    df.groupby(['model', 'retriever', 'top-k'])[['exact_match', 'f1']]
    .agg(['count', 'mean'])
)
# Flatten the two-level (metric, statistic) header into names like "f1_mean".
scores_df.columns = [metric + '_' + stat for metric, stat in scores_df.columns]
# Best F1 first.
scores_df = scores_df.sort_values(by='f1_mean', ascending=False)
scores_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,exact_match_count,exact_match_mean,f1_count,f1_mean
model,retriever,top-k,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Qwen2.5-7B-Instruct,hybrid,2,300,0.19,300,0.262008
Llama-3.1-8B-Instruct,hybrid,2,300,0.213333,300,0.250381
Qwen2.5-1.5B-ragent-grpo-musique,hybrid,2,300,0.13,300,0.18499
Qwen2.5-1.5B-ragent-grpo-musique,semantic,2,300,0.113333,300,0.183898
Qwen2.5-1.5B-ragent-grpo-musique,semantic,1,300,0.09,300,0.163452
Qwen2.5-1.5B-ragent-grpo-musique,bm25,2,300,0.086667,300,0.138532
Qwen2.5-1.5B-Instruct,hybrid,2,300,0.003333,300,0.004667


## Inspect

In [43]:
import textwrap
from ipywidgets import widgets
from IPython.display import display
from ipywidgets import HBox
from tabulate import tabulate

def fixedwidth(text, width=120):
    """Wrap text to a fixed width while preserving its existing line breaks.

    ``text`` is converted with ``str`` and split into lines; each line is
    wrapped on its own. Wrapping the whole string in one call with
    ``replace_whitespace=False`` would count embedded newlines as ordinary
    characters, so output lines would break at the wrong places.

    Args:
        text: Any object; its ``str()`` form is wrapped.
        width: Maximum length of each output line (default 120).

    Returns:
        The wrapped text joined with ``"\\n"``. Blank lines in the input are
        kept as blank lines; an empty input yields an empty string.
    """
    wrapped = []
    for line in str(text).splitlines():
        # textwrap.wrap returns [] for an empty/whitespace-only line; keep a
        # blank line so paragraph spacing survives.
        wrapped.extend(textwrap.wrap(line, width=width, replace_whitespace=False) or [""])
    return "\n".join(wrapped)

def format_messages(messages):
    """Render chat messages as text, one ``role: content`` entry per message.

    Each message is indexed by ``'role'`` and ``'content'``; entries are
    joined with newlines. An empty sequence yields an empty string.
    """
    rendered = []
    for message in messages:
        rendered.append(f"{message['role']}: {message['content']}")
    return "\n".join(rendered)

def format_row(row):
    """Build a grid-style text table summarising one prediction row.

    The table lists the F1 score (two decimals), the reference answers, the
    predicted answer, the last prompt message, and the trajectory messages
    that come after the first ``len(row['prompt'])`` entries.
    """
    prompt = row['prompt']
    # Skip as many leading trajectory messages as the prompt has
    # (presumably the trajectory starts with the prompt - TODO confirm).
    skip = len(prompt)
    cells = {
        "F1": f"{row['f1']:.2f}",
        "Answers": row['answers'],
        "Predicted Answer": row['predicted_answer'],
        "Prompt": fixedwidth(format_messages(prompt[-1:])),
        "Trajectory": fixedwidth(format_messages(row['trajectory'][skip:])),
    }
    data = [[label, value] for label, value in cells.items()]
    return tabulate(data, tablefmt='grid')

def present_row(row):
    """Print the table produced by ``format_row`` for ``row``."""
    table = format_row(row)
    print(table)

def create_browse_app(df):
    """Create an interactive widget to browse through the rows of ``df``.

    Displays a Previous button, a numeric index box and a Next button, plus an
    output area showing the row at the current positional index (``df.iloc``).

    The index box is bounded to ``[0, len(df) - 1]`` so that a typed value can
    never fall outside the frame: an unbounded box would raise ``IndexError``
    for large values and silently wrap around for negative ones.

    Args:
        df: DataFrame whose rows are accepted by ``present_row``.

    Returns:
        None. The widgets are shown via ``display``. If ``df`` is empty, a
        short message is printed instead and no widgets are created.
    """
    n_rows = len(df)
    if n_rows == 0:
        # A bounded index cannot represent an empty range, and there is
        # nothing to browse anyway.
        print("No rows to browse.")
        return

    last = n_rows - 1

    def browse_data(i=0):
        row = df.iloc[i]
        present_row(row)

    index = widgets.BoundedIntText(value=0, min=0, max=last, description='Index:')
    left_button = widgets.Button(description='Previous')
    right_button = widgets.Button(description='Next')

    def on_left_button_clicked(b):
        if index.value > 0:
            index.value -= 1

    def on_right_button_clicked(b):
        if index.value < last:
            index.value += 1

    left_button.on_click(on_left_button_clicked)
    right_button.on_click(on_right_button_clicked)

    ui = HBox([left_button, index, right_button])
    # Re-render the row whenever the index value changes.
    out = widgets.interactive_output(browse_data, {'i': index})

    display(ui, out)

In [44]:
# Restrict browsing to rows whose n_hops equals 2.
mask = df['n_hops'].eq(2)
sdf = df.loc[mask]
create_browse_app(sdf)

HBox(children=(Button(description='Previous', style=ButtonStyle()), IntText(value=0, description='Index:'), Bu…

Output()