```
Non-answer critic:
0 = generated answer
1 = did not generate answer
```

In [1]:
from pathlib import Path
import json

import pandas as pd
import numpy as np

from statsmodels.stats.multitest import multipletests
from scipy.stats import permutation_test

## Utils

In [2]:
def compute_selectivity(y_true, y_pred):
    """
    Compute selectivity metric.

    Selectivity is defined as the fraction of correct system decision.
    Here the decision is to either answer a question (0) or to abstain (1).
    
    Parameters:
    - y_true: List[int]. List of length N, ground truth decisions per sample.
    - y_pred: List[List[int]]. List of lists, shape (N, r) where r is the number of repetitions.
    """
    scores = []
    for ground_truth, actions in zip(y_true, y_pred):
        n_actions = len(actions)
        n_correct = sum(1 for action in actions if action == ground_truth)
        scores.append(n_correct/n_actions)
    return scores


# Example usage:
y_true = [1, 0, 1]
y_pred = [
    [1, 1, 0],  # Sample 1: 3 generations
    [0, 0, 0],  # Sample 2: 3 generations
    [1, 0, 0],  # Sample 3: 3 generations
]

selectivity = compute_selectivity(y_true, y_pred)
assert selectivity == [2/3, 1, 1/3]
print(selectivity)
print(np.mean(selectivity))

[0.6666666666666666, 1.0, 0.3333333333333333]
0.6666666666666666


## Data Loading

### Queries

In [3]:
df_queries = pd.read_json('../data/queries/20250317-email.json')
df_queries['answerable'] = df_queries['sources'].apply(lambda x: len(x) > 0)
print(df_queries['answerable'].value_counts())
df_queries.head()

answerable
True     76
False    19
Name: count, dtype: int64


Unnamed: 0,id,category,intent,question,reference_answer,sources,answerable
0,faq-0000,01 - Admission Criteria,"Given my background, will I be accepted?",I want to inquire about the acceptance criteri...,Your bachelor degree needs to be in Data Scien...,[https://www.uni-marburg.de/en/studying/degree...,True
1,faq-0001,01 - Admission Criteria,"Given my background, will I be accepted?",I recently completed my undergraduate degree i...,Your bachelor degree needs to be in Data Scien...,[https://www.uni-marburg.de/en/studying/degree...,True
2,faq-0002,01 - Admission Criteria,"Given my background, will I be accepted?",My cgpa is 6.4 but while I pursuing my BCA I ...,Your bachelor degree needs to be in Data Scien...,[https://www.uni-marburg.de/en/studying/degree...,True
3,faq-0003,01 - Admission Criteria,"Given my background, will I be accepted?",I have completed my Bachelor's degree in Compu...,Your bachelor degree needs to be in Data Scien...,[https://www.uni-marburg.de/en/studying/degree...,True
4,faq-0004,01 - Admission Criteria,"Given my background, will I be accepted?",I would like to inquire about the eligibility ...,Your bachelor degree needs to be in Data Scien...,[https://www.uni-marburg.de/en/studying/degree...,True


### Aggregated Metrics (all queries)

In [4]:
runs = list(Path('../output/20250317-email/').glob('generation*'))
# runs = [run for run in runs if not 'oracle' in run.name] # TODO: remove me
runs

[PosixPath('../output/20250317-email/generation_gemma-3-27b-it_oracle'),
 PosixPath('../output/20250317-email/generation_gemma-3-4b-it'),
 PosixPath('../output/20250317-email/generation_llama-3.1-70b_w8a8'),
 PosixPath('../output/20250317-email/generation_llama-3.1-8b'),
 PosixPath('../output/20250317-email/generation_gemma-3-27b-it'),
 PosixPath('../output/20250317-email/generation_llama-3.1-70b'),
 PosixPath('../output/20250317-email/generation_llama-3.1-8b_w8a8'),
 PosixPath('../output/20250317-email/generation_gemma-3-1b-it'),
 PosixPath('../output/20250317-email/generation_gemma-3-12b-it')]

In [7]:
run_meta = pd.DataFrame({
    "generation_gemma-3-1b-it": {"gpus": 1},
    "generation_gemma-3-4b-it": {"gpus": 1},
    "generation_gemma-3-4b-it": {"gpus": 1},
    "generation_gemma-3-12b-it": {"gpus": 1},
    "generation_gemma-3-27b-it": {"gpus": 1},
    "generation_gemma-3-27b-it_oracle": {"gpus": 1},
    "generation_llama-3.1-8b": {"gpus": 1},
    "generation_llama-3.1-8b_w8a8": {"gpus": 1},
    "generation_llama-3.1-70b": {"gpus": 2},
    "generation_llama-3.1-70b_w8a8": {"gpus": 2},
}).T


In [8]:
all_metrics = []
for run_path in runs:
    run_path = Path(run_path)
    with open(run_path / 'metrics.json') as fin:
        metrics = json.load(fin)
        metrics['run_id'] = run_path.name

    with open(run_path / 'output.json') as fin:
        outputs = json.load(fin)
        metrics['duration'] = np.mean([sample['duration'] for sample in outputs])

    # calculate selectivity
    df_run = pd.read_json(run_path / 'metrics_by_sample.json')
    y_true = df_run['sources'].apply(lambda x: len(x) == 0).astype(int).tolist()
    y_pred = df_run['NonAnswerCritic'].apply(lambda d: d['raw'])
    metrics['selectivity'] = np.mean(compute_selectivity(y_true, y_pred))
        
    all_metrics.append(metrics)

df_metrics = pd.DataFrame(all_metrics)
df_metrics = df_metrics.set_index('run_id')
df_metrics = df_metrics.rename_axis(None, axis=0)
df_metrics['avg'] = df_metrics[['rouge1', 'BERTScore', 'AnswerFaithfulness', 'AnswerRelevance', 'selectivity']].mean(axis=1)
df_metrics['gpus'] = run_meta['gpus']
df_metrics = df_metrics[['GeneratedAnswerLength', 'ReferenceAnswerLength', 'ContextLength',
       'BLEU', 'rouge1', 'rouge2', 'rougeLsum', 'BERTScore',
       'AnswerSimilarity', 'AnswerFaithfulness', 'AnswerRelevance',
       'NonAnswerCritic', 'duration', 'selectivity', 'avg', 'gpus']]
df_metrics

Unnamed: 0,GeneratedAnswerLength,ReferenceAnswerLength,ContextLength,BLEU,rouge1,rouge2,rougeLsum,BERTScore,AnswerSimilarity,AnswerFaithfulness,AnswerRelevance,NonAnswerCritic,duration,selectivity,avg,gpus
generation_gemma-3-27b-it_oracle,109.312281,45.505263,1366.589474,0.084271,0.314183,0.146774,0.252543,0.171713,0.646229,0.557169,0.671983,0.74386,5.882649,0.915789,0.526168,1
generation_gemma-3-4b-it,196.915789,45.505263,5620.778947,0.03841,0.222056,0.087291,0.179805,0.042887,0.614126,0.56042,0.716876,0.789474,3.196034,0.814035,0.471255,1
generation_llama-3.1-70b_w8a8,117.417544,45.505263,5620.778947,0.077356,0.267664,0.123802,0.213965,0.150585,0.601064,0.693834,0.723121,0.715789,7.764474,0.768421,0.520725,2
generation_llama-3.1-8b,81.754386,45.505263,5620.778947,0.062436,0.205055,0.066859,0.159197,0.104566,0.565784,0.519473,0.742701,0.368421,2.206146,0.533333,0.421026,1
generation_gemma-3-27b-it,107.073684,45.505263,5620.778947,0.074435,0.287744,0.122395,0.22519,0.151955,0.623858,0.719232,0.691358,0.764912,7.69972,0.838596,0.537777,1
generation_llama-3.1-70b,118.319298,45.505263,5620.778947,0.077036,0.260572,0.120529,0.207323,0.147061,0.595954,0.670717,0.723342,0.677193,11.924302,0.74386,0.50911,2
generation_llama-3.1-8b_w8a8,82.747368,45.505263,5620.778947,0.042965,0.19359,0.055501,0.14405,0.086113,0.563427,0.539329,0.74444,0.364912,1.739515,0.522807,0.417256,1
generation_gemma-3-1b-it,150.242105,45.505263,5620.778947,0.013268,0.163902,0.030586,0.125409,-0.019309,0.554584,0.327918,0.727947,0.375439,7.148096,0.435088,0.327109,1
generation_gemma-3-12b-it,102.2,45.505263,5620.778947,0.073962,0.280611,0.117044,0.218255,0.147767,0.624867,0.750481,0.698595,0.680702,3.805117,0.782456,0.531982,1


### Metrics by sample (separately for answerable/unanswerable queries)

In [9]:
def load_metrics_by_sample(run_path):
    run_path = Path(run_path)
    df_run = pd.read_json(run_path / 'metrics_by_sample.json')
    # df_run = pd.merge(df_run, df_queries[['id','answerable']], on='id')
    df_run['answerable'] = df_run['sources'].apply(lambda x: len(x) > 0)

    # Calculate selectivity
    # NonAnswerCritic: 0 = answer, 1 = abstain
    # len(sources) == 0: 1 if abstain, 0 if should answer
    y_true = df_run['sources'].apply(lambda x: len(x) == 0).astype(int).tolist()
    y_pred = df_run['NonAnswerCritic'].apply(lambda d: d['raw'])
    df_run['selectivity'] = compute_selectivity(y_true, y_pred)
    
    def extract_average_score(x):
        # Extract the average score over multiple generations.
        # This returns score from the following dict:
        # {
        #    'score': 0.74,
        #    'raw': [x1, x2, x3]
        # }
        if isinstance(x, dict) and 'score' in x: 
            return x['score']
        return x
    
    df_run = df_run.map(extract_average_score)      
    return df_run    

In [10]:
metrics = [
    'GeneratedAnswerLength',
    'ReferenceAnswerLength',
    'ContextLength',
    'rouge1',
    'rouge2',
    'rougeLsum',
    'BERTScore',
    'AnswerSimilarity',
    'AnswerFaithfulness',
    'AnswerRelevance',
    'NonAnswerCritic',
    'duration',
    'selectivity',
]

evaluation_answerable = []
evaluation_unanswerable = []

for run_path in runs:
    df = load_metrics_by_sample(run_path)
    
    agg = df[df['answerable']][metrics].mean()
    agg.name = run_path.name
    evaluation_answerable.append(agg)

    agg = df[~df['answerable']][metrics].mean()
    agg.name = run_path.name
    evaluation_unanswerable.append(agg)
    

df_metrics_answerable = pd.DataFrame(evaluation_answerable)
df_metrics_answerable['gpus'] = run_meta['gpus']
df_metrics_answerable['avg'] = df_metrics_answerable[['rouge1', 'BERTScore', 'AnswerFaithfulness', 'AnswerRelevance', 'selectivity']].mean(axis=1)

df_metrics_unanswerable = pd.DataFrame(evaluation_unanswerable)
df_metrics_unanswerable['gpus'] = run_meta['gpus']
df_metrics_unanswerable['avg'] = df_metrics_answerable[['rouge1', 'BERTScore', 'AnswerFaithfulness', 'AnswerRelevance', 'selectivity']].mean(axis=1)

## Table 1: Quantization vs. No Quantization

In [11]:
rename_metrics = {
    'rouge1': 'ROUGE',
    'BERTScore': 'BERTScore',
    'AnswerFaithfulness': 'Faithfulness',
    'AnswerRelevance': 'Relevance',
    'selectivity': 'Selectivity',
    'avg': 'Avg.',
    
    # 'ReferenceAnswerLength': '|y|',
    # 'ContextLength': '|c|',
    # 'rouge2': 'R-2',
    # 'rougeLsum': 'R-L',
    # 'AnswerSimilarity': 'Similarity',
    # 'NonAnswerCritic': '% Answered',
    
    'GeneratedAnswerLength': '|ŷ|',
    'duration': 'sec/q',
}

rename_runs = {
    # "generation_gemma-3-1b-it": "Gemma 3 (1B)",
    # "generation_gemma-3-4b-it": "Gemma 3 (4B)",
    # "generation_gemma-3-12b-it": "Gemma 3 (12B)",
    # "generation_gemma-3-27b-it": "Gemma 3 (27B)",
    "generation_llama-3.1-8b": "Llama 3.1 (8B)",
    "generation_llama-3.1-8b_w8a8": "Llama 3.1 (8B, Q)",
    "generation_llama-3.1-70b": "Llama 3.1 (70B)",
    "generation_llama-3.1-70b_w8a8": "Llama 3.1 (70B, Q)"
}

df = df_metrics.loc[rename_runs.keys(), rename_metrics.keys()].rename(rename_runs, axis=0).rename(rename_metrics,axis=1)
df \
    .style \
    .format(precision=2) \
    .format(subset=['sec/q'], precision=1) \
    .format(subset=['|ŷ|'], precision=0)

Unnamed: 0,ROUGE,BERTScore,Faithfulness,Relevance,Selectivity,Avg.,|ŷ|,sec/q
Llama 3.1 (8B),0.21,0.1,0.52,0.74,0.53,0.42,82,2.2
"Llama 3.1 (8B, Q)",0.19,0.09,0.54,0.74,0.52,0.42,83,1.7
Llama 3.1 (70B),0.26,0.15,0.67,0.72,0.74,0.51,118,11.9
"Llama 3.1 (70B, Q)",0.27,0.15,0.69,0.72,0.77,0.52,117,7.8


## Table 2: answerable vs. unanswerable

In [15]:
rename_metrics = {
    'rouge1': 'ROUGE',
    'BERTScore': 'BERTScore',
    'AnswerFaithfulness': 'Faithfulness',
    'AnswerRelevance': 'Relevance',
    'selectivity': 'Selectivity',
    'avg': 'Avg.',
    
    # 'ReferenceAnswerLength': '|y|',
    # 'ContextLength': '|c|',
    # 'rouge2': 'R-2',
    # 'rougeLsum': 'R-L',
    # 'AnswerSimilarity': 'Similarity',
    # 'NonAnswerCritic': '% Answered',
    
    'GeneratedAnswerLength': '|ŷ|',
    'duration': 'sec/q',
    'gpus': 'GPUs',
}

rename_runs = {
    "generation_gemma-3-1b-it": "Gemma 3 (1B)",
    "generation_gemma-3-4b-it": "Gemma 3 (4B)",
    "generation_gemma-3-12b-it": "Gemma 3 (12B)",
    "generation_gemma-3-27b-it": "Gemma 3 (27B)",
    "generation_gemma-3-27b-it_oracle": "$\\rightarrow$ Oracle Retriever",
    "generation_llama-3.1-8b": "Llama 3.1 (8B)",
    "generation_llama-3.1-8b_w8a8": "$\\rightarrow$ 8b quantized",
    "generation_llama-3.1-70b": "Llama 3.1 (70B)",
    "generation_llama-3.1-70b_w8a8": "$\\rightarrow$ 70b quantized"
}

In [24]:
df = df_metrics_answerable.loc[rename_runs.keys(), rename_metrics.keys()].rename(rename_runs, axis=0).rename(rename_metrics,axis=1)
df.columns.name = 'Generator'
display(df)

max_index = [m for m in df.index if 'oracle' not in m.lower()]
max_cols = ['ROUGE', 'BERTScore', 'Faithfulness', 'Relevance', 'Selectivity', 'Avg.']
max_slice = (max_index, max_cols)

tex = df \
    .style \
    .format(precision=2) \
    .format(subset=['sec/q'], precision=1) \
    .format(subset=['|ŷ|'], precision=0) \
    .format_index("\\textbf{{{}}}", escape="latex", axis=1) \
    .highlight_max(subset=max_slice, props="font-weight:bold") \
    .to_latex(caption='c', label='tab:', hrules=True, position='t', convert_css=True)

print(tex)

Generator,ROUGE,BERTScore,Faithfulness,Relevance,Selectivity,Avg.,|ŷ|,sec/q,GPUs
Gemma 3 (1B),0.165704,-0.02823,0.332067,0.732343,0.381579,0.316693,142.486842,7.151602,1
Gemma 3 (4B),0.237562,0.044837,0.564397,0.722886,0.877193,0.489375,205.820175,3.276626,1
Gemma 3 (12B),0.298984,0.15015,0.761873,0.70655,0.789474,0.541406,112.934211,4.087632,1
Gemma 3 (27B),0.30782,0.160515,0.705015,0.701624,0.877193,0.550433,113.934211,7.99333,1
$\rightarrow$ Oracle Retriever,0.347427,0.182656,0.632724,0.677311,0.912281,0.55048,126.004386,6.833859,1
Llama 3.1 (8B),0.217968,0.09745,0.536198,0.736623,0.438596,0.405367,88.903509,2.367257,1
$\rightarrow$ 8b quantized,0.200713,0.073407,0.543385,0.747296,0.429825,0.398925,93.197368,1.920624,1
Llama 3.1 (70B),0.285146,0.156002,0.695324,0.738327,0.763158,0.527591,131.934211,12.540774,2
$\rightarrow$ 70b quantized,0.289009,0.156721,0.73095,0.731187,0.802632,0.5421,129.942982,8.338526,2


\begin{table}[t]
\caption{c}
\label{tab:}
\begin{tabular}{lrrrrrrrrr}
\toprule
Generator & \textbf{ROUGE} & \textbf{BERTScore} & \textbf{Faithfulness} & \textbf{Relevance} & \textbf{Selectivity} & \textbf{Avg.} & \textbf{|ŷ|} & \textbf{sec/q} & \textbf{GPUs} \\
\midrule
Gemma 3 (1B) & 0.17 & -0.03 & 0.33 & 0.73 & 0.38 & 0.32 & 142 & 7.2 & 1 \\
Gemma 3 (4B) & 0.24 & 0.04 & 0.56 & 0.72 & \bfseries 0.88 & 0.49 & 206 & 3.3 & 1 \\
Gemma 3 (12B) & 0.30 & 0.15 & \bfseries 0.76 & 0.71 & 0.79 & 0.54 & 113 & 4.1 & 1 \\
Gemma 3 (27B) & \bfseries 0.31 & \bfseries 0.16 & 0.71 & 0.70 & \bfseries 0.88 & \bfseries 0.55 & 114 & 8.0 & 1 \\
$\rightarrow$ Oracle Retriever & 0.35 & 0.18 & 0.63 & 0.68 & 0.91 & 0.55 & 126 & 6.8 & 1 \\
Llama 3.1 (8B) & 0.22 & 0.10 & 0.54 & 0.74 & 0.44 & 0.41 & 89 & 2.4 & 1 \\
$\rightarrow$ 8b quantized & 0.20 & 0.07 & 0.54 & \bfseries 0.75 & 0.43 & 0.40 & 93 & 1.9 & 1 \\
Llama 3.1 (70B) & 0.29 & 0.16 & 0.70 & 0.74 & 0.76 & 0.53 & 132 & 12.5 & 2 \\
$\rightarrow$ 70b quantized 

In [25]:
df = df_metrics_unanswerable.loc[rename_runs.keys(), rename_metrics.keys()].rename(rename_runs, axis=0).rename(rename_metrics,axis=1)
df.columns.name = 'Generator'

display(df)

max_index = [m for m in df.index if 'oracle' not in m.lower()]
max_cols = ['ROUGE', 'BERTScore', 'Faithfulness', 'Relevance', 'Selectivity', 'Avg.']
max_slice = (max_index, max_cols)

tex = df \
    .style \
    .format(precision=2) \
    .format(subset=['sec/q'], precision=1) \
    .format(subset=['|ŷ|'], precision=0) \
    .format_index("\\textbf{{{}}}", escape="latex", axis=1) \
    .highlight_max(subset=max_slice, props="font-weight:bold") \
    .to_latex(caption='c', label='tab:', hrules=True, position='t', convert_css=True)

print(tex)

Generator,ROUGE,BERTScore,Faithfulness,Relevance,Selectivity,Avg.,|ŷ|,sec/q,GPUs
Gemma 3 (1B),0.156694,0.016374,0.311322,0.64346,0.649123,0.316693,181.263158,7.134073,1
Gemma 3 (4B),0.160033,0.035087,0.542837,0.611876,0.561404,0.489375,161.298246,2.873665,1
Gemma 3 (12B),0.207118,0.138233,0.704915,0.561418,0.754386,0.541406,59.263158,2.675058,1
Gemma 3 (27B),0.207442,0.117717,0.75301,0.601267,0.684211,0.550433,79.631579,6.52528,1
$\rightarrow$ Oracle Retriever,0.181209,0.127938,0.25495,0.665002,0.929825,0.55048,42.54386,2.077812,1
Llama 3.1 (8B),0.1534,0.133027,0.469589,0.553541,0.912281,0.405367,53.157895,1.561701,1
$\rightarrow$ 8b quantized,0.165097,0.136939,0.523355,0.545162,0.894737,0.398925,40.947368,1.01508,1
Llama 3.1 (70B),0.162275,0.111295,0.550435,0.606192,0.666667,0.527591,63.859649,9.458411,2
$\rightarrow$ 70b quantized,0.182281,0.126042,0.550715,0.655709,0.631579,0.5421,67.315789,5.468266,2


\begin{table}[t]
\caption{c}
\label{tab:}
\begin{tabular}{lrrrrrrrrr}
\toprule
Generator & \textbf{ROUGE} & \textbf{BERTScore} & \textbf{Faithfulness} & \textbf{Relevance} & \textbf{Selectivity} & \textbf{Avg.} & \textbf{|ŷ|} & \textbf{sec/q} & \textbf{GPUs} \\
\midrule
Gemma 3 (1B) & 0.16 & 0.02 & 0.31 & 0.64 & 0.65 & 0.32 & 181 & 7.1 & 1 \\
Gemma 3 (4B) & 0.16 & 0.04 & 0.54 & 0.61 & 0.56 & 0.49 & 161 & 2.9 & 1 \\
Gemma 3 (12B) & 0.21 & \bfseries 0.14 & 0.70 & 0.56 & 0.75 & 0.54 & 59 & 2.7 & 1 \\
Gemma 3 (27B) & \bfseries 0.21 & 0.12 & \bfseries 0.75 & 0.60 & 0.68 & \bfseries 0.55 & 80 & 6.5 & 1 \\
$\rightarrow$ Oracle Retriever & 0.18 & 0.13 & 0.25 & 0.67 & 0.93 & 0.55 & 43 & 2.1 & 1 \\
Llama 3.1 (8B) & 0.15 & 0.13 & 0.47 & 0.55 & \bfseries 0.91 & 0.41 & 53 & 1.6 & 1 \\
$\rightarrow$ 8b quantized & 0.17 & 0.14 & 0.52 & 0.55 & 0.89 & 0.40 & 41 & 1.0 & 1 \\
Llama 3.1 (70B) & 0.16 & 0.11 & 0.55 & 0.61 & 0.67 & 0.53 & 64 & 9.5 & 2 \\
$\rightarrow$ 70b quantized & 0.18 & 0.13 & 0.55 & \bf