In [4]:
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

In [5]:
# Global matplotlib defaults for an academic look: serif text, a black axes
# frame, outward-pointing ticks, and 300-dpi output on screen and on save.
mpl.rcParams.update(
    {
        # Typography
        "font.family": "serif",
        "font.size": 12,
        "axes.labelsize": 14,
        "axes.titlesize": 16,
        # Axes frame
        "axes.edgecolor": "black",
        "axes.linewidth": 1.0,
        # Tick geometry, applied identically to the x and y axes
        **{
            f"{axis}tick.{prop}": value
            for axis in ("x", "y")
            for prop, value in (
                ("direction", "out"),
                ("major.size", 5),
                ("minor.size", 2.5),
                ("major.width", 1),
            )
        },
        # High resolution for publication
        "figure.dpi": 300,
        "savefig.dpi": 300,
    }
)

In [None]:
# Read the experiment log (one JSON record per line), print how many rows
# were loaded, and preview the first few.
df = pd.read_json(path_or_buf="exps.jsonl", lines=True)
print(df.shape[0])
df.head()

In [7]:
# Remove bookkeeping columns that the analysis below does not use.
# The frame is rebound rather than mutated with ``inplace=True``, and columns
# that are already absent are tolerated, so executing this cell a second time
# is a no-op instead of a KeyError.
df = df.drop(
    columns=[
        "id",
        "params.dataset.path",
        "params.dataset.name",
        "params.dataset.split",
        "params.qa.model",
    ],
    errors="ignore",
)

# Column groups are derived after the drop so that they only name columns
# which still exist in ``df``.
param_cols = [col for col in df.columns if col.startswith("params.")]
metric_cols = [col for col in df.columns if col.startswith("metrics.")]

In [None]:
# Hyperparameter columns used as the grouping/sorting key for a configuration.
target_param_cols = [
    "params.qa.technique",
    "params.qa.system_prompt",
    "params.qa.user_prompt_template",
    "params.qa.few_shot_examples",
    "params.qa.n_shot",
    "params.qa.n_sc",
    "params.qa.temperature",
]

# Order rows by configuration, then by run, and renumber the index from 0.
df = (
    df
    .sort_values(by=[*target_param_cols, "params.run"])
    .reset_index(drop=True)
)
df.head()

In [None]:
# Find, for each technique, the configuration with the highest mean F1.

# Average every metric over the rows that share the same hyperparameters.
# ``dropna=False`` keeps configurations whose key columns contain nulls;
# pandas' default (``dropna=True``) would silently discard those rows, and
# with them possibly an entire technique.
avg_df = (
    df.groupby(target_param_cols, dropna=False)[metric_cols]
    .mean()
    .reset_index()
)

# Index label (into ``avg_df``) of the top-F1 configuration per technique.
max_f1_rows = avg_df.groupby("params.qa.technique")["metrics.f1"].idxmax()

# One row per technique, best F1 first.
best_df = avg_df.loc[max_f1_rows, target_param_cols + metric_cols].sort_values(
    "metrics.f1", ascending=False
)
best_df

In [None]:
# Extract the generated-token statistics of each technique's best
# configuration and express mean verbosity relative to the "Direct" technique.
efficiency_metrics = [
    "metrics.gen_token_count.success.mean",
    "metrics.gen_token_count.success.std",
    "metrics.gen_token_count.success.min",
    "metrics.gen_token_count.success.max",
]

# Rename through an explicit mapping so each display name is tied to its
# source column rather than to its position in the list.
data = best_df[["params.qa.technique"] + efficiency_metrics].rename(
    columns={
        "params.qa.technique": "Technique",
        "metrics.gen_token_count.success.mean": "Mean Tokens",
        "metrics.gen_token_count.success.std": "Std Tokens",
        "metrics.gen_token_count.success.min": "Min Tokens",
        "metrics.gen_token_count.success.max": "Max Tokens",
    }
)

# Display labels: "direct" becomes "Direct", every other technique is upper-cased.
data["Technique"] = data["Technique"].map(
    lambda x: x.capitalize() if x == "direct" else x.upper()
)

# Baseline: mean token count of the "Direct" technique. Fail with a clear
# message if it is missing or unusable instead of raising a bare IndexError
# or silently filling the ratio column with inf/NaN.
direct_mean = data.loc[data["Technique"] == "Direct", "Mean Tokens"]
if direct_mean.empty:
    raise ValueError(
        "No 'direct' technique found in best_df; "
        "cannot compute the Relative Verbosity Ratio."
    )
lower_bound = direct_mean.iloc[0]
if not np.isfinite(lower_bound) or lower_bound <= 0:
    raise ValueError(
        f"Mean token count for 'Direct' must be a positive finite number, "
        f"got {lower_bound!r}."
    )

# Multiplication factor of each technique's mean relative to Direct.
data["Relative Verbosity Ratio"] = data["Mean Tokens"] / lower_bound

# Order techniques from least to most verbose.
data = data.sort_values("Relative Verbosity Ratio")
data

In [None]:
# Point per technique at the mean token count, with asymmetric whiskers that
# reach down to the minimum and up to the maximum.
fig, ax = plt.subplots(figsize=(10, 6))

below_mean = data["Mean Tokens"] - data["Min Tokens"]
above_mean = data["Max Tokens"] - data["Mean Tokens"]
ax.errorbar(
    data["Technique"],
    data["Mean Tokens"],
    yerr=[below_mean, above_mean],
    fmt="o",
    capsize=5,
    ecolor="red",
    color="blue",
)

ax.set_title("Mean with Min/Max Ranges of Generated Tokens by QA Technique")
ax.set_xlabel("QA Technique")
ax.set_ylabel("Number of Tokens")
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()


In [None]:
# Bar chart of mean generated tokens per technique; whiskers show ± one std.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(
    data["Technique"],
    data["Mean Tokens"],
    yerr=data["Std Tokens"],
    capsize=5,
    color="skyblue",
    edgecolor="black",
)
ax.set_title("Mean and Standard Deviation of Generated Tokens by QA Technique")
ax.set_xlabel("QA Technique")
ax.set_ylabel("Number of Tokens")
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()


In [None]:
# Monochrome variant of the mean/std bar chart: white bars with a black
# outline, no top/right frame, and minor ticks enabled.
fig, ax = plt.subplots(figsize=(10, 6))

bars = ax.bar(
    data["Technique"],
    data["Mean Tokens"],
    yerr=data["Std Tokens"],
    capsize=5,
    color="white",
    edgecolor="black",
    linewidth=1,
)

# Title and axis labels, padded away from the axes
ax.set_title(
    "Mean and Standard Deviation of Generated Tokens by QA Technique", pad=15
)
ax.set_xlabel("QA Technique", labelpad=10)
ax.set_ylabel("Number of Tokens", labelpad=10)

# Slant the technique names so they stay readable
plt.xticks(rotation=45, ha="right")

# Hide the top and right spines for a cleaner frame
for side in ("top", "right"):
    ax.spines[side].set_visible(False)

# Minor ticks for finer reading of values
ax.minorticks_on()

fig.tight_layout()

# Display the figure (swap for plt.savefig("figure.png") to write it to disk)
plt.show()


In [None]:
# NOTE(review): this cell draws the same skyblue mean/std bar chart as an
# earlier cell in this notebook; consider keeping only one copy.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(
    data["Technique"],
    data["Mean Tokens"],
    yerr=data["Std Tokens"],
    capsize=5,
    color="skyblue",
    edgecolor="black",
)
ax.set(
    title="Mean and Standard Deviation of Generated Tokens by QA Technique",
    xlabel="QA Technique",
    ylabel="Number of Tokens",
)
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()


In [None]:
# Lollipop-style chart. For every technique:
#   - a thin grey line spans min..max tokens,
#   - a thicker dark-blue segment spans mean ± std,
#   - a marker sits on the mean.
fig, ax = plt.subplots(figsize=(10, 6))

# A single pass over the rows; explicit zorder values keep the layering
# (range below std segment below mean marker) independent of draw order.
for _, row in data.iterrows():
    technique = row["Technique"]
    mean_tokens = row["Mean Tokens"]
    std_tokens = row["Std Tokens"]

    # Full min..max range
    ax.plot(
        [technique, technique],
        [row["Min Tokens"], row["Max Tokens"]],
        color='grey', lw=2, zorder=1,
    )
    # Std range centered on the mean
    ax.plot(
        [technique, technique],
        [mean_tokens - std_tokens, mean_tokens + std_tokens],
        color='darkblue', lw=3, zorder=2,
    )
    # Marker at the mean
    ax.scatter(
        [technique], [mean_tokens],
        color='skyblue', edgecolor='black', s=100, zorder=3,
    )

# Set titles and labels
ax.set_title("Token Generation Statistics by QA Technique", pad=15)
ax.set_xlabel("QA Technique", labelpad=10)
ax.set_ylabel("Number of Tokens", labelpad=10)

# Rotate x-axis labels for readability
plt.xticks(rotation=45, ha='right')

# Remove top and right spines for a cleaner look
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# Fit labels inside the canvas and render the figure
plt.tight_layout()
plt.show()


In [None]:
# Approximate each technique's token-count distribution from its summary
# statistics only: draw normal samples with the technique's mean/std and clip
# them to its [min, max] range. These are synthetic values, not observed
# per-sample counts, and clipping piles probability mass onto the bounds.
SYNTHETIC_SEED = 42        # fixed seed so the figure is identical on every run
N_SYNTHETIC_SAMPLES = 500  # draws per technique

# A fresh generator per execution makes the cell idempotent: re-running it
# (or Restart & Run All) reproduces exactly the same samples.
rng = np.random.default_rng(SYNTHETIC_SEED)

synthetic_data = []
for _, row in data.iterrows():
    samples = rng.normal(
        loc=row["Mean Tokens"],
        scale=row["Std Tokens"],
        size=N_SYNTHETIC_SAMPLES,
    )
    # Clip to min and max bounds
    samples = np.clip(samples, row["Min Tokens"], row["Max Tokens"])
    synthetic_data.extend(zip([row["Technique"]] * len(samples), samples))

# Long-format frame: one (technique, token count) pair per synthetic sample
synthetic_df = pd.DataFrame(synthetic_data, columns=["Technique", "Token Count"])

# Plot violin plot using Seaborn
plt.figure(figsize=(10, 6))
sns.violinplot(x="Technique", y="Token Count", data=synthetic_df, palette="pastel", inner="quartile")
plt.title("Approximate Distribution of Generated Tokens by QA Technique")
plt.xlabel("QA Technique")
plt.ylabel("Number of Tokens")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


In [None]:
# Bar chart of each technique's mean token count relative to "Direct".
fig, ax = plt.subplots(figsize=(10, 6))

technique_labels = data["Technique"]
bars = ax.bar(
    technique_labels,
    data["Relative Verbosity Ratio"],
    color="skyblue",
    edgecolor="black",
    linewidth=1,
)

# Recolor the baseline ("Direct") bar so it stands out from the others
for patch, label in zip(bars, technique_labels):
    if label != "Direct":
        continue
    patch.set_color("lightgreen")
    patch.set_edgecolor("darkgreen")

# Dashed guide at ratio = 1, the baseline's own value
ax.axhline(1, color="grey", linestyle="--", linewidth=1)

# Title and axis labels
ax.set_title("Relative Verbosity of QA Techniques Compared to 'Direct'", pad=15)
ax.set_xlabel("QA Technique", labelpad=10)
ax.set_ylabel("Relative Verbosity Ratio", labelpad=10)

plt.xticks(rotation=45, ha="right")
fig.tight_layout()
plt.show()
