In [1]:
#Fig1

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np
import matplotlib.patches as mpatches
from matplotlib.ticker import ScalarFormatter, FixedLocator, FuncFormatter

# ---- Fig. 1 settings ----
# Accelerator label: selects the input CSV and is echoed in titles/file names.
GPU_name = 'Ascend-910'
CSV_FILE_PATH = "stability_token_level_report_STD_RANGE_{}.csv".format(GPU_name)

# Base font size; tick labels and legends are sized relative to this value.
font_size = 26

# Raw model identifiers -> display names used in the figure.
MODEL_NAME_MAPPING = dict(
    [
        ("gemma3", "Gemma3-12B"),
        ("llama3.2", "Llama3.2-11B"),
        ("qwen3", "Qwen3-VL-8B"),
        ("deepseek_qwen3", "DeepSeek-Qwen3-8B"),
    ]
)

# Display names in the order the hue groups are drawn.
MODEL_NAMES_ORDER = [
    MODEL_NAME_MAPPING[raw_key]
    for raw_key in ("gemma3", "llama3.2", "qwen3", "deepseek_qwen3")
]

BATCH_SIZES = [2, 4, 8, 16]

# Hatch pattern per batch size (batch size 2 is drawn without a hatch).
BS_HATCH_MAP = dict(zip((16, 8, 4, 2), ("xx", "..", "//", "")))

# One fill colour per model display name.
MODEL_PALETTE = dict(
    zip(MODEL_NAMES_ORDER, ("skyblue", "salmon", "lightgreen", "mediumpurple"))
)

# Bin edges applied to the per-token metric values.
METRIC_BINS = [0, 1e-4, 1e-3, 0.01, 0.05, 0.1, 1.0]

# One "low-high" label for every pair of adjacent edges.
METRIC_BIN_LABELS_SHORT = [
    f"{low:g}-{high:g}" for low, high in zip(METRIC_BINS, METRIC_BINS[1:])
]

# Directory that receives the rendered figure; created on demand.
OUTPUT_PLOT_DIR = "Figures_final/Fig1"
os.makedirs(OUTPUT_PLOT_DIR, exist_ok=True)

# Load the token-level stability report.
# A failed read must stop the cell: if the error were only printed, the code
# below would run with `df_raw` undefined or, in a live kernel, with a stale
# `df_raw` left behind by an earlier execution.
try:
    df_raw = pd.read_csv(CSV_FILE_PATH)
except Exception as e:
    print(f"❌Error: {e}")
    raise

# Drop the symbol suffixes so the metric columns have plain identifiers.
# `rename` leaves the frame untouched for keys that are not present.
df_raw = df_raw.rename(columns={
    "Std_Prob_Runs (sigma_j)": "Std_Prob_Runs",
    "Range_Prob_Runs (R_j)": "Range_Prob_Runs"
})

# One figure with two panels that share a y-axis.
fig, axes = plt.subplots(1, 2, figsize=(30, 7), sharey=True)

# Metric column -> wording used in titles and axis labels.
plot_configs = {
    "Range_Prob_Runs": "Range",
    "Std_Prob_Runs": "Standard Deviation",
}
config_items = list(plot_configs.items())


def _format_tick(value, pos):
    """Render a y tick in compact general format (0.01, 1, 100, ...)."""
    return f'{value:g}'


for i, (ax, (metric_col, title_label)) in enumerate(zip(axes, config_items)):
    bin_col = f'{metric_col}_Bin'

    # Bin the metric; rows that pd.cut cannot place in a bin come back as NaN
    # and are dropped.
    df = df_raw.copy()
    df[bin_col] = pd.cut(
        df[metric_col],
        bins=METRIC_BINS,
        labels=METRIC_BIN_LABELS_SHORT,
        right=True,
        include_lowest=True,
    )
    df_clean = df.dropna(subset=[bin_col])

    # Row count per (batch size, model, bin).
    df_counts = (
        df_clean
        .groupby(['BatchSize', 'Model', bin_col], observed=True)
        .size()
        .reset_index(name='Count')
    )
    df_counts['Model'] = df_counts['Model'].replace(MODEL_NAME_MAPPING)

    # Express each count as a share of its model's total over all batch sizes
    # and bins.
    total_counts_per_model = (
        df_counts.groupby('Model')['Count'].sum().reset_index(name='Total')
    )
    df_counts = pd.merge(df_counts, total_counts_per_model, on='Model')
    df_counts['Percentage'] = (df_counts['Count'] / df_counts['Total']) * 100

    # One row per (model, bin) and one BS<n> column per batch size.
    df_pivoted = df_counts.pivot_table(
        index=['Model', bin_col],
        columns='BatchSize',
        values='Percentage',
        fill_value=0,
    )
    df_pivoted.columns = [f"BS{batch}" for batch in df_pivoted.columns]
    df_pivoted = df_pivoted.reset_index()

    # Running totals in ascending batch size: Stack_<n> is the sum of all
    # BS columns up to and including BS<n>.
    df_pivoted['Stack_2'] = df_pivoted['BS2']
    for below, current in ((2, 4), (4, 8), (8, 16)):
        df_pivoted[f'Stack_{current}'] = (
            df_pivoted[f'Stack_{below}'] + df_pivoted[f'BS{current}']
        )

    # Draw the tallest running total first; smaller batch sizes get a higher
    # zorder, so their shorter bars sit on top and the result looks stacked.
    stack_order = [16, 8, 4, 2]
    num_models = len(MODEL_NAMES_ORDER)
    for bs in stack_order:
        sns.barplot(
            data=df_pivoted,
            x=bin_col,
            y=f'Stack_{bs}',
            hue='Model',
            hue_order=MODEL_NAMES_ORDER,
            palette=MODEL_PALETTE,
            ax=ax,
            order=METRIC_BIN_LABELS_SHORT,
            edgecolor='black',
            linewidth=1,
            dodge=True,
            zorder=20 - bs,
        )

        # Only the most recently added containers (one per model) are styled.
        for container in ax.containers[-num_models:]:
            for bar in container:
                bar.set_hatch(BS_HATCH_MAP[bs])
                bar.set_alpha(1.0)

    ax.set_title(f"Distribution of {title_label} ({GPU_name})", fontsize=font_size, y=1.02)
    ax.set_xlabel(f"{title_label} Value", fontsize=font_size)

    # Only the first panel carries the y-axis label.
    if i != 0:
        ax.set_ylabel("")
    else:
        ax.set_ylabel("Percentage of Tokens (%)", fontsize=font_size)

    ax.set_yscale('log')
    ax.set_ylim(bottom=0.005, top=200)

    custom_ticks = [0.01, 0.1, 1, 10, 100]
    y_axis = ax.yaxis
    y_axis.set_major_locator(FixedLocator(custom_ticks))
    y_axis.set_major_formatter(FuncFormatter(_format_tick))
    y_axis.set_minor_formatter(plt.NullFormatter())

    ax.grid(axis='y', alpha=0.3, linestyle='--', which='major')
    plt.setp(ax.get_xticklabels(), fontsize=font_size-4, rotation=0)
    ax.tick_params(axis='y', labelsize=font_size)

    # Remove the legend seaborn attached; custom legends are built afterwards.
    auto_legend = ax.get_legend()
    if auto_legend:
        auto_legend.remove()

# Both legends are attached to the right-hand panel, outside its plot area.
target_ax = axes[1]
handles, labels = target_ax.get_legend_handles_labels()

# Keep the first handle seen for each model; the repeated barplot calls yield
# the same labels several times.
first_handle_by_model = {}
for handle, label in zip(handles, labels):
    if label in MODEL_NAMES_ORDER:
        first_handle_by_model.setdefault(label, handle)
unique_model_labels = list(first_handle_by_model.keys())
unique_model_handles = list(first_handle_by_model.values())

# Colour-neutral swatches that explain the hatch pattern of each batch size.
bs_handles = [
    mpatches.Patch(
        facecolor='white', edgecolor='black', hatch=BS_HATCH_MAP[bs], label=f"Batch {bs}"
    )
    for bs in [16, 8, 4, 2]
]

shared_legend_style = dict(
    loc='upper left',
    fontsize=font_size-4,
    title_fontsize=font_size-4,
    framealpha=0.9,
)

legend_1 = target_ax.legend(
    unique_model_handles, unique_model_labels,
    title="Model",
    bbox_to_anchor=(1.02, 1.0),
    **shared_legend_style,
)
# A later legend() call replaces the axes' current legend, so the first one is
# re-added as a plain artist to keep both visible.
target_ax.add_artist(legend_1)

legend_2 = target_ax.legend(
    handles=bs_handles,
    title="Batch Size Contribution",
    bbox_to_anchor=(1.02, 0.50),
    **shared_legend_style,
)

plt.tight_layout()
save_path = os.path.join(OUTPUT_PLOT_DIR, f"Distribution_Combined_Range_Std_{GPU_name}.png")
# Pass the legends explicitly so the tight bounding box includes them.
plt.savefig(save_path, dpi=300, bbox_inches='tight', bbox_extra_artists=(legend_1, legend_2))
plt.close(fig)
print(f"✅ Saved to: {save_path}")

In [None]:
#Fig2

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np
import matplotlib.patches as mpatches
from matplotlib.ticker import ScalarFormatter, FixedLocator, FuncFormatter

# ---- Fig. 2 settings ----
# Accelerator label: selects the input CSV and is echoed in titles/file names.
GPU_name = 'Ascend-910'
CSV_FILE_PATH = "stability_token_level_report_STD_RANGE_{}.csv".format(GPU_name)

# Base font size; tick labels and legends are sized relative to this value.
font_size = 18

# Raw model identifiers -> display names used in the figure.
MODEL_NAME_MAPPING = dict(
    [
        ("gemma3", "Gemma3-12B"),
        ("llama3.2", "Llama3.2-11B"),
        ("qwen3", "Qwen3-VL-8B"),
        ("deepseek_qwen3", "DeepSeek-Qwen3-8B"),
    ]
)

# Display names in the order the hue groups are drawn.
MODEL_NAMES_ORDER = [
    MODEL_NAME_MAPPING[raw_key]
    for raw_key in ("gemma3", "llama3.2", "qwen3", "deepseek_qwen3")
]

BATCH_SIZES = [2, 4, 8, 16]

# Hatch pattern per batch size (batch size 2 is drawn without a hatch).
BS_HATCH_MAP = dict(zip((16, 8, 4, 2), ("xx", "..", "//", "")))

# One fill colour per model display name.
MODEL_PALETTE = dict(
    zip(MODEL_NAMES_ORDER, ("skyblue", "salmon", "lightgreen", "mediumpurple"))
)

# Bin edges in steps of 0.1; rounding removes the floating-point noise that
# np.arange accumulates.
METRIC_BINS = [round(edge, 1) for edge in np.arange(0, 1.1, 0.1)]

# One "low-high" label for every pair of adjacent edges.
METRIC_BIN_LABELS_SHORT = [
    f"{low:g}-{high:g}" for low, high in zip(METRIC_BINS, METRIC_BINS[1:])
]

# Directory that receives the rendered figure; created on demand.
OUTPUT_PLOT_DIR = "Figures_final/Fig2"
os.makedirs(OUTPUT_PLOT_DIR, exist_ok=True)

# Load the token-level stability report.
# A failed read must stop the cell: if the error were only printed, the code
# below would run with `df_raw` undefined or, in a live kernel, with a stale
# `df_raw` left behind by an earlier cell.
try:
    df_raw = pd.read_csv(CSV_FILE_PATH)
except Exception as e:
    print(f"❌ Error: {e}")
    raise
print(f"Loaded: {CSV_FILE_PATH}")

# Drop the symbol suffix so the metric column has a plain identifier.
# `rename` leaves the frame untouched for keys that are not present.
df_raw = df_raw.rename(columns={
    "Mean_Prob_Runs (mu_j)": "Mean_Prob_Runs",
})

# Single wide panel for the mean-probability distribution.
fig, ax = plt.subplots(figsize=(16, 4))

metric_col = "Mean_Prob_Runs"
title_label = "Mean Probability"
_bin_col = f'{metric_col}_Bin'

# Bin the metric; rows that pd.cut cannot place in a bin come back as NaN and
# are dropped.
df = df_raw.copy()
df[_bin_col] = pd.cut(
    df[metric_col],
    bins=METRIC_BINS,
    labels=METRIC_BIN_LABELS_SHORT,
    right=True,
    include_lowest=True,
)
df_clean = df.dropna(subset=[_bin_col])

# Row count per (batch size, model, bin).
df_counts = (
    df_clean
    .groupby(['BatchSize', 'Model', _bin_col], observed=True)
    .size()
    .reset_index(name='Count')
)
df_counts['Model'] = df_counts['Model'].replace(MODEL_NAME_MAPPING)

# Express each count as a share of its model's total over all batch sizes and
# bins.
total_counts_per_model = (
    df_counts.groupby('Model')['Count'].sum().reset_index(name='Total')
)
df_counts = pd.merge(df_counts, total_counts_per_model, on='Model')
df_counts['Percentage'] = (df_counts['Count'] / df_counts['Total']) * 100

# One row per (model, bin) and one BS<n> column per batch size.
df_pivoted = df_counts.pivot_table(
    index=['Model', _bin_col],
    columns='BatchSize',
    values='Percentage',
    fill_value=0,
)
df_pivoted.columns = [f"BS{batch}" for batch in df_pivoted.columns]
df_pivoted = df_pivoted.reset_index()

# Running totals in ascending batch size: Stack_<n> is the sum of all BS
# columns up to and including BS<n>.
df_pivoted['Stack_2'] = df_pivoted['BS2']
for _below, _current in ((2, 4), (4, 8), (8, 16)):
    df_pivoted[f'Stack_{_current}'] = (
        df_pivoted[f'Stack_{_below}'] + df_pivoted[f'BS{_current}']
    )

def _format_tick(value, pos):
    """Render a y tick in compact general format (0.01, 1, 100, ...)."""
    return f'{value:g}'


# Draw the tallest running total first; smaller batch sizes get a higher
# zorder, so their shorter bars sit on top and the result looks stacked.
stack_order = [16, 8, 4, 2]
num_models = len(MODEL_NAMES_ORDER)
for bs in stack_order:
    sns.barplot(
        data=df_pivoted,
        x=f'{metric_col}_Bin',
        y=f'Stack_{bs}',
        hue='Model',
        hue_order=MODEL_NAMES_ORDER,
        palette=MODEL_PALETTE,
        ax=ax,
        order=METRIC_BIN_LABELS_SHORT,
        edgecolor='black',
        linewidth=1,
        dodge=True,
        zorder=20 - bs,
    )

    # Only the most recently added containers (one per model) are styled.
    for container in ax.containers[-num_models:]:
        for bar in container:
            bar.set_hatch(BS_HATCH_MAP[bs])
            bar.set_alpha(1.0)

ax.set_title(f"Distribution of {title_label} ({GPU_name})", fontsize=font_size, y=1.02)
ax.set_xlabel(f"{title_label} Value", fontsize=font_size)
ax.set_ylabel("Percentage of Tokens (%)", fontsize=font_size)

ax.set_yscale('log')
ax.set_ylim(bottom=0.005, top=200)

custom_ticks = [0.01, 0.1, 1, 10, 100]
_y_axis = ax.yaxis
_y_axis.set_major_locator(FixedLocator(custom_ticks))
_y_axis.set_major_formatter(FuncFormatter(_format_tick))
_y_axis.set_minor_formatter(plt.NullFormatter())

ax.grid(axis='y', alpha=0.3, linestyle='--', which='major')
plt.setp(ax.get_xticklabels(), fontsize=font_size-2, rotation=0)
ax.tick_params(axis='y', labelsize=font_size)

# Remove the legend seaborn attached; custom legends are built afterwards.
_auto_legend = ax.get_legend()
if _auto_legend:
    _auto_legend.remove()

handles, labels = ax.get_legend_handles_labels()

# Keep the first handle seen for each model; the repeated barplot calls yield
# the same labels several times.
first_handle_by_model = {}
for handle, label in zip(handles, labels):
    if label in MODEL_NAMES_ORDER:
        first_handle_by_model.setdefault(label, handle)
unique_model_labels = list(first_handle_by_model.keys())
unique_model_handles = list(first_handle_by_model.values())

# Colour-neutral swatches that explain the hatch pattern of each batch size.
bs_handles = [
    mpatches.Patch(
        facecolor='white', edgecolor='black', hatch=BS_HATCH_MAP[bs], label=f"Batch {bs}"
    )
    for bs in [16, 8, 4, 2]
]

shared_legend_style = dict(
    loc='upper left',
    fontsize=font_size-6,
    title_fontsize=font_size-6,
    framealpha=0.9,
)

legend_1 = ax.legend(
    unique_model_handles, unique_model_labels,
    title="Model",
    bbox_to_anchor=(1.01, 1.0),
    **shared_legend_style,
)
# A later legend() call replaces the axes' current legend, so the first one is
# re-added as a plain artist to keep both visible.
ax.add_artist(legend_1)

legend_2 = ax.legend(
    handles=bs_handles,
    title="Batch Size Contribution",
    bbox_to_anchor=(1.01, 0.5),
    **shared_legend_style,
)

plt.tight_layout()
save_path = os.path.join(OUTPUT_PLOT_DIR, f"Distribution_Mean_Prob_{GPU_name}.png")
# Pass the legends explicitly so the tight bounding box includes them.
plt.savefig(save_path, dpi=300, bbox_inches='tight', bbox_extra_artists=(legend_1, legend_2))
plt.close(fig)
print(f"✅ Saved to: {save_path}")

In [None]:
#fig3 and fig4

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np
import matplotlib.patches as mpatches
from matplotlib.ticker import LogLocator, FuncFormatter

# ---- Fig. 3 / Fig. 4 settings ----
# Accelerator label: selects the input CSV and is echoed in titles/file names.
GPU_name = 'Ascend-910'
CSV_FILE_PATH = "stability_token_level_report_STD_RANGE_{}.csv".format(GPU_name)

# Base font size; tick labels and legends are sized relative to this value.
font_size = 18

# Raw model identifiers -> display names used in the figures.
MODEL_NAME_MAPPING = dict(
    [
        ("gemma3", "Gemma3-12B"),
        ("llama3.2", "Llama3.2-11B"),
        ("qwen3", "Qwen3-VL-8B"),
        ("deepseek_qwen3", "DeepSeek-Qwen3-8B"),
    ]
)

# Display names in the order the hue groups are drawn.
MODEL_NAMES_ORDER = [
    MODEL_NAME_MAPPING[raw_key]
    for raw_key in ("gemma3", "llama3.2", "qwen3", "deepseek_qwen3")
]

BATCH_SIZES = [2, 4, 8, 16]

# Hatch pattern per batch size (batch size 2 is drawn without a hatch).
BS_HATCH_MAP = dict(zip((16, 8, 4, 2), ("xx", "..", "//", "")))

# One fill colour per model display name.
MODEL_PALETTE = dict(
    zip(MODEL_NAMES_ORDER, ("skyblue", "salmon", "lightgreen", "mediumpurple"))
)

# Bin edges for the baseline probability, with narrow bins at both extremes.
PROB_BINS = [
    0, 0.001, 0.1, 0.2, 0.3, 0.4, 0.5,
    0.6, 0.7, 0.8, 0.9, 0.999, 1.0,
]

# One "low-high" label for every pair of adjacent edges.
PROB_BIN_LABELS = [
    f"{low:g}-{high:g}" for low, high in zip(PROB_BINS, PROB_BINS[1:])
]

# Directory that receives the rendered figures; created on demand.
OUTPUT_PLOT_DIR = "Figures_final/Fig3"
os.makedirs(OUTPUT_PLOT_DIR, exist_ok=True)
# When True the y-axis is switched to a log scale with fixed limits.
USE_LOG_SCALE_Y = True

# Load the token-level stability report.
# A failed read must stop the cell: if the error were only printed, the code
# below would run with `df` undefined or, in a live kernel, with a stale `df`
# left behind by an earlier cell.
try:
    df = pd.read_csv(CSV_FILE_PATH)
except Exception as e:
    print(f"❌ Error: {e}")
    raise
print(f"Loaded: {CSV_FILE_PATH}")

# Drop the symbol suffixes so the metric columns have plain identifiers.
# `rename` leaves the frame untouched for keys that are not present.
df = df.rename(columns={
    "Std_Prob_Runs (sigma_j)": "Std_Prob_Runs",
    "Range_Prob_Runs (R_j)": "Range_Prob_Runs"
})

# Bucket every row by its Prob_B1 value; rows that pd.cut cannot place in a
# bin come back as NaN and are dropped.
df = df.assign(
    Prob_B1_Bin=pd.cut(
        df['Prob_B1'],
        bins=PROB_BINS,
        labels=PROB_BIN_LABELS,
        right=True,
        include_lowest=True,
    )
).dropna(subset=['Prob_B1_Bin'])

# Mean of each instability metric per (batch size, model, probability bin).
df_agg = (
    df
    .groupby(['BatchSize', 'Model', 'Prob_B1_Bin'], observed=True)
    .agg(
        Std_Prob_Runs_Mean=('Std_Prob_Runs', 'mean'),
        Range_Prob_Runs_Mean=('Range_Prob_Runs', 'mean'),
    )
    .reset_index()
)
df_agg['Model'] = df_agg['Model'].replace(MODEL_NAME_MAPPING)

# One row per (model, bin); columns become "<metric>_BS<batch size>".
metrics = ["Std_Prob_Runs_Mean", "Range_Prob_Runs_Mean"]
df_pivoted = df_agg.pivot_table(
    index=['Model', 'Prob_B1_Bin'],
    columns='BatchSize',
    values=metrics,
    fill_value=0,
)
df_pivoted.columns = [
    f"{metric_name}_BS{batch}" for metric_name, batch in df_pivoted.columns
]
df_pivoted = df_pivoted.reset_index()

# Running totals in ascending batch size: <metric>_Stack_<n> is the sum of the
# per-batch means up to and including batch size n.
for metric in ["Std_Prob_Runs", "Range_Prob_Runs"]:
    base_col = f"{metric}_Mean"
    running_total = df_pivoted[f'{base_col}_BS2']
    df_pivoted[f'{base_col}_Stack_2'] = running_total
    for batch in (4, 8, 16):
        running_total = running_total + df_pivoted[f'{base_col}_BS{batch}']
        df_pivoted[f'{base_col}_Stack_{batch}'] = running_total


# Metric prefix -> wording used in titles and axis labels; one figure each.
plot_configs = {
    "Range_Prob_Runs": "Range",
    "Std_Prob_Runs": "Standard Deviation",
}


def math_formatter(x, pos):
    """Label a tick as a power of ten in mathtext; non-positive values give "0"."""
    if x <= 0:
        return "0"
    exponent = int(np.round(np.log10(x)))
    return f"$10^{{{exponent}}}$"


for metric_key, title_label in plot_configs.items():
    metric_col_base = f"{metric_key}_Mean"

    fig, ax = plt.subplots(figsize=(16, 4))

    stack_order = [16, 8, 4, 2]
    ylim_bottom = 1e-6
    ylim_top = 1.5
    num_models = len(MODEL_NAMES_ORDER)

    # Draw the tallest running total first; smaller batch sizes get a higher
    # zorder, so their shorter bars sit on top and the result looks stacked.
    for bs in stack_order:
        sns.barplot(
            data=df_pivoted,
            x='Prob_B1_Bin',
            y=f'{metric_col_base}_Stack_{bs}',
            hue='Model',
            hue_order=MODEL_NAMES_ORDER,
            palette=MODEL_PALETTE,
            ax=ax,
            order=PROB_BIN_LABELS,
            edgecolor='black',
            linewidth=1,
            dodge=True,
            zorder=20 - bs,
        )

        # Only the most recently added containers (one per model) are styled.
        for container in ax.containers[-num_models:]:
            for bar in container:
                bar.set_hatch(BS_HATCH_MAP[bs])
                bar.set_alpha(1.0)

    ax.set_title(f"Average {title_label} vs Probability ({GPU_name})", fontsize=font_size, y=1.02)
    ax.set_xlabel("Baseline Probability", fontsize=font_size)
    ax.set_ylabel(f"Average {title_label}", fontsize=font_size)

    if USE_LOG_SCALE_Y:
        ax.set_yscale('log')
        ax.set_ylim(bottom=ylim_bottom, top=ylim_top)

    # Major ticks at whole powers of ten, labelled as 10^k; minor ticks stay
    # unlabelled.
    y_axis = ax.yaxis
    y_axis.set_major_locator(LogLocator(base=10.0, subs=(1.0,), numticks=10))
    y_axis.set_major_formatter(FuncFormatter(math_formatter))
    y_axis.set_minor_formatter(plt.NullFormatter())

    ax.grid(axis='y', alpha=0.3, linestyle='--', which='major')

    plt.setp(ax.get_xticklabels(), fontsize=font_size-2, rotation=0)
    ax.tick_params(axis='y', labelsize=font_size)

    # Remove the legend seaborn attached; custom legends are built below.
    auto_legend = ax.get_legend()
    if auto_legend:
        auto_legend.remove()

    target_ax = ax
    handles, labels = target_ax.get_legend_handles_labels()

    # Keep the first handle seen for each model; the repeated barplot calls
    # yield the same labels several times.
    first_handle_by_model = {}
    for handle, label in zip(handles, labels):
        if label in MODEL_NAMES_ORDER:
            first_handle_by_model.setdefault(label, handle)
    unique_model_labels = list(first_handle_by_model.keys())
    unique_model_handles = list(first_handle_by_model.values())

    # Colour-neutral swatches that explain the hatch of each batch size.
    bs_handles = [
        mpatches.Patch(
            facecolor='white', edgecolor='black', hatch=BS_HATCH_MAP[bs], label=f"Batch {bs}"
        )
        for bs in [16, 8, 4, 2]
    ]

    shared_legend_style = dict(
        loc='upper left',
        fontsize=font_size-6,
        title_fontsize=font_size-6,
        framealpha=0.9,
    )

    legend_1 = target_ax.legend(
        unique_model_handles, unique_model_labels,
        title="Model",
        bbox_to_anchor=(1.02, 1.0),
        **shared_legend_style,
    )
    # A later legend() call replaces the axes' current legend, so the first
    # one is re-added as a plain artist to keep both visible.
    target_ax.add_artist(legend_1)

    legend_2 = target_ax.legend(
        handles=bs_handles,
        title="Batch Size Contribution",
        bbox_to_anchor=(1.02, 0.50),
        **shared_legend_style,
    )

    plt.tight_layout()
    save_path = os.path.join(OUTPUT_PLOT_DIR, f"Stacked_Average_{metric_key}_{GPU_name}.png")
    # Pass the legends explicitly so the tight bounding box includes them.
    plt.savefig(save_path, dpi=300, bbox_inches='tight', bbox_extra_artists=(legend_1, legend_2))
    plt.close(fig)
    print(f"✅ Saved to: {save_path}")


In [None]:
#fig5

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np
import matplotlib.patches as mpatches
from matplotlib.ticker import FixedLocator, FuncFormatter

# ---- Fig. 5 settings ----
# Accelerator label: selects the input CSV and is echoed in titles/file names.
GPU_name = 'A100'
CSV_FILE_PATH = "stability_token_level_report_STD_RANGE_{}.csv".format(GPU_name)

# Base font size; tick labels and legends are sized relative to this value.
font_size = 18

# Raw model identifiers -> display names used in the figure.
MODEL_NAME_MAPPING = dict(
    [
        ("gemma3", "Gemma3-12B"),
        ("llama3.2", "Llama3.2-11B"),
        ("qwen3", "Qwen3-VL-8B"),
        ("deepseek_qwen3", "DeepSeek-Qwen3-8B"),
    ]
)

# Display names in the order the hue groups are drawn.
MODEL_NAMES_ORDER = [
    MODEL_NAME_MAPPING[raw_key]
    for raw_key in ("gemma3", "llama3.2", "qwen3", "deepseek_qwen3")
]

BATCH_SIZES = [2, 4, 8, 16]

# Hatch pattern per batch size (batch size 2 is drawn without a hatch).
BS_HATCH_MAP = dict(zip((16, 8, 4, 2), ("xx", "..", "//", "")))

# One fill colour per model display name.
MODEL_PALETTE = dict(
    zip(MODEL_NAMES_ORDER, ("skyblue", "salmon", "lightgreen", "mediumpurple"))
)

# Bin edges for the baseline probability, with narrow bins at both extremes.
PROB_BINS = [
    0, 0.001, 0.1, 0.2, 0.3, 0.4, 0.5,
    0.6, 0.7, 0.8, 0.9, 0.999, 1.0,
]

# One "low-high" label for every pair of adjacent edges.
PROB_BIN_LABELS = [
    f"{low:g}-{high:g}" for low, high in zip(PROB_BINS, PROB_BINS[1:])
]

# Directory that receives the rendered figure; created on demand.
OUTPUT_PLOT_DIR = "Figures_final/Fig5"
os.makedirs(OUTPUT_PLOT_DIR, exist_ok=True)

# When True the y-axis is switched to a log scale; this figure uses linear.
USE_LOG_SCALE_Y = False

# Load the token-level stability report.
# A failed read must stop the cell: if the error were only printed, the code
# below would run with `df` undefined or, in a live kernel, with a stale `df`
# left behind by an earlier cell (possibly for a different GPU), producing a
# mislabelled figure.
# The logit-range column is used under its original name, so no rename is
# needed (the previous identity rename was a no-op).
try:
    df = pd.read_csv(CSV_FILE_PATH)
except Exception as e:
    print(f"❌ Error: {e}")
    raise


# Bucket every row by its Prob_B1 value; rows that pd.cut cannot place in a
# bin come back as NaN and are dropped.
df = df.assign(
    Prob_B1_Bin=pd.cut(
        df['Prob_B1'],
        bins=PROB_BINS,
        labels=PROB_BIN_LABELS,
        right=True,
        include_lowest=True,
    )
).dropna(subset=['Prob_B1_Bin'])

# Mean logit range per (batch size, model, probability bin).
df_agg = (
    df
    .groupby(['BatchSize', 'Model', 'Prob_B1_Bin'], observed=True)
    .agg(Range_Logit_Runs_Mean=('Range_Logit_Runs', 'mean'))
    .reset_index()
)
df_agg['Model'] = df_agg['Model'].replace(MODEL_NAME_MAPPING)

# One row per (model, bin); columns become "<metric>_BS<batch size>".
metrics = ["Range_Logit_Runs_Mean"]
df_pivoted = df_agg.pivot_table(
    index=['Model', 'Prob_B1_Bin'],
    columns='BatchSize',
    values=metrics,
    fill_value=0,
)
df_pivoted.columns = [
    f"{metric_name}_BS{batch}" for metric_name, batch in df_pivoted.columns
]
df_pivoted = df_pivoted.reset_index()

# Running totals in ascending batch size: <metric>_Stack_<n> is the sum of the
# per-batch means up to and including batch size n.
metric = "Range_Logit_Runs"
base_col = f"{metric}_Mean"
_running_total = df_pivoted[f'{base_col}_BS2']
df_pivoted[f'{base_col}_Stack_2'] = _running_total
for _batch in (4, 8, 16):
    _running_total = _running_total + df_pivoted[f'{base_col}_BS{_batch}']
    df_pivoted[f'{base_col}_Stack_{_batch}'] = _running_total


# Metric prefix -> wording used in titles and axis labels; one figure each.
plot_configs = {
    "Range_Logit_Runs": "Logit Range",
}


def _format_tick(value, pos):
    """Render a y tick in compact general format (0, 1, 2, ...)."""
    return f'{value:g}'


for metric_key, title_label in plot_configs.items():
    metric_col_base = f"{metric_key}_Mean"
    fig, ax = plt.subplots(figsize=(16, 4))

    stack_order = [16, 8, 4, 2]
    ylim_bottom = 0
    ylim_top = 4.5
    num_models = len(MODEL_NAMES_ORDER)

    # Draw the tallest running total first; smaller batch sizes get a higher
    # zorder, so their shorter bars sit on top and the result looks stacked.
    for bs in stack_order:
        sns.barplot(
            data=df_pivoted,
            x='Prob_B1_Bin',
            y=f'{metric_col_base}_Stack_{bs}',
            hue='Model',
            hue_order=MODEL_NAMES_ORDER,
            palette=MODEL_PALETTE,
            ax=ax,
            order=PROB_BIN_LABELS,
            edgecolor='black',
            linewidth=1,
            dodge=True,
            zorder=20 - bs,
        )

        # Only the most recently added containers (one per model) are styled.
        for container in ax.containers[-num_models:]:
            for bar in container:
                bar.set_hatch(BS_HATCH_MAP[bs])
                bar.set_alpha(1.0)

    ax.set_title(f"Average {title_label} vs Probability ({GPU_name})", fontsize=font_size, y=1.02)
    ax.set_xlabel("Baseline Probability", fontsize=font_size)
    ax.set_ylabel(f"Average {title_label}", fontsize=font_size)

    ax.set_yscale('log' if USE_LOG_SCALE_Y else 'linear')
    ax.set_ylim(bottom=ylim_bottom, top=ylim_top)

    # Fixed integer ticks; minor ticks stay unlabelled.
    custom_ticks = [0, 1, 2, 3, 4]
    y_axis = ax.yaxis
    y_axis.set_major_locator(FixedLocator(custom_ticks))
    y_axis.set_major_formatter(FuncFormatter(_format_tick))
    y_axis.set_minor_formatter(plt.NullFormatter())

    ax.grid(axis='y', alpha=0.3, linestyle='--', which='major')

    plt.setp(ax.get_xticklabels(), fontsize=font_size-2, rotation=0)
    ax.tick_params(axis='y', labelsize=font_size)

    # Remove the legend seaborn attached; custom legends are built below.
    auto_legend = ax.get_legend()
    if auto_legend:
        auto_legend.remove()

    target_ax = ax
    handles, labels = target_ax.get_legend_handles_labels()

    # Keep the first handle seen for each model; the repeated barplot calls
    # yield the same labels several times.
    first_handle_by_model = {}
    for handle, label in zip(handles, labels):
        if label in MODEL_NAMES_ORDER:
            first_handle_by_model.setdefault(label, handle)
    unique_model_labels = list(first_handle_by_model.keys())
    unique_model_handles = list(first_handle_by_model.values())

    # Colour-neutral swatches that explain the hatch of each batch size.
    bs_handles = [
        mpatches.Patch(
            facecolor='white', edgecolor='black', hatch=BS_HATCH_MAP[bs], label=f"Batch {bs}"
        )
        for bs in [16, 8, 4, 2]
    ]

    shared_legend_style = dict(
        loc='upper left',
        fontsize=font_size-6,
        title_fontsize=font_size-6,
        framealpha=0.9,
    )

    legend_1 = target_ax.legend(
        unique_model_handles, unique_model_labels,
        title="Model",
        bbox_to_anchor=(1.02, 1.0),
        **shared_legend_style,
    )
    # A later legend() call replaces the axes' current legend, so the first
    # one is re-added as a plain artist to keep both visible.
    target_ax.add_artist(legend_1)

    legend_2 = target_ax.legend(
        handles=bs_handles,
        title="Batch Size Contribution",
        bbox_to_anchor=(1.02, 0.50),
        **shared_legend_style,
    )

    plt.tight_layout()
    save_path = os.path.join(OUTPUT_PLOT_DIR, f"Stacked_Average_{metric_key}_{GPU_name}.png")
    # Pass the legends explicitly so the tight bounding box includes them.
    plt.savefig(save_path, dpi=300, bbox_inches='tight', bbox_extra_artists=(legend_1, legend_2))
    plt.close(fig)
    print(f"✅ Saved to: {save_path}")


In [None]:
#fig6

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np
import matplotlib.patches as mpatches
from matplotlib.ticker import LogLocator, FuncFormatter

# ---- Fig. 6 settings ----
# Accelerators compared in each figure, in hue order.
GPU_LIST = ['A100', 'A6000', 'H200', 'Ascend-910']

# Per-GPU report file name; "{}" is filled with the GPU label.
CSV_FILE_TEMPLATE = "stability_token_level_report_STD_RANGE_{}.csv"

# Base font size; tick labels and legends are sized relative to this value.
font_size = 22

# Raw model identifiers -> display names used in the figures.
MODEL_NAME_MAPPING = dict(
    [
        ("gemma3", "Gemma3-12B"),
        ("llama3.2", "Llama3.2-11B"),
        ("qwen3", "Qwen3-VL-8B"),
        ("deepseek_qwen3", "DeepSeek-Qwen3-8B"),
    ]
)

# Display names in the order the per-model figures are produced.
MODEL_NAMES_ORDER = [
    MODEL_NAME_MAPPING[raw_key]
    for raw_key in ("gemma3", "llama3.2", "qwen3", "deepseek_qwen3")
]

BATCH_SIZES = [2, 4, 8, 16]

# Hatch pattern per batch size (batch size 2 is drawn without a hatch).
BS_HATCH_MAP = dict(zip((16, 8, 4, 2), ("xx", "..", "//", "")))

# One fill colour per GPU label.
GPU_PALETTE = dict(zip(GPU_LIST, ('#76b900', '#003366', '#808080', '#C02C38')))

# Bin edges for the baseline probability, with narrow bins at both extremes.
PROB_BINS = [
    0, 0.001, 0.1, 0.2, 0.3, 0.4, 0.5,
    0.6, 0.7, 0.8, 0.9, 0.999, 1.0,
]

# One "low-high" label for every pair of adjacent edges.
PROB_BIN_LABELS = [
    f"{low:g}-{high:g}" for low, high in zip(PROB_BINS, PROB_BINS[1:])
]

# Directory that receives the rendered figures; created on demand.
OUTPUT_PLOT_DIR = "Figures_final/Fig6"
os.makedirs(OUTPUT_PLOT_DIR, exist_ok=True)
# When True the y-axis is switched to a log scale with fixed limits.
USE_LOG_SCALE_Y = True

# Collect one frame per GPU, tagging every row with the GPU it came from.
# Loading is best-effort: a report that cannot be read is reported and
# skipped so the remaining GPUs can still be plotted.
df_list = []

for gpu in GPU_LIST:
    file_path = CSV_FILE_TEMPLATE.format(gpu)
    try:
        temp_df = pd.read_csv(file_path)
        temp_df['GPU'] = gpu
        # Drop the symbol suffixes so the metric columns have plain names.
        temp_df = temp_df.rename(columns={
            "Std_Prob_Runs (sigma_j)": "Std_Prob_Runs",
            "Range_Prob_Runs (R_j)": "Range_Prob_Runs"
        })
        df_list.append(temp_df)
    except Exception as e:
        print(f"  ❌ Failed loading {file_path}: {e}")

# With nothing loaded there is nothing to plot. Raise instead of calling
# exit(): inside a notebook exit() ends the kernel session rather than just
# stopping this cell.
if not df_list:
    raise RuntimeError(
        f"No data: none of the reports for {GPU_LIST} could be loaded"
    )

df_all = pd.concat(df_list, ignore_index=True)


# Bucket every row by its Prob_B1 value; rows that pd.cut cannot place in a
# bin come back as NaN and are dropped.
df_all = df_all.assign(
    Prob_B1_Bin=pd.cut(
        df_all['Prob_B1'],
        bins=PROB_BINS,
        labels=PROB_BIN_LABELS,
        right=True,
        include_lowest=True,
    )
).dropna(subset=['Prob_B1_Bin'])

# Swap raw model identifiers for their display names.
df_all['Model'] = df_all['Model'].replace(MODEL_NAME_MAPPING)


def math_formatter(x, pos):
    """Label a tick as a power of ten in mathtext; non-positive values give "0"."""
    if x <= 0:
        return "0"
    exponent = int(np.round(np.log10(x)))
    return f"$10^{{{exponent}}}$"


# One two-panel figure per model, comparing GPUs within each probability bin.
for current_model in MODEL_NAMES_ORDER:
    print(f"\n====== Processing: {current_model} ======")

    df_model = df_all[df_all['Model'] == current_model].copy()

    if df_model.empty:
        print(f"  ⚠️ {current_model} has no data, skipped")
        continue

    # Mean of each instability metric per (batch size, GPU, probability bin).
    df_agg = (
        df_model
        .groupby(['BatchSize', 'GPU', 'Prob_B1_Bin'], observed=True)
        .agg(
            Std_Prob_Runs_Mean=('Std_Prob_Runs', 'mean'),
            Range_Prob_Runs_Mean=('Range_Prob_Runs', 'mean'),
        )
        .reset_index()
    )

    # One row per (GPU, bin); columns become "<metric>_BS<batch size>".
    metrics = ["Std_Prob_Runs_Mean", "Range_Prob_Runs_Mean"]
    df_pivoted = df_agg.pivot_table(
        index=['GPU', 'Prob_B1_Bin'],
        columns='BatchSize',
        values=metrics,
        fill_value=0,
    )
    df_pivoted.columns = [
        f"{metric_name}_BS{batch}" for metric_name, batch in df_pivoted.columns
    ]
    df_pivoted = df_pivoted.reset_index()

    # Running totals in ascending batch size: <metric>_Stack_<n> is the sum of
    # the per-batch means up to and including batch size n.
    for metric in ["Std_Prob_Runs", "Range_Prob_Runs"]:
        base_col = f"{metric}_Mean"
        running_total = df_pivoted[f'{base_col}_BS2']
        df_pivoted[f'{base_col}_Stack_2'] = running_total
        for batch in (4, 8, 16):
            running_total = running_total + df_pivoted[f'{base_col}_BS{batch}']
            df_pivoted[f'{base_col}_Stack_{batch}'] = running_total

    # Metric prefix -> wording used in titles and axis labels; one panel each.
    plot_configs = {
        "Range_Prob_Runs": "Range",
        "Std_Prob_Runs": "Standard Deviation",
    }

    fig, axes = plt.subplots(1, 2, figsize=(30, 7), sharey=True)
    config_items = list(plot_configs.items())

    for i, (ax, (metric_key, title_label)) in enumerate(zip(axes, config_items)):
        metric_col_base = f"{metric_key}_Mean"

        stack_order = [16, 8, 4, 2]
        ylim_bottom = 1e-6
        ylim_top = 1.5
        num_gpus = len(GPU_LIST)

        # Draw the tallest running total first; smaller batch sizes get a
        # higher zorder, so their shorter bars sit on top.
        for bs in stack_order:
            sns.barplot(
                data=df_pivoted,
                x='Prob_B1_Bin',
                y=f'{metric_col_base}_Stack_{bs}',
                hue='GPU',
                hue_order=GPU_LIST,
                palette=GPU_PALETTE,
                ax=ax,
                order=PROB_BIN_LABELS,
                edgecolor='black',
                linewidth=1,
                dodge=True,
                zorder=20 - bs,
            )

            # Only the most recently added containers (one per GPU) are styled.
            for container in ax.containers[-num_gpus:]:
                for bar in container:
                    bar.set_hatch(BS_HATCH_MAP[bs])
                    bar.set_alpha(1.0)

        ax.set_title(f"{current_model}: Average {title_label}", fontsize=font_size, y=1.02)
        ax.set_xlabel("Baseline Probability", fontsize=font_size)

        # Only the first panel carries the y-axis label.
        if i != 0:
            ax.set_ylabel("")
        else:
            ax.set_ylabel(f"Average {title_label}", fontsize=font_size)

        if USE_LOG_SCALE_Y:
            ax.set_yscale('log')
            ax.set_ylim(bottom=ylim_bottom, top=ylim_top)

        # Major ticks at whole powers of ten, labelled as 10^k; minor ticks
        # stay unlabelled.
        y_axis = ax.yaxis
        y_axis.set_major_locator(LogLocator(base=10.0, subs=(1.0,), numticks=10))
        y_axis.set_major_formatter(FuncFormatter(math_formatter))
        y_axis.set_minor_formatter(plt.NullFormatter())

        ax.grid(axis='y', alpha=0.3, linestyle='--', which='major')
        plt.setp(ax.get_xticklabels(), fontsize=font_size-4, rotation=0)
        ax.tick_params(axis='y', labelsize=font_size)

        # Remove the legend seaborn attached; custom legends are built below.
        auto_legend = ax.get_legend()
        if auto_legend:
            auto_legend.remove()

    # Both legends are attached to the right-hand panel, outside its plot area.
    target_ax = axes[1]
    handles, labels = target_ax.get_legend_handles_labels()

    # Keep the first handle seen for each GPU; the repeated barplot calls
    # yield the same labels several times.
    first_handle_by_gpu = {}
    for handle, label in zip(handles, labels):
        if label in GPU_LIST:
            first_handle_by_gpu.setdefault(label, handle)
    unique_gpu_labels = list(first_handle_by_gpu.keys())
    unique_gpu_handles = list(first_handle_by_gpu.values())

    # Colour-neutral swatches that explain the hatch of each batch size.
    bs_handles = [
        mpatches.Patch(
            facecolor='white', edgecolor='black', hatch=BS_HATCH_MAP[bs], label=f"Batch {bs}"
        )
        for bs in [16, 8, 4, 2]
    ]

    # The GPU legend is set larger (font_size-2) than the batch-size legend
    # (font_size-6).
    legend_1 = target_ax.legend(
        unique_gpu_handles, unique_gpu_labels,
        title="GPU",
        loc='upper left',
        bbox_to_anchor=(1.02, 1.0),
        fontsize=font_size-2,
        title_fontsize=font_size-2,
        framealpha=0.9,
    )
    # A later legend() call replaces the axes' current legend, so the first
    # one is re-added as a plain artist to keep both visible.
    target_ax.add_artist(legend_1)

    legend_2 = target_ax.legend(
        handles=bs_handles,
        title="Batch Size Contribution",
        loc='upper left',
        bbox_to_anchor=(1.02, 0.50),
        fontsize=font_size-6,
        title_fontsize=font_size-6,
        framealpha=0.9,
    )

    plt.tight_layout()

    # Build a file-system friendly name: spaces -> "_", dots -> "-".
    safe_model_name = current_model.replace(" ", "_").replace(".", "-")
    save_path = os.path.join(OUTPUT_PLOT_DIR, f"Stacked_Average_{safe_model_name}_AllGPUs.png")
    # Pass the legends explicitly so the tight bounding box includes them.
    plt.savefig(save_path, dpi=300, bbox_inches='tight', bbox_extra_artists=(legend_1, legend_2))
    plt.close(fig)
    print(f"✅ Saved to: {save_path}")

In [None]:
#fig7 and fig8

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np
import matplotlib.patches as mpatches
from matplotlib.ticker import LogLocator, FuncFormatter


# ---------------------------------------------------------------------------
# Cell configuration: input report, display names, colours, hatches and bins.
# ---------------------------------------------------------------------------

# The GPU tag selects the input CSV and is echoed in plot titles / file names.
GPU_name = 'A6000'
CSV_FILE_PATH = f"stability_token_level_report_STD_RANGE_gemma_{GPU_name}.csv"

# Base font size; individual plot elements offset from this value.
font_size = 18

# Raw model identifier -> display name shown in the legend.
MODEL_NAME_MAPPING = dict(
    gemma3_270M="Gemma3-270M",
    gemma3_1B="Gemma3-1B",
    gemma3_4B="Gemma3-4B",
    gemma3="Gemma3-12B",
)

# Hue order for the plots: display names in the mapping's declaration order.
MODEL_NAMES_ORDER = list(MODEL_NAME_MAPPING.values())

BATCH_SIZES = [2, 4, 8, 16]

# Hatch pattern that identifies each batch-size layer of a stacked bar.
BS_HATCH_MAP = dict(zip((16, 8, 4, 2), ("xx", "..", "//", "")))

# One fill colour per model, paired positionally with MODEL_NAMES_ORDER.
MODEL_PALETTE = dict(
    zip(MODEL_NAMES_ORDER, ("skyblue", "salmon", "lightgreen", "mediumpurple"))
)

# Bin edges for the baseline probability; the first and last bins are narrow
# so values very close to 0 and to 1 get their own buckets.
PROB_BINS = [
    0, 0.001,
    0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
    0.999, 1.0,
]

# "low-high" label for every consecutive pair of edges (e.g. "0.9-0.999").
PROB_BIN_LABELS = [
    f"{lower:g}-{upper:g}" for lower, upper in zip(PROB_BINS[:-1], PROB_BINS[1:])
]

USE_LOG_SCALE_Y = True

# Figures are written here; create the folder up front so savefig cannot fail on it.
OUTPUT_PLOT_DIR = "Figures_final/Fig7"
os.makedirs(OUTPUT_PLOT_DIR, exist_ok=True)

# Load the per-token stability report for the configured GPU.
#
# A failed read must abort the cell. Merely printing the error would leave
# `df` undefined -- or, in a live kernel, still bound to a frame created by an
# earlier cell -- and everything below would run on the wrong data.
try:
    df = pd.read_csv(CSV_FILE_PATH)
except Exception as e:
    print(f"❌ Error: {e}")
    raise

# Drop the symbol suffixes so the metric columns have plain identifiers.
df = df.rename(columns={
    "Std_Prob_Runs (sigma_j)": "Std_Prob_Runs",
    "Range_Prob_Runs (R_j)": "Range_Prob_Runs"
})

# Fail early, naming the file, if a column used further down is absent.
required_columns = ["Prob_B1", "BatchSize", "Model", "Std_Prob_Runs", "Range_Prob_Runs"]
missing_columns = [name for name in required_columns if name not in df.columns]
if missing_columns:
    raise KeyError(
        f"{CSV_FILE_PATH} is missing required column(s): {missing_columns}"
    )

# Assign each row to a baseline-probability bin; rows whose probability falls
# outside PROB_BINS get NaN from pd.cut and are dropped.
df['Prob_B1_Bin'] = pd.cut(
    df['Prob_B1'], bins=PROB_BINS, labels=PROB_BIN_LABELS, right=True, include_lowest=True
)
df = df.dropna(subset=['Prob_B1_Bin'])

# Mean of each metric per (batch size, model, probability bin).
df_agg = df.groupby(
    ['BatchSize', 'Model', 'Prob_B1_Bin'], observed=True
).agg(
    Std_Prob_Runs_Mean=('Std_Prob_Runs', 'mean'),
    Range_Prob_Runs_Mean=('Range_Prob_Runs', 'mean')
).reset_index()

# Switch to display names so they match MODEL_NAMES_ORDER / MODEL_PALETTE.
df_agg['Model'] = df_agg['Model'].replace(MODEL_NAME_MAPPING)

# One row per (model, bin), one column per (metric, batch size).
metrics = ["Std_Prob_Runs_Mean", "Range_Prob_Runs_Mean"]
df_pivoted = df_agg.pivot_table(
    index=['Model', 'Prob_B1_Bin'],
    columns='BatchSize',
    values=metrics,
    fill_value=0
)

# Flatten the (metric, batch size) column MultiIndex into "<metric>_BS<size>".
df_pivoted.columns = [f"{col[0]}_BS{col[1]}" for col in df_pivoted.columns]
df_pivoted = df_pivoted.reset_index()

# Every configured batch size must have produced a column; otherwise the
# stacking below would fail with a KeyError that does not say what is wrong.
expected_bs_columns = [f"{base_col}_BS{bs}" for base_col in metrics for bs in BATCH_SIZES]
missing_bs_columns = [name for name in expected_bs_columns if name not in df_pivoted.columns]
if missing_bs_columns:
    raise KeyError(
        f"No data for expected batch-size column(s) {missing_bs_columns}; "
        f"available columns: {list(df_pivoted.columns)}"
    )

# Cumulative heights for the stacked bars: "<metric>_Stack_<bs>" is the sum of
# the per-batch means for all batch sizes up to and including <bs>.
for base_col in metrics:
    running_total = None
    for bs in sorted(BATCH_SIZES):
        layer = df_pivoted[f'{base_col}_BS{bs}']
        running_total = layer if running_total is None else running_total + layer
        df_pivoted[f'{base_col}_Stack_{bs}'] = running_total

# Metric key -> human-readable label used in the title and the y-axis label.
plot_configs = {
    "Range_Prob_Runs": "Range", 
    "Std_Prob_Runs": "Standard Deviation"
}


def math_formatter(x, pos):
    """Format a log-axis tick value as a LaTeX power of ten, e.g. ``$10^{-3}$``.

    Non-positive values have no logarithm and are rendered as "0".
    ``pos`` is the tick position passed in by ``FuncFormatter``; it is unused.
    """
    if x <= 0:
        return "0"
    exponent = int(np.round(np.log10(x)))
    return f"$10^{{{exponent}}}$"


# The stack columns are cumulative, so the tallest layer (16) is drawn first
# with the lowest z-order and each shorter layer is painted on top of it.
stack_order = [16, 8, 4, 2]

# Fixed y-range used on the log axis so the figures are directly comparable.
ylim_bottom = 1e-6
ylim_top = 1.5

for metric_key, title_label in plot_configs.items():
    metric_col_base = f"{metric_key}_Mean"

    fig, ax = plt.subplots(figsize=(16, 4))

    for bs in stack_order:
        y_col = f'{metric_col_base}_Stack_{bs}'

        # Remember how many bar containers exist so that only the ones added
        # by this barplot call are hatched. Counting back by the number of
        # models would re-hatch an earlier layer if a call added fewer.
        containers_before = len(ax.containers)

        sns.barplot(
            data=df_pivoted,
            x='Prob_B1_Bin',
            y=y_col,
            hue='Model',
            hue_order=MODEL_NAMES_ORDER,
            palette=MODEL_PALETTE,
            ax=ax,
            order=PROB_BIN_LABELS,
            edgecolor='black',
            linewidth=1,
            dodge=True, 
            zorder=20 - bs 
        )

        current_hatch = BS_HATCH_MAP[bs]
        for container in ax.containers[containers_before:]:
            for bar in container:
                bar.set_hatch(current_hatch)
                bar.set_alpha(1.0) 

    ax.set_title(f"Average {title_label} vs Probability ({GPU_name})", fontsize=font_size, y=1.02)
    ax.set_xlabel("Baseline Probability", fontsize=font_size)
    ax.set_ylabel(f"Average {title_label}", fontsize=font_size)

    if USE_LOG_SCALE_Y:
        ax.set_yscale('log')
        ax.set_ylim(bottom=ylim_bottom, top=ylim_top)

        # Decade-only ticks with 10^k labels are meaningful only on a log
        # axis, so they are configured here and not for a linear axis.
        ax.yaxis.set_major_locator(LogLocator(base=10.0, subs=(1.0,), numticks=10))
        ax.yaxis.set_major_formatter(FuncFormatter(math_formatter))
        ax.yaxis.set_minor_formatter(plt.NullFormatter())

    ax.grid(axis='y', alpha=0.3, linestyle='--', which='major')

    plt.setp(ax.get_xticklabels(), fontsize=font_size-2, rotation=0) 
    ax.tick_params(axis='y', labelsize=font_size)

    # Drop the legend seaborn attached; two custom legends replace it below.
    if ax.get_legend():
        ax.get_legend().remove()

    target_ax = ax 
    handles, labels = target_ax.get_legend_handles_labels()

    # Each barplot call contributes its own set of labelled handles; keep the
    # first handle seen for every model name.
    unique_model_handles = []
    unique_model_labels = []
    seen = set()
    for h, l in zip(handles, labels):
        if l in MODEL_NAMES_ORDER and l not in seen:
            unique_model_handles.append(h)
            unique_model_labels.append(l)
            seen.add(l)

    # Colour-neutral swatches that explain the hatch used for each batch size.
    bs_handles = []
    for bs in stack_order:
        patch = mpatches.Patch(
            facecolor='white', edgecolor='black', hatch=BS_HATCH_MAP[bs], label=f"Batch {bs}"
        )
        bs_handles.append(patch)

    legend_1 = target_ax.legend(
        unique_model_handles, unique_model_labels,
        title="Model", 
        loc='upper left', 
        bbox_to_anchor=(1.02, 1.0), 
        fontsize=font_size-6, 
        title_fontsize=font_size-6,
        framealpha=0.9
    )
    # A second legend() call replaces the axes' legend, so the first one is
    # re-attached as a plain artist to keep both visible.
    target_ax.add_artist(legend_1)

    legend_2 = target_ax.legend(
        handles=bs_handles,
        title="Batch Size Contribution",
        loc='upper left', 
        bbox_to_anchor=(1.02, 0.50), 
        fontsize=font_size-6, 
        title_fontsize=font_size-6,
        framealpha=0.9
    )

    plt.tight_layout()
    save_path = os.path.join(OUTPUT_PLOT_DIR, f"Stacked_Average_{metric_key}_{GPU_name}.png")
    # Both legends sit outside the axes; pass them explicitly so the tight
    # bounding box includes them in the saved image.
    plt.savefig(save_path, dpi=300, bbox_inches='tight', bbox_extra_artists=(legend_1, legend_2))
    plt.close(fig)
    print(f"✅ Saved to: {save_path}")
