In [32]:
import pandas as pd

# Load data

In [33]:
# Read the JSON-lines results file and flatten nested objects into dotted
# column names (e.g. "config.branches", "results.mse").
results_file = "zlibtrunc-02-bits-results/results.jsonl"
raw = pd.read_json(results_file, lines=True)
df = pd.json_normalize(raw.to_dict(orient="records"), sep=".")

# Blank out distortion metrics that are exactly 0 by replacing them with NaN.
# The original note targets the "0 truncated bits" runs; be aware that the
# test is on the metric value itself, not on truncBits.
# NOTE(review): presumably done so the log-scale plots never receive zeros - confirm.
for distortion_column in (
    'results.mse',
    'results.ks_statistic',
    'results.earth_mover_distance',
    'results.jensen_shannon_divergence',
):
    is_zero = df[distortion_column] == 0
    df.loc[is_zero, [distortion_column]] = float('nan')

df

Unnamed: 0,config.branches,config.chunk_size,config.compressor,config.compressor_config.compressionLevel,config.compressor_config.truncBits,config.decomp_file,config.input_file,config.iterations,config.results_file,config.tree,...,results.earth_mover_distance,results.jensen_shannon_divergence,results.ks_statistic,results.mse,results.original_size_bytes,results.psnr,results.rel_error_avg,results.rel_error_max,system.host,system.timestamp
0,AnalysisJetsAuxDyn.pt,65536,zlib-trunc,5,0,,data.root,10,zlibtrunc-02-bits.sh.20260128_120800/results.j...,CollectionTree,...,,,,,10635492,,0.000000,0.000000,Niamh,2026-01-28 12:08:15
1,AnalysisJetsAuxDyn.eta,65536,zlib-trunc,5,0,,data.root,10,zlibtrunc-02-bits.sh.20260128_120800/results.j...,CollectionTree,...,,,,,10635492,,0.000000,0.000000,Niamh,2026-01-28 12:08:28
2,AnalysisJetsAuxDyn.phi,65536,zlib-trunc,5,0,,data.root,10,zlibtrunc-02-bits.sh.20260128_120800/results.j...,CollectionTree,...,,,,,10635492,,0.000000,0.000000,Niamh,2026-01-28 12:08:41
3,AnalysisElectronsAuxDyn.pt,65536,zlib-trunc,5,0,,data.root,10,zlibtrunc-02-bits.sh.20260128_120800/results.j...,CollectionTree,...,,,,,55500,,0.000000,0.000000,Niamh,2026-01-28 12:08:41
4,AnalysisElectronsAuxDyn.eta,65536,zlib-trunc,5,0,,data.root,10,zlibtrunc-02-bits.sh.20260128_120800/results.j...,CollectionTree,...,,,,,55500,,0.000000,0.000000,Niamh,2026-01-28 12:08:41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,AnalysisJetsAuxDyn.eta,65536,zlib-trunc,5,23,,data.root,10,zlibtrunc-02-bits.sh.20260128_120800/results.j...,CollectionTree,...,0.638012,0.785233,0.216767,6.790482e-01,10635492,21.351120,0.241489,0.500000,Niamh,2026-01-28 12:24:50
140,AnalysisJetsAuxDyn.phi,65536,zlib-trunc,5,23,,data.root,10,zlibtrunc-02-bits.sh.20260128_120800/results.j...,CollectionTree,...,0.416323,0.749478,0.190197,2.764731e-01,10635492,21.547062,0.269270,0.500000,Niamh,2026-01-28 12:25:00
141,AnalysisElectronsAuxDyn.pt,65536,zlib-trunc,5,23,,data.root,10,zlibtrunc-02-bits.sh.20260128_120800/results.j...,CollectionTree,...,1890.933594,0.088452,0.443243,1.174993e+07,55500,41.765934,0.288008,0.499959,Niamh,2026-01-28 12:25:00
142,AnalysisElectronsAuxDyn.eta,65536,zlib-trunc,5,23,,data.root,10,zlibtrunc-02-bits.sh.20260128_120800/results.j...,CollectionTree,...,0.278382,0.712746,0.171892,1.350175e-01,55500,22.662504,0.258806,0.499976,Niamh,2026-01-28 12:25:00


# Visualization

In [37]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Requires `df` from the "Load data" cell above.

# Color mapping: group branches by variable type (.pt, .eta, .phi) and give
# each branch within a family its own shade.
branches = df['config.branches'].unique()

# One palette per variable-type suffix. Shades are handed out in the order the
# branches appear and wrap around once a palette is exhausted.
color_families = {
    # Orange family (from #FFA15A)
    '.pt': ['#FFA15A', '#CC7A3D', '#FFB87D', '#994F1A'],
    # Lime/yellow-green family (from #B6E880)
    '.eta': ['#B6E880', '#92C45C', '#CEFF9F', '#6B9933'],
    # Cyan family (from #19D3F3)
    '.phi': ['#19D3F3', '#0FA6C2', '#5DE3FF', '#067891'],
}

# Shades for branches that match none of the suffixes above. Without a
# fallback such a branch gets no entry in `colors`, and the plotting loops
# fail with KeyError on `colors[branch]`.
fallback_colors = [
    # Purple family (from #AB63FA)
    '#AB63FA', '#8344CC', '#C490FF', '#5C2299',
    # Pink family (from #FF6692)
    '#FF6692', '#CC4570', '#FF94B3', '#992244',
]

# Number of branches already colored per family; selects the next shade.
family_counts = {key: 0 for key in color_families}
colors = {}
other_idx = 0  # number of branches colored from the fallback palette

for branch in branches:
    for suffix, palette in color_families.items():
        if suffix in branch:
            idx = family_counts[suffix]
            colors[branch] = palette[idx % len(palette)]
            family_counts[suffix] += 1
            break
    else:
        # No family suffix matched: use the next fallback shade.
        colors[branch] = fallback_colors[other_idx % len(fallback_colors)]
        other_idx += 1

# Coerce the truncated-bit count to a numeric dtype, then collect its distinct
# values in ascending order for use as x-axis tick positions.
trunc_bits = pd.to_numeric(df['config.compressor_config.truncBits'])
df['config.compressor_config.truncBits'] = trunc_bits
tickvals = list(df['config.compressor_config.truncBits'].unique())
tickvals.sort()

# =============================================================================
# Figure 1: System Metrics (ratio, compression/decompression throughput)
# =============================================================================
fig_system = make_subplots(
    rows=1,
    cols=3,
    subplot_titles=(
        'Compression Ratio',
        'Compression Throughput (MB/s)',
        'Decompression Throughput (MB/s)',
    ),
    horizontal_spacing=0.08,
)

# Result column drawn in each subplot, left to right.
system_metric_columns = (
    'results.compression_ratio',
    'results.compression_throughput_mbps',
    'results.decompression_throughput_mbps',
)

for branch in branches:
    # One line per branch, ordered by the number of truncated bits.
    branch_rows = df.loc[df['config.branches'] == branch].sort_values(
        'config.compressor_config.truncBits'
    )
    shade = colors[branch]

    for subplot_col, metric_column in enumerate(system_metric_columns, start=1):
        trace_kwargs = {
            'x': branch_rows['config.compressor_config.truncBits'],
            'y': branch_rows[metric_column],
            'mode': 'lines+markers',
            'name': branch,
            'legendgroup': branch,
            'line': {'color': shade},
            'marker': {'color': shade},
        }
        # Only the first subplot's trace keeps its legend entry; the others
        # share its legendgroup and are hidden from the legend.
        if subplot_col > 1:
            trace_kwargs['showlegend'] = False

        fig_system.add_trace(go.Scatter(**trace_kwargs), row=1, col=subplot_col)



# =============================================================================
# Figure 2: Quality Metrics (PSNR, K-S, EMD, JS divergence)
# =============================================================================
fig_quality = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=(
        'PSNR (dB)',
        'K-S Statistic',
        "Earth Mover's Distance",
        'Jensen-Shannon Divergence',
    ),
    horizontal_spacing=0.1,
    vertical_spacing=0.15,
)

# (result column, subplot row, subplot col), in the order traces are added.
quality_metric_layout = (
    ('results.psnr', 1, 1),
    ('results.ks_statistic', 1, 2),
    ('results.earth_mover_distance', 2, 1),
    ('results.jensen_shannon_divergence', 2, 2),
)

for branch in branches:
    # One line per branch, ordered by the number of truncated bits.
    branch_rows = df.loc[df['config.branches'] == branch].sort_values(
        'config.compressor_config.truncBits'
    )
    shade = colors[branch]

    for position, (metric_column, subplot_row, subplot_col) in enumerate(quality_metric_layout):
        trace_kwargs = {
            'x': branch_rows['config.compressor_config.truncBits'],
            'y': branch_rows[metric_column],
            'mode': 'lines+markers',
            'name': branch,
            'legendgroup': branch,
            'line': {'color': shade},
            'marker': {'color': shade},
        }
        # Only the first (PSNR) trace keeps its legend entry; the others share
        # its legendgroup and are hidden from the legend.
        if position > 0:
            trace_kwargs['showlegend'] = False

        fig_quality.add_trace(
            go.Scatter(**trace_kwargs), row=subplot_row, col=subplot_col
        )

# Update layouts and display

# X-axis settings shared by every subplot of both figures: one tick per
# truncation level that occurs in the data.
x_axis_style = {
    'tickvals': tickvals,
    'ticktext': tickvals,
    'title_text': 'Mantissa Bits Truncated',
}

fig_system.update_xaxes(**x_axis_style)
fig_system.update_layout(
    height=400,
    width=1600,
    title_text="Truncation+ZLib: Effect of bit truncation on performance metrics",
)

fig_quality.update_xaxes(**x_axis_style)
# K-S (1, 2), EMD (2, 1) and JS divergence (2, 2) use log-scale y-axes; the
# PSNR subplot (1, 1) keeps a linear axis.
for log_row, log_col in ((1, 2), (2, 1), (2, 2)):
    fig_quality.update_yaxes(type='log', row=log_row, col=log_col)
fig_quality.update_layout(
    height=700,
    width=1200,
    title_text="Truncation+ZLib: Effect of bit truncation on quality metrics",
)

# Show both
for figure in (fig_system, fig_quality):
    figure.show()

# Optionally save
# fig_system.write_html('truncation_system_metrics.html')
# fig_quality.write_html('truncation_quality_metrics.html')