In [1]:
import pandas as pd

The goal of this experiment is to understand the impact of chunk size on bit truncation with zlib.

`zlibtrunc01-chunksize.sh` performs bit truncation followed by lossless compression with zlib. 

We set zlib's compression level to `5`, and choose to truncate `8` mantissa bits. 

The results are averaged over `3` iterations.

# Load data

In [2]:
results_file = "zlibtrunc-01-chunksize-results/results.jsonl"
raw = pd.read_json(results_file, lines=True)
df = pd.json_normalize(raw.to_dict(orient="records"), sep=".")
df.head()

Unnamed: 0,config.branches,config.chunk_size,config.compressor,config.compressor_config.compressionLevel,config.compressor_config.truncBits,config.decomp_file,config.input_file,config.iterations,config.results_file,config.tree,...,results.earth_mover_distance,results.jensen_shannon_divergence,results.ks_statistic,results.mse,results.original_size_bytes,results.psnr,results.rel_error_avg,results.rel_error_max,system.host,system.timestamp
0,AnalysisJetsAuxDyn.pt,1024,zlib-trunc,5,8,,data.root,10,zlibtrunc-01-chunksize.sh.20260128_114959/resu...,CollectionTree,...,0.254886,2.900851e-10,4.2e-05,0.1282544,10635492,129.934479,1.1e-05,3e-05,Niamh,2026-01-28 11:50:16
1,AnalysisJetsAuxDyn.eta,1024,zlib-trunc,5,8,,data.root,10,zlibtrunc-01-chunksize.sh.20260128_114959/resu...,CollectionTree,...,3.4e-05,6.210986e-08,5.2e-05,2.068024e-09,10635492,106.514572,1.2e-05,3e-05,Niamh,2026-01-28 11:50:30
2,AnalysisJetsAuxDyn.phi,1024,zlib-trunc,5,8,,data.root,10,zlibtrunc-01-chunksize.sh.20260128_114959/resu...,CollectionTree,...,1.8e-05,2.656628e-08,2e-05,5.660784e-10,10635492,108.434822,1.1e-05,3e-05,Niamh,2026-01-28 11:50:46
3,AnalysisElectronsAuxDyn.pt,1024,zlib-trunc,5,8,,data.root,10,zlibtrunc-01-chunksize.sh.20260128_114959/resu...,CollectionTree,...,0.071289,0.0,0.000288,0.01569665,55500,130.508209,1.1e-05,3e-05,Niamh,2026-01-28 11:50:47
4,AnalysisElectronsAuxDyn.eta,1024,zlib-trunc,5,8,,data.root,10,zlibtrunc-01-chunksize.sh.20260128_114959/resu...,CollectionTree,...,1.4e-05,1.311613e-06,0.000216,3.720873e-10,55500,108.259956,1.1e-05,3e-05,Niamh,2026-01-28 11:50:47


In [3]:
kb_count = df['config.chunk_size'] / 1024
df['chunk_size_kb'] = kb_count.astype(int).astype(str) + ' KB'
df

Unnamed: 0,config.branches,config.chunk_size,config.compressor,config.compressor_config.compressionLevel,config.compressor_config.truncBits,config.decomp_file,config.input_file,config.iterations,config.results_file,config.tree,...,results.jensen_shannon_divergence,results.ks_statistic,results.mse,results.original_size_bytes,results.psnr,results.rel_error_avg,results.rel_error_max,system.host,system.timestamp,chunk_size_kb
0,AnalysisJetsAuxDyn.pt,1024,zlib-trunc,5,8,,data.root,10,zlibtrunc-01-chunksize.sh.20260128_114959/resu...,CollectionTree,...,2.900851e-10,0.000042,1.282544e-01,10635492,129.934479,0.000011,0.00003,Niamh,2026-01-28 11:50:16,1 KB
1,AnalysisJetsAuxDyn.eta,1024,zlib-trunc,5,8,,data.root,10,zlibtrunc-01-chunksize.sh.20260128_114959/resu...,CollectionTree,...,6.210986e-08,0.000052,2.068024e-09,10635492,106.514572,0.000012,0.00003,Niamh,2026-01-28 11:50:30,1 KB
2,AnalysisJetsAuxDyn.phi,1024,zlib-trunc,5,8,,data.root,10,zlibtrunc-01-chunksize.sh.20260128_114959/resu...,CollectionTree,...,2.656628e-08,0.000020,5.660784e-10,10635492,108.434822,0.000011,0.00003,Niamh,2026-01-28 11:50:46,1 KB
3,AnalysisElectronsAuxDyn.pt,1024,zlib-trunc,5,8,,data.root,10,zlibtrunc-01-chunksize.sh.20260128_114959/resu...,CollectionTree,...,0.000000e+00,0.000288,1.569665e-02,55500,130.508209,0.000011,0.00003,Niamh,2026-01-28 11:50:47,1 KB
4,AnalysisElectronsAuxDyn.eta,1024,zlib-trunc,5,8,,data.root,10,zlibtrunc-01-chunksize.sh.20260128_114959/resu...,CollectionTree,...,1.311613e-06,0.000216,3.720873e-10,55500,108.259956,0.000011,0.00003,Niamh,2026-01-28 11:50:47,1 KB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,AnalysisJetsAuxDyn.eta,1048576,zlib-trunc,5,8,,data.root,10,zlibtrunc-01-chunksize.sh.20260128_114959/resu...,CollectionTree,...,6.210986e-08,0.000052,2.068024e-09,10635492,106.514572,0.000012,0.00003,Niamh,2026-01-28 11:58:24,1024 KB
62,AnalysisJetsAuxDyn.phi,1048576,zlib-trunc,5,8,,data.root,10,zlibtrunc-01-chunksize.sh.20260128_114959/resu...,CollectionTree,...,2.656628e-08,0.000020,5.660784e-10,10635492,108.434822,0.000011,0.00003,Niamh,2026-01-28 11:58:41,1024 KB
63,AnalysisElectronsAuxDyn.pt,1048576,zlib-trunc,5,8,,data.root,10,zlibtrunc-01-chunksize.sh.20260128_114959/resu...,CollectionTree,...,0.000000e+00,0.000288,1.569665e-02,55500,130.508209,0.000011,0.00003,Niamh,2026-01-28 11:58:41,1024 KB
64,AnalysisElectronsAuxDyn.eta,1048576,zlib-trunc,5,8,,data.root,10,zlibtrunc-01-chunksize.sh.20260128_114959/resu...,CollectionTree,...,1.311613e-06,0.000216,3.720873e-10,55500,108.259956,0.000011,0.00003,Niamh,2026-01-28 11:58:41,1024 KB


# Visualization

In [4]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Assuming df is already loaded

# Color mapping: group by variable type (.pt, .eta, .phi) with shades within each family
branches = df['config.branches'].unique()

# Color families for each variable type
color_families = {
    # Orange family (from #FFA15A)
    '.pt': ['#FFA15A', '#CC7A3D', '#FFB87D', '#994F1A'],
    # Purple family (from #AB63FA)
    # 'purple': ['#AB63FA', '#8344CC', '#C490FF', '#5C2299'],
    # Pink family (from #FF6692)
    # 'pink': ['#FF6692', '#CC4570', '#FF94B3', '#992244'],
    # Lime/yellow-green family (from #B6E880)
    '.eta': ['#B6E880', '#92C45C', '#CEFF9F', '#6B9933'],
    # Cyan family (from #19D3F3)
    '.phi': ['#19D3F3', '#0FA6C2', '#5DE3FF', '#067891'],
}

# Count how many branches per family to assign shades
family_counts = {key: 0 for key in color_families}
colors = {}
other_idx = 0

for branch in branches:
    assigned = False
    for suffix, palette in color_families.items():
        if suffix in branch:
            idx = family_counts[suffix]
            colors[branch] = palette[idx % len(palette)]
            family_counts[suffix] += 1
            assigned = True
            break
    # if not assigned:
    #     colors[branch] = fallback_colors[other_idx % len(fallback_colors)]
    #     other_idx += 1

# Ensure truncBits is numeric and get tick values
df['config.compressor_config.truncBits'] = pd.to_numeric(df['config.compressor_config.truncBits'])
tickvals = sorted(df['config.compressor_config.truncBits'].unique())

# =============================================================================
# Figure 1: System Metrics (ratio, compression/decompression throughput)
# =============================================================================
fig_system = make_subplots(
    rows=1, cols=3,
    subplot_titles=(
        'Compression Ratio',
        'Compression Throughput (MB/s)',
        'Decompression Throughput (MB/s)'
    ),
    horizontal_spacing=0.08
)

for branch in branches:
    branch_data = df[df['config.branches'] == branch].sort_values('config.compressor_config.truncBits')
    color = colors[branch]
    
    fig_system.add_trace(go.Scatter(
        x=branch_data['config.chunk_size'], 
        y=branch_data['results.compression_ratio'],
        mode='lines+markers',
        name=branch,
        legendgroup=branch,
        line=dict(color=color),
        marker=dict(color=color)
    ), row=1, col=1)
    
    fig_system.add_trace(go.Scatter(
        x=branch_data['config.chunk_size'], 
        y=branch_data['results.compression_throughput_mbps'],
        mode='lines+markers',
        name=branch,
        legendgroup=branch,
        showlegend=False,
        line=dict(color=color),
        marker=dict(color=color)
    ), row=1, col=2)
    
    fig_system.add_trace(go.Scatter(
        x=branch_data['config.chunk_size'], 
        y=branch_data['results.decompression_throughput_mbps'],
        mode='lines+markers',
        name=branch,
        legendgroup=branch,
        showlegend=False,
        line=dict(color=color),
        marker=dict(color=color)
    ), row=1, col=3)


# =============================================================================
# Figure 2: Quality Metrics (PSNR, K-S, EMD, JS divergence)
# =============================================================================
fig_quality = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        'PSNR (dB)',
        'K-S Statistic',
        'Earth Mover\'s Distance',
        'Jensen-Shannon Divergence'
    ),
    horizontal_spacing=0.1,
    vertical_spacing=0.15
)

for branch in branches:
    branch_data = df[df['config.branches'] == branch].sort_values('config.compressor_config.truncBits')
    color = colors[branch]
    
    fig_quality.add_trace(go.Scatter(
        x=branch_data['config.chunk_size'], 
        y=branch_data['results.psnr'],
        mode='lines+markers',
        name=branch,
        legendgroup=branch,
        line=dict(color=color),
        marker=dict(color=color)
    ), row=1, col=1)
    
    fig_quality.add_trace(go.Scatter(
        x=branch_data['config.chunk_size'], 
        y=branch_data['results.ks_statistic'],
        mode='lines+markers',
        name=branch,
        legendgroup=branch,
        showlegend=False,
        line=dict(color=color),
        marker=dict(color=color)
    ), row=1, col=2)
    
    fig_quality.add_trace(go.Scatter(
        x=branch_data['config.chunk_size'], 
        y=branch_data['results.earth_mover_distance'],
        mode='lines+markers',
        name=branch,
        legendgroup=branch,
        showlegend=False,
        line=dict(color=color),
        marker=dict(color=color)
    ), row=2, col=1)
    
    fig_quality.add_trace(go.Scatter(
        x=branch_data['config.chunk_size'], 
        y=branch_data['results.jensen_shannon_divergence'],
        mode='lines+markers',
        name=branch,
        legendgroup=branch,
        showlegend=False,
        line=dict(color=color),
        marker=dict(color=color)
    ), row=2, col=2)

# Update layouts and display

fig_system.update_xaxes(tickvals=tickvals, ticktext=tickvals, title_text='Chunk Size (KB)')
fig_system.update_layout(
    height=400,
    width=1600,
    title_text="Truncation+ZLib: Effect of chunk size on performance metrics",
)

fig_quality.update_xaxes(tickvals=tickvals, ticktext=tickvals, title_text='Chunk Size (KB)')
fig_quality.update_yaxes(type='log', row=1, col=2)  # K-S on log scale
fig_quality.update_yaxes(type='log', row=2, col=1)  # EMD on log scale
fig_quality.update_yaxes(type='log', row=2, col=2)  # JS Divergence on log scale
fig_quality.update_layout(
    height=700,
    width=1200,
    title_text="Truncation+ZLib: Effect of chunk size on quality metrics",
)

# Show both
fig_system.show()
fig_quality.show()

# Optionally save
# fig_system.write_html('truncation_system_metrics.html')
# fig_quality.write_html('truncation_quality_metrics.html')