In [1]:
%load_ext autoreload
%autoreload 2

import sys
from pathlib import Path

# Make the project root (the parent of the current working directory)
# importable so that the `src` package used below resolves.
proj_root = Path("..").resolve()
proj_root_entry = str(proj_root)
# Re-running this cell used to stack one more copy of the root onto sys.path
# each time. Drop any earlier copies first so the root ends up at the front
# exactly once, however often the cell is executed.
while proj_root_entry in sys.path:
    sys.path.remove(proj_root_entry)
sys.path.insert(0, proj_root_entry)

# Now imports work
from src.ttree_exploration import *
from src.branch_exploration import *

In [2]:
import uproot
import awkward as ak
import numpy as np
import pandas as pd

from plotly import graph_objects as go
import math

We load statistics about each branch:
- Metadata:
    - Compressed bytes
    - Uncompressed bytes
    - Compression ratio
- Data:
    - Min
    - Max
    - Mean
    - Standard deviation
    - Dynamic range = Max - Min

In [3]:
# ROOT file whose branches are analysed in the cells below.
filepath = "~/data/DAOD_PHYSLITE.37019878._000009.pool.root.1"

# The per-branch statistics table is read from this CSV cache.
branch_stats_csv = Path('branch_stats_DAOD_PHYSLITE.37019878._000009.csv')

# Regenerate the cache only when it is missing, so the notebook still runs
# from a fresh checkout instead of failing in read_csv. When the CSV already
# exists this cell behaves exactly as before.
# NOTE(review): assumes create_branch_df writes its table to `outfile` in a
# form pd.read_csv can load back -- confirm against src.
if not branch_stats_csv.exists():
    create_branch_df(
        filepath=filepath,
        outfile=str(branch_stats_csv),
        tree="CollectionTree;1",
        filters=["MET", "AntiKt"]
    )

# Always load through read_csv so downstream cells see the same frame whether
# or not the cache had to be rebuilt.
branch_df = pd.read_csv(branch_stats_csv)
branch_df

Unnamed: 0,branch,container,dtype_category,interpretation,compressed_bytes,uncompressed_bytes,compression_ratio,file,mean,std,min,max,dynamic_range
0,AnalysisJetsAuxDyn.pt,AnalysisJetsAuxDyn,vector<float>,"AsJagged(AsDtype('>f4'), header_bytes=10)",11735110,16408174,1.398212,DAOD_PHYSLITE.37019878._000009.pool.root.1,23357.122056,15108.994820,7916.893600,1.131901e+06,1.123984e+06
1,AnalysisJetsAuxDyn.eta,AnalysisJetsAuxDyn,vector<float>,"AsJagged(AsDtype('>f4'), header_bytes=10)",12173536,16409785,1.347988,DAOD_PHYSLITE.37019878._000009.pool.root.1,-0.076793,3.190549,-4.815889,4.811450e+00,9.627339e+00
2,AnalysisJetsAuxDyn.phi,AnalysisJetsAuxDyn,vector<float>,"AsJagged(AsDtype('>f4'), header_bytes=10)",12342705,16409785,1.329513,DAOD_PHYSLITE.37019878._000009.pool.root.1,-0.053708,1.816104,-3.141591,3.141590e+00,6.283180e+00
3,AnalysisJetsAuxDyn.m,AnalysisJetsAuxDyn,vector<float>,"AsJagged(AsDtype('>f4'), header_bytes=10)",11798895,16406563,1.390517,DAOD_PHYSLITE.37019878._000009.pool.root.1,2808.997077,2469.089778,-0.111024,8.162691e+04,8.162702e+04
4,AnalysisJetsAuxDyn.JetConstitScaleMomentum_pt,AnalysisJetsAuxDyn,vector<float>,"AsJagged(AsDtype('>f4'), header_bytes=10)",11936248,16446968,1.377901,DAOD_PHYSLITE.37019878._000009.pool.root.1,13859.840516,13235.782782,5000.003000,9.714796e+05,9.664796e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,AnalysisSiHitElectronsAuxDyn.topoetcone20_Clos...,AnalysisSiHitElectronsAuxDyn,vector<float>,"AsJagged(AsDtype('>f4'), header_bytes=10)",1211934,5848206,4.825515,DAOD_PHYSLITE.37019878._000009.pool.root.1,10729.059015,17620.476622,-7054.883300,3.401529e+05,3.472078e+05
189,AnalysisSiHitElectronsAuxDyn.z0stheta,AnalysisSiHitElectronsAuxDyn,vector<float>,"AsJagged(AsDtype('>f4'), header_bytes=10)",1186799,5822494,4.906049,DAOD_PHYSLITE.37019878._000009.pool.root.1,-0.070634,20.896347,-190.786760,1.639398e+02,3.547266e+02
190,AnalysisSiHitElectronsAuxDyn.d0Normalized,AnalysisSiHitElectronsAuxDyn,vector<float>,"AsJagged(AsDtype('>f4'), header_bytes=10)",1190663,5828922,4.895526,DAOD_PHYSLITE.37019878._000009.pool.root.1,2.527040,7.388036,0.000118,2.045937e+02,2.045935e+02
191,AnalysisSiHitElectronsAuxDyn.clEta,AnalysisSiHitElectronsAuxDyn,vector<float>,"AsJagged(AsDtype('>f4'), header_bytes=10)",1181343,5817673,4.924626,DAOD_PHYSLITE.37019878._000009.pool.root.1,0.023310,1.324875,-2.466279,2.469400e+00,4.935679e+00


In [4]:
# Pearson (linear) correlation of every numeric branch statistic with the
# compression ratio, listed from most positive to most negative.
numeric_branch_df = branch_df.select_dtypes(include=np.number)

pearson_corr = numeric_branch_df.corr(method="pearson")
pearson_corr.loc[:, "compression_ratio"].sort_values(ascending=False)

compression_ratio     1.000000
mean                  0.040395
std                   0.040392
dynamic_range         0.040391
max                   0.040391
min                  -0.085010
uncompressed_bytes   -0.729200
compressed_bytes     -0.831760
Name: compression_ratio, dtype: float64

In [5]:
# Spearman (rank) correlation over the same numeric columns, again listed
# from most positive to most negative against the compression ratio.
spearman_corr = numeric_branch_df.corr(method="spearman")
spearman_corr.loc[:, "compression_ratio"].sort_values(ascending=False)

compression_ratio     1.000000
mean                  0.209723
min                   0.157404
std                   0.129608
max                   0.121282
dynamic_range         0.087631
uncompressed_bytes   -0.831784
compressed_bytes     -0.958982
Name: compression_ratio, dtype: float64

In [6]:
# Load the contents of the AnalysisJetsAuxDyn.pt branch from CollectionTree;1
# and display what comes back.
pt_data = load_branch_data(
    tree="CollectionTree;1",
    branch="AnalysisJetsAuxDyn.pt",
    filepath=filepath,
)

pt_data

In [7]:
# NOTE(review): this whole cell is a disabled draft -- nothing in it executes.
# The draft histograms the flattened, finite values of a branch and returns
# the base-2 Shannon entropy of the resulting bin frequencies (NaN when no
# finite values remain). Either restore it as a tested helper or delete it.
# def value_entropy(branch_data, bins=256):
#     flat_data = ak.flatten(branch_data)
#     flat_data = flat_data[np.isfinite(flat_data)]  # Remove NaN and Inf values
    
#     if len(flat_data) == 0:
#         return np.nan
    
#     counts, _ = np.histogram(flat_data, bins=bins, range=(ak.min(flat_data), ak.max(flat_data)))
#     p = counts / ak.sum(counts)
#     p = p[p > 0]
    
#     return -np.sum(p * np.log2(p))
    
# value_entropy(pt_data, bins=len(ak.flatten(pt_data)))

In [8]:
import plotly.io as pio

# Colour argument for plot_branch_histogram in the export loop below. The loop
# itself is disabled, so this cell currently only defines the colour.
color = "steelblue"
# for branch_name in branch_df['branch']:
#     fig = plot_branch_histogram(load_branch_data(filepath, branch_name), branch_name, color)
#     pio.write_image(fig, f"branch_histograms/{branch_name.replace('/', '_')}.svg")

In [16]:
# Rank branches from best to worst compressed. reduction_pct is the share of
# the uncompressed size that compression saves: a ratio r gives (1 - 1/r) * 100.
# The column is added to branch_df itself, so it stays available afterwards.
branch_df['reduction_pct'] = 100 * (1 - 1 / branch_df['compression_ratio'])
(
    branch_df
    .loc[:, ['branch', 'compression_ratio', 'reduction_pct']]
    .sort_values(by='compression_ratio', ascending=False)
)

Unnamed: 0,branch,compression_ratio,reduction_pct
21,EventInfoAuxDyn.mcEventWeights,6.406930,84.391900
72,InDetTrackParticlesAuxDyn.vz,5.609422,82.172853
185,AnalysisSiHitElectronsAuxDyn.m,5.051693,80.204655
186,AnalysisSiHitElectronsAuxDyn.charge,5.023068,80.091850
79,egammaClustersAuxDyn.calM,4.974905,79.899113
...,...,...,...
7,AnalysisJetsAuxDyn.JetConstitScaleMomentum_m,1.344569,25.626718
2,AnalysisJetsAuxDyn.phi,1.329513,24.784481
10,AnalysisJetsAuxDyn.ActiveArea4vec_phi,1.328662,24.736298
6,AnalysisJetsAuxDyn.JetConstitScaleMomentum_phi,1.328488,24.726438
