In [1]:
%load_ext autoreload
%autoreload 2

import sys
from pathlib import Path

# Add the project root (one level up) to sys.path
proj_root = Path("..").resolve()
sys.path.insert(0, str(proj_root))

# Now imports work
from src.ttree_exploration import *

We open a `.root` file and list the `TTree`s inside.

`CollectionTree` holds the experiment data. It has `791` branches.

In [2]:
# filepath, treename = "~/data/DAOD_PHYSLITE.37019878._000009.pool.root.1", "CollectionTree;1"
filepath, treename = "~/data/mc_147771.Zmumu.root", "mini;1"

trees = list_trees(filepath)
trees

{'mini;1': <TTree 'mini' (46 branches) at 0x73b399c5af90>}

We collect relevant information from each branch in `CollectionTree`:
- Branch name (Analysis variable)
- Container for analysis variable
- Datatype category of branch
    - Vector: `vector<float>`, `vector<double>`, `vector<int32>`, `vector<int64>`, `vector<uint32>`, `vector<uint8/bool>`, `vector<other>`
    - Fixed-size array: `array<float>`, `array<double>`, `array<other>`
    - Scalar: `float32`, `float64`, `int32`, `int64`, `uint32`, `uint8`
    - `other`
- String interpretation of the branch's datatype
- Compressed bytes --> Branch size when compressed
- Uncompressed bytes --> Branch size when uncompressed
- Compression ratio --> Uncompressed bytes / Compressed bytes

In [3]:
tree = trees.get(treename)
tree_df = get_branch_info(tree)

tree_df

Unnamed: 0,branch,container,dtype_category,interpretation,compressed_bytes,uncompressed_bytes,compression_ratio
0,runNumber,runNumber,int32,AsDtype('>i4'),152145,30004484,197.209793
1,eventNumber,eventNumber,int32,AsDtype('>i4'),17156309,30004602,1.748896
2,channelNumber,channelNumber,int32,AsDtype('>i4'),152440,30004720,196.829703
3,mcWeight,mcWeight,float,AsDtype('>f4'),17138502,30004425,1.750703
4,pvxp_n,pvxp_n,int32,AsDtype('>i4'),6831366,30004307,4.392139
5,vxp_z,vxp_z,float,AsDtype('>f4'),27873331,30004248,1.07645
6,scaleFactor_PILEUP,scaleFactor_PILEUP,float,AsDtype('>f4'),2222763,30005015,13.498972
7,scaleFactor_ELE,scaleFactor_ELE,float,AsDtype('>f4'),495349,30004838,60.573127
8,scaleFactor_MUON,scaleFactor_MUON,float,AsDtype('>f4'),21265620,30004897,1.410958
9,scaleFactor_BTAG,scaleFactor_BTAG,float,AsDtype('>f4'),11293602,30004897,2.656805


We summarize compression statistics for each datatype in `CollectionTree`.

In [4]:
tree_summary = summarize_by_dtype(tree_df)
print_summary(tree_summary)

OVERALL FILE SUMMARY
Total branches:       46
Compressed size:      946.23 MB
Uncompressed size:    2130.45 MB
Overall compression:  2.25x

SUMMARY BY DATA TYPE

vector<float>
  Branches:          18
  Compressed:        664.19 MB (70.2% of file)
  Uncompressed:      1201.98 MB (56.4%)
  Compression ratio: 1.81x

float
  Branches:          11
  Compressed:        168.40 MB (17.8% of file)
  Uncompressed:      330.05 MB (15.5%)
  Compression ratio: 1.96x

vector<int32>
  Branches:          4
  Compressed:        48.81 MB (5.2% of file)
  Uncompressed:      259.99 MB (12.2%)
  Compression ratio: 5.33x

int32
  Branches:          7
  Compressed:        35.24 MB (3.7% of file)
  Uncompressed:      210.03 MB (9.9%)
  Compression ratio: 5.96x

vector<other>
  Branches:          1
  Compressed:        16.34 MB (1.7% of file)
  Uncompressed:      55.51 MB (2.6%)
  Compression ratio: 3.40x

vector<uint8/bool>
  Branches:          1
  Compressed:        12.48 MB (1.3% of file)
  Uncompressed:   

# Visualization

In [5]:
fig = plot_reduction_by_dtype(tree_summary, filepath)
fig.show()

In [6]:
fig = plot_reduction_by_branch(tree_summary, filepath, reduction_pct=100, n=len(tree_df))
fig.show()

KeyError: 'dtype_category'