In [1]:
%load_ext autoreload
%autoreload 2

import sys
from pathlib import Path

# Add the project root (one level up) to sys.path
proj_root = Path("..").resolve()
sys.path.insert(0, str(proj_root))

# Now imports work
from src.ttree_exploration import *

We open a `.root` file and list the `TTree`s inside.

`CollectionTree` holds the experiment data. It has `791` branches.

In [2]:
# Path of the ROOT file to inspect.
# NOTE(review): hard-coded to a location under the user's home directory; point it at
# wherever your copy of the file lives.
filepath = "~/data/DAOD_PHYSLITE.37019878._000009.pool.root.1"
# list_trees is a project helper (presumably from src.ttree_exploration). Judging by the
# displayed result, it returns a dict keyed by "<tree name>;<cycle>" whose values are the
# TTree objects found in the file.
trees = list_trees(filepath)
# Bare last expression so the notebook displays the mapping.
trees

{'##Params;3': <TTree '##Params' (2 branches) at 0x7d12d5ad0ec0>,
 '##Params;2': <TTree '##Params' (2 branches) at 0x7d12d5c5bc50>,
 '##Shapes;3': <TTree '##Shapes' (2 branches) at 0x7d1349508550>,
 '##Shapes;2': <TTree '##Shapes' (2 branches) at 0x7d12d5aae9e0>,
 '##Links;3': <TTree '##Links' (2 branches) at 0x7d12d5aaefd0>,
 '##Links;2': <TTree '##Links' (2 branches) at 0x7d12d5aaad50>,
 'MetaData;1': <TTree 'MetaData' (26 branches) at 0x7d12d5aeae00>,
 'MetaDataHdr;1': <TTree 'MetaDataHdr' (1 branches) at 0x7d12d5aeb350>,
 'MetaDataHdrForm;1': <TTree 'MetaDataHdrForm' (1 branches) at 0x7d12d511d450>,
 'CollectionTree;1': <TTree 'CollectionTree' (791 branches) at 0x7d12d511d950>,
 'POOLContainer;1': <TTree 'POOLContainer' (2 branches) at 0x7d134950d130>,
 'POOLContainerForm;1': <TTree 'POOLContainerForm' (2 branches) at 0x7d134950d400>,
 'POOLCollectionTree;1': <TTree 'POOLCollectionTree' (13 branches) at 0x7d12d5b57850>}

We collect relevant information from each branch in `CollectionTree`:
- Branch name (Analysis variable)
- Container for analysis variable
- Datatype category of branch
    - Vector: `vector<float>`, `vector<double>`, `vector<int32>`, `vector<int64>`, `vector<uint32>`, `vector<uint8/bool>`, `vector<other>`
    - Fixed-size array: `array<float>`, `array<double>`, `array<other>`
    - Scalar: `float32`, `float64`, `int32`, `int64`, `uint32`, `uint8`
    - `other`
- String interpretation of the branch's datatype
- Compressed bytes: size of the branch when compressed
- Uncompressed bytes: size of the branch when uncompressed
- Compression ratio: uncompressed bytes / compressed bytes (shown as `1.0` when both sizes are `0`, as in the first row of the preview below)

In [3]:
# Index the mapping directly instead of using .get(): if the key is missing this raises a
# KeyError naming "CollectionTree;1" right here, rather than silently passing None on to
# get_branch_info and failing somewhere less obvious.
collection_tree = trees["CollectionTree;1"]
# Build the per-branch table; its columns are the ones shown in the preview.
collection_tree_df = get_branch_info(collection_tree)

# Show only the first rows rather than the whole table.
collection_tree_df.head()

Unnamed: 0,branch,container,dtype_category,interpretation,compressed_bytes,uncompressed_bytes,compression_ratio
0,xTrigDecisionAux.,xTrigDecisionAux,vector<int32>,AsGroup(<TBranchElement 'xTrigDecisionAux.' (2...,0,0,1.0
1,xTrigDecisionAux./xTrigDecisionAux.xAOD::AuxIn...,xTrigDecisionAux,other,none of the rules matched\nin file ~/data/DAOD...,1122726,25704380,22.894615
2,xTrigDecisionAux./xTrigDecisionAux.smk,xTrigDecisionAux,int32,AsDtype('>u4'),497448,1980493,3.981307
3,xTrigDecisionAux./xTrigDecisionAux.bgCode,xTrigDecisionAux,other,AsDtype('int8'),201544,558836,2.772774
4,xTrigDecisionAux./xTrigDecisionAux.tav,xTrigDecisionAux,vector<int32>,"AsJagged(AsDtype('>u4'), header_bytes=10)",2570407,31295028,12.175126


We summarize compression statistics for each datatype in `CollectionTree`.

In [4]:
# Aggregate the per-branch table into a summary object; going by the helper's name and
# the printed report, the grouping is by dtype category.
collection_tree_summary = summarize_by_dtype(collection_tree_df)
# Print the plain-text report (overall totals followed by one section per data type).
print_summary(collection_tree_summary)

OVERALL FILE SUMMARY
Total branches:       868
Compressed size:      1627.76 MB
Uncompressed size:    16076.64 MB
Overall compression:  9.88x

SUMMARY BY DATA TYPE

vector<float>
  Branches:          219
  Compressed:        674.27 MB (41.4% of file)
  Uncompressed:      1772.17 MB (11.0%)
  Compression ratio: 2.63x

other
  Branches:          412
  Compressed:        481.00 MB (29.5% of file)
  Uncompressed:      7817.27 MB (48.6%)
  Compression ratio: 16.25x

float
  Branches:          40
  Compressed:        230.14 MB (14.1% of file)
  Uncompressed:      1050.03 MB (6.5%)
  Compression ratio: 4.56x

vector<uint8/bool>
  Branches:          60
  Compressed:        82.42 MB (5.1% of file)
  Uncompressed:      359.63 MB (2.2%)
  Compression ratio: 4.36x

vector<int32>
  Branches:          66
  Compressed:        76.65 MB (4.7% of file)
  Uncompressed:      4476.41 MB (27.8%)
  Compression ratio: 58.40x

vector<other>
  Branches:          31
  Compressed:        43.14 MB (2.7% of file)
 

# Visualization

In [5]:
# Build the per-dtype figure from the summary computed earlier. filepath is passed to the
# helper as well (NOTE(review): presumably for labelling the plot — confirm in
# src.ttree_exploration).
fig = plot_reduction_by_dtype(collection_tree_summary, filepath)
# Render the figure in the notebook.
fig.show()

In [None]:
# Per-branch counterpart of the per-dtype plot. n is set to the number of rows in the
# branch table (NOTE(review): presumably so that no branch is left out — confirm the
# meaning of `n` and `reduction_pct` in src.ttree_exploration).
fig = plot_reduction_by_branch(
    collection_tree_df,
    filepath,
    reduction_pct=100,
    n=len(collection_tree_df),
)
fig.show()