# Datasets
Author: Javier Duarte


## Load datasets from `ROOT` files using `uproot`
Here we load the `ROOT` datasets in Python using `uproot`.

In [None]:
import uproot

In [None]:
# Evaluate the version string of the imported uproot package.
uproot.__version__

## Load `ROOT` files
Here we load the `ROOT` datasets into `NumPy` arrays (possibly with jagged structure). See: https://github.com/scikit-hep/uproot

In [None]:
import numpy as np
import h5py

# Name of the TTree to read; the same tree name is used for every sample.
treename = 'HZZ4LeptonsAnalysisReduced'

# Input ROOT file for each sample label.
filename = {
    'bkg': 'data/ntuple_4mu_bkg.root',
    'VV': 'data/ntuple_4mu_VV.root',
}

# Open each tree using uproot's "path:object" addressing.
upfile = {
    sample: uproot.open(f'{path}:{treename}')
    for sample, path in filename.items()
}

# Read the branches of each tree into memory. The result is indexed by entry
# and exposes `.fields` below, i.e. it behaves as an array of records.
params = {sample: tree.arrays() for sample, tree in upfile.items()}

# show the container type, then the field names of the first entry
print(type(params['bkg']))
print(params['bkg'][0].fields)

# shape of a single branch (left disabled)
# print(params['bkg']['f_mass4l'].shape)

# f_mass4l of the first entry
print(params['bkg']['f_mass4l'][0])

# f_massjj of the first entry
print(params['bkg']['f_massjj'][0])

## Convert `NumPy` arrays to `pandas` DataFrames
In my opinion, `pandas` DataFrames are a more convenient/flexible data container in python: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html. 
So we'll use this instead of structured `NumPy` arrays.

In [None]:
import pandas as pd
import awkward as ak


# Awkward Array 2.x renamed `ak.to_pandas` to `ak.to_dataframe` and removed
# the old name. Use the new function when it exists and fall back to the old
# one so this cell runs on both 1.x and 2.x installations.
_to_dataframe = getattr(ak, 'to_dataframe', None) or ak.to_pandas

# One DataFrame per sample, built from the arrays loaded in `params`.
df = {sample: _to_dataframe(params[sample]) for sample in ('bkg', 'VV')}

# print first entry
print(df['bkg'].iloc[:1])

# print shape of DataFrame
print(df['bkg'].shape)

# print first entry for f_mass4l and f_massjj
print(df['bkg'][['f_mass4l', 'f_massjj']].iloc[:1])

# convert back into an unstructured NumPy array
print(df['bkg'].values)
print(df['bkg'].values.shape)

# get boolean array (computed once and reused for the selection below)
mass_cut = df['bkg']['f_mass4l'] > 125
print(mass_cut)

# cut using this boolean array
print(df['bkg']['f_mass4l'][mass_cut])