In [1]:
# import uproot
import uproot
from glob import glob
import json 
import os
from tqdm import tqdm
import numpy as np
import pandas as pd

In [2]:
# Dataset directories; both strings keep their trailing "/".
pi0_dir = '/clusterfs/ml4hep/mpettee/ml4pions/data/user.angerami.mc16_13TeV.900246.PG_singlepi0_logE0p2to2000.e8312_e7400_s3170_r12383.v01-45-gaa27bcb_OutputStream/'
pion_dir = '/clusterfs/ml4hep/mpettee/ml4pions/data/user.angerami.mc16_13TeV.900247.PG_singlepion_logE0p2to2000.e8312_e7400_s3170_r12383.v01-45-gaa27bcb_OutputStream/'

# Every *.root file directly inside each directory, in lexicographic order.
pi0_files = glob(os.path.join(pi0_dir, "*.root"))
pi0_files.sort()
pion_files = glob(os.path.join(pion_dir, "*.root"))
pion_files.sort()

In [3]:
# Reduced set of EventTree branches to read, grouped by branch-name prefix.
# The concatenation order below is the order of the resulting list.
_cell_branches = ['cluster_cell_E', 'cluster_cell_ID']
_track_branches = [
    'trackPt', 'trackD0', 'trackZ0',
    'trackEta_EMB2', 'trackPhi_EMB2',
    'trackEta', 'trackPhi',
]
_event_branches = ['nCluster', 'nTrack', 'truthPartE']
_cluster_branches = [
    'cluster_ENG_CALIB_TOT', 'cluster_E', 'cluster_Eta', 'cluster_Phi',
    'cluster_EM_PROBABILITY', 'cluster_E_LCCalib', 'cluster_HAD_WEIGHT',
]
variables = [
    *_cell_branches,
    *_track_branches,
    *_event_branches,
    *_cluster_branches,
]

Open the first pi0 file as a quick check and list the branches available in its `EventTree`.

In [4]:
# Read every branch of the first pi0 file into a dict of NumPy arrays.
# The context manager closes the ROOT file once the arrays are in memory,
# instead of leaving the handle open for the rest of the kernel session.
with uproot.open(pi0_files[0]) as root_file:
    a = root_file["EventTree"].arrays(library = "np")
# Last expression of the cell: display the available branch names.
a.keys()

dict_keys(['runNumber', 'eventNumber', 'lumiBlock', 'coreFlags', 'mcEventNumber', 'mcChannelNumber', 'mcEventWeight', 'nTruthPart', 'G4PreCalo_n_EM', 'G4PreCalo_E_EM', 'G4PreCalo_n_Had', 'G4PreCalo_E_Had', 'truthVertexX', 'truthVertexY', 'truthVertexZ', 'truthPartPdgId', 'truthPartStatus', 'truthPartBarcode', 'truthPartPt', 'truthPartE', 'truthPartMass', 'truthPartEta', 'truthPartPhi', 'nTrack', 'trackPt', 'trackP', 'trackMass', 'trackEta', 'trackPhi', 'trackNumberOfPixelHits', 'trackNumberOfSCTHits', 'trackNumberOfPixelDeadSensors', 'trackNumberOfSCTDeadSensors', 'trackNumberOfPixelSharedHits', 'trackNumberOfSCTSharedHits', 'trackNumberOfPixelHoles', 'trackNumberOfSCTHoles', 'trackNumberOfInnermostPixelLayerHits', 'trackNumberOfNextToInnermostPixelLayerHits', 'trackExpectInnermostPixelLayerHit', 'trackExpectNextToInnermostPixelLayerHit', 'trackNumberOfTRTHits', 'trackNumberOfTRTOutliers', 'trackChiSquared', 'trackNumberDOF', 'trackD0', 'trackZ0', 'trackEta_PreSamplerB', 'trackPhi_PreS

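Now define the cuts: keep events with exactly one track and at least one cluster within ΔR < 0.2 of the track's `trackEta_EMB2` / `trackPhi_EMB2` position. The cuts are applied to every file in the cells below.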

In [5]:
def apply_cuts(arrays, dR_max=0.2):
    """Select single-track events that have at least one cluster near the track.

    An event is kept when ``nTrack == 1`` and at least one of its clusters lies
    within ``dR_max`` of the track, where the distance is
    ``sqrt(d_eta**2 + d_phi**2)`` computed from ``cluster_Eta`` / ``cluster_Phi``
    and ``trackEta_EMB2`` / ``trackPhi_EMB2``. Individual clusters are not
    removed here; the per-cluster decision is stored in ``dR_pass`` so it can be
    applied at the training stage.

    Parameters
    ----------
    arrays : mapping of str -> array-like
        Per-event columns, one entry per event. Must contain ``nTrack``,
        ``cluster_Eta``, ``cluster_Phi``, ``trackEta_EMB2`` and
        ``trackPhi_EMB2``; the last four hold one array per event.
    dR_max : float, optional
        A cluster passes when its distance to the track is strictly below this
        value. Defaults to 0.2.

    Returns
    -------
    pandas.DataFrame
        The surviving events with every input column plus:

        * ``index`` and ``event_number`` -- the event's row position in
          ``arrays`` before any cut,
        * ``dR_pass`` -- one boolean array per event with one flag per cluster.

    Notes
    -----
    The azimuthal difference is wrapped into [-pi, pi) before it enters the
    distance, so a cluster and a track sitting on opposite sides of the
    phi = +/-pi seam are treated as close rather than ~2*pi apart.

    A stricter single-track, single-cluster selection would additionally
    require ``nCluster == 1`` and ``cluster_E > 0.5``.
    """
    df = pd.DataFrame(arrays)

    ### Single-track, multi-cluster
    # .copy() gives an independent frame, so the column assignments below do not
    # write into a slice of the unfiltered frame (SettingWithCopyWarning).
    df = df[df.nTrack == 1].copy()

    dR_pass = []
    for cluster_eta, cluster_phi, track_eta, track_phi in zip(
            df['cluster_Eta'], df['cluster_Phi'],
            df['trackEta_EMB2'], df['trackPhi_EMB2']):
        # With nTrack == 1 the track arrays hold a single value, which broadcasts
        # against the per-cluster arrays.
        d_eta = np.asarray(cluster_eta, dtype='float') - np.asarray(track_eta, dtype='float')
        d_phi = np.asarray(cluster_phi, dtype='float') - np.asarray(track_phi, dtype='float')
        # Wrap the angular difference into [-pi, pi).
        d_phi = (d_phi + np.pi) % (2 * np.pi) - np.pi
        dR_pass.append(np.sqrt(d_eta**2 + d_phi**2) < dR_max)
    df["dR_pass"] = dR_pass

    # Remember the original event position, then renumber rows 0..n-1.
    # reset_index() also keeps the old index as an "index" column.
    df["event_number"] = df.index
    df = df.reset_index()

    # Kill all events with no clusters passing the delta R cut; individual
    # clusters are dealt with at the training stage.
    has_match = np.array([np.any(mask) for mask in df["dR_pass"]], dtype=bool)
    return df.iloc[np.flatnonzero(has_match)]

In [8]:
# Run the selection on every pi0 ROOT file and save the surviving events as
# <number>.npy under <two directories above the file>/onetrack_multicluster/pi0_files.
for file in tqdm(pi0_files):
    # Path components with the file name and its parent directory dropped.
    prefix = file.split("/")[:-2]
    # Text after the last "000" in the path, minus the 5-character ".root" suffix.
    number = file.split("000")[-1][:-5]
    folder = os.path.join("/".join(prefix), "onetrack_multicluster", "pi0_files")
    os.makedirs(folder, exist_ok=True)
    npy_filename = os.path.join(folder, str(number)+".npy")
    # The context manager closes each ROOT file as soon as its arrays are read,
    # so file handles do not pile up over the loop.
    with uproot.open(file) as root_file:
        a = root_file["EventTree"].arrays(variables, library = "np")
    df = apply_cuts(a)
    a_cuts = df.to_dict('list')
    # a_cuts is a dict, so np.save stores it as a pickled object;
    # read it back with np.load(..., allow_pickle=True).item().
    np.save(npy_filename, a_cuts)

100%|██████████| 500/500 [17:36<00:00,  2.11s/it]


In [9]:
# Run the selection on every charged-pion ROOT file and save the surviving events as
# <number>.npy under <two directories above the file>/onetrack_multicluster/pion_files.
for file in tqdm(pion_files):
    # Path components with the file name and its parent directory dropped.
    prefix = file.split("/")[:-2]
    # Text after the last "000" in the path, minus the 5-character ".root" suffix.
    number = file.split("000")[-1][:-5]
    folder = os.path.join("/".join(prefix), "onetrack_multicluster", "pion_files")
    os.makedirs(folder, exist_ok=True)
    npy_filename = os.path.join(folder, str(number)+".npy")
    # The context manager closes each ROOT file as soon as its arrays are read,
    # so file handles do not pile up over the loop.
    with uproot.open(file) as root_file:
        a = root_file["EventTree"].arrays(variables, library = "np")
    df = apply_cuts(a)
    a_cuts = df.to_dict('list')
    # a_cuts is a dict, so np.save stores it as a pickled object;
    # read it back with np.load(..., allow_pickle=True).item().
    np.save(npy_filename, a_cuts)

100%|██████████| 500/500 [29:50<00:00,  3.58s/it]
