In [144]:
import uproot
from glob import glob
import json 
import os
from tqdm import tqdm
import numpy as np
import pandas as pd

In [145]:
# Dataset directories (the names suggest single-pi0 and single-pion
# particle-gun samples -- confirm against the production request).
pi0_dir = '/clusterfs/ml4hep/mpettee/ml4pions/data/user.angerami.mc16_13TeV.900246.PG_singlepi0_logE0p2to2000.e8312_e7400_s3170_r12383.v01-45-gaa27bcb_OutputStream/'
pion_dir = '/clusterfs/ml4hep/mpettee/ml4pions/data/user.angerami.mc16_13TeV.900247.PG_singlepion_logE0p2to2000.e8312_e7400_s3170_r12383.v01-45-gaa27bcb_OutputStream/'

# Every ROOT file in each directory, sorted so that positional indexing
# (e.g. pion_files[11]) refers to the same file from run to run.
pi0_files = sorted(glob(os.path.join(pi0_dir, "*.root")))
pion_files = sorted(glob(os.path.join(pion_dir, "*.root")))

Load a single pion file as a test, reading only the branches needed below.

In [146]:
# Pared-down list of EventTree branches to read; everything else in the
# tree is skipped.
variables = ['cluster_cell_E', 'cluster_cell_ID',
             'trackPt','trackD0','trackZ0',
             'trackEta_EMB2','trackPhi_EMB2',
             'trackEta','trackPhi',
             'nCluster','nTrack','truthPartE', 'truthPartPt',
             'cluster_ENG_CALIB_TOT','cluster_E','cluster_Eta','cluster_Phi',
             'cluster_EM_PROBABILITY','cluster_E_LCCalib','cluster_HAD_WEIGHT']

# Read one arbitrary pion file as a dict of NumPy arrays (library="np").
# The context manager releases the ROOT file handle once the arrays are
# in memory instead of leaving it open for the life of the kernel.
with uproot.open(pion_files[11]) as root_file:
    a = root_file["EventTree"].arrays(variables, library = "np")

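Now define the event selection: exactly one track, at least one cluster, and at least one cluster within ΔR < 0.2 of the track.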

In [147]:
def apply_cuts(arrays, dr_max=0.2, verbose=False):
    """Select single-track events with at least one cluster close to the track.

    Cuts are applied per event, in this order:

    1. ``nTrack == 1``
    2. ``nCluster > 0``
    3. at least one cluster with ``dR < dr_max`` relative to the track, where
       ``dR = sqrt(d_eta**2 + d_phi**2)`` is computed from ``cluster_Eta`` /
       ``cluster_Phi`` and ``trackEta`` / ``trackPhi``.

    Clusters that fail the dR requirement are *not* removed from surviving
    events; the per-cluster flags are stored in ``dR_pass`` so they can be
    handled at the training stage.

    Parameters
    ----------
    arrays : mapping of str -> array-like
        Per-event branches. Must contain ``nTrack``, ``nCluster``,
        ``trackEta``, ``trackPhi``, ``cluster_Eta`` and ``cluster_Phi``; the
        last four hold one array per event.
    dr_max : float, default 0.2
        Upper bound on dR for a cluster to count as matched to the track.
    verbose : bool, default False
        If true, print the cut-flow efficiencies in percent.

    Returns
    -------
    pandas.DataFrame
        The surviving events with every input column plus:

        * ``dR`` -- array of dR values, one per cluster;
        * ``dR_pass`` -- boolean array, ``dR < dr_max`` per cluster;
        * ``event_number`` -- row position of the event *after* cuts 1 and 2
          (not its position in the input file).

        The returned index equals ``event_number``.
    """
    events = pd.DataFrame(arrays)
    n_total = len(events)

    events = events[events["nTrack"] == 1]
    n_one_track = len(events)
    events = events[events["nCluster"] > 0]
    n_with_cluster = len(events)

    # reset_index (non-inplace) returns a fresh frame, so the column
    # assignments below do not write into a slice of the original.
    events = events.reset_index(drop=True)

    dR = []
    dR_pass = []
    track_cluster_columns = zip(events["trackEta"], events["trackPhi"],
                                events["cluster_Eta"], events["cluster_Phi"])
    for trk_eta, trk_phi, cl_eta, cl_phi in track_cluster_columns:
        trk_eta = np.atleast_1d(np.asarray(trk_eta, dtype=float))
        trk_phi = np.atleast_1d(np.asarray(trk_phi, dtype=float))
        cl_eta = np.atleast_1d(np.asarray(cl_eta, dtype=float))
        cl_phi = np.atleast_1d(np.asarray(cl_phi, dtype=float))

        if trk_eta.size != 1 or trk_phi.size != 1:
            # nTrack claims one track but the track arrays disagree. Checking
            # the length explicitly (instead of relying on a broadcasting
            # error) also catches the case where the cluster array happens
            # to have a compatible length and would be matched silently.
            print("Error, track length is actually", trk_eta.size)
            deltaR = np.array(999)  # sentinel: can never pass the dR cut
        elif cl_eta.shape != cl_phi.shape:
            print("Error, cluster eta/phi lengths differ:", cl_eta.size, cl_phi.size)
            deltaR = np.array(999)
        else:
            d_eta = cl_eta - trk_eta[0]
            # phi is taken to be an azimuthal angle in radians, so wrap the
            # difference into [-pi, pi); otherwise a track and a cluster on
            # opposite sides of the +/-pi seam look ~2*pi apart.
            d_phi = (cl_phi - trk_phi[0] + np.pi) % (2 * np.pi) - np.pi
            deltaR = np.sqrt(d_eta ** 2 + d_phi ** 2)

        dR.append(deltaR)
        dR_pass.append(deltaR < dr_max)

    events["dR"] = dR
    events["dR_pass"] = dR_pass
    events["event_number"] = events.index

    # Drop events in which no cluster passes the dR cut; individual clusters
    # are dealt with at the training stage.
    has_match = np.array([bool(np.any(flags)) for flags in dR_pass], dtype=bool)
    selected = events.loc[has_match]

    if verbose:
        n_selected = len(selected)

        def _percent(numerator, denominator):
            return 100 * numerator / denominator if denominator else float("nan")

        print("nTrack cut: {:.2f}".format(_percent(n_one_track, n_total)))
        print("nCluster cut: {:.2f}".format(_percent(n_with_cluster, n_total)))
        print("dR cut: {:.2f}".format(_percent(n_selected, n_one_track)))
        print("overall: {:.2f}".format(_percent(n_selected, n_total)))

    return selected

### Test:

In [132]:
# Raw (uncut) events from the test file as a DataFrame; `df` is reassigned
# by the apply_cuts call in the following cell.
df = pd.DataFrame(a)

In [133]:
# Run the event selection on the arrays read from the single test file.
df = apply_cuts(a)

Error, track length is actually 2
nTrack cut: 60.85
nCluster cut: 52.09
dR cut: 79.71
overall: 48.51


In [None]:
# for row in df.index: 
#     print(row, df.dR_pass[row], df.dR_pass[row].sum())

In [None]:
# x = df.dR.explode()
# np.sum(x < 0.2)/len(x)

In [None]:
# df = pd.DataFrame(a)
# plt.figure(dpi=200)
# # plt.yscale('log')
# plt.hist(df.nTrack, bins=21, histtype="step");
# plt.xlabel("nTrack");

In [None]:
# plt.figure(dpi=200)
# plt.hist(df.dR.explode(), bins=100);
# plt.xlabel(r"$\Delta$R(cluster, track)");

In [None]:
# df[["dR", "dR_pass"]]

### Apply to all files

In [148]:
# Apply the selection to the pi0 sample and write one .npy file per input.
# NOTE(review): only the first 10 pi0 files are processed here, while the
# pion loop runs over every file -- confirm the slice is intentional.
for file in tqdm(pi0_files[:10]):
    # Output goes two levels above the ROOT file (next to the dataset
    # directory): <parent>/onetrack_multicluster/pi0_files/
    prefix = file.split("/")[:-2]
    # File number = text after the last "000" in the path, minus ".root".
    number = file.split("000")[-1][:-5]
    folder = os.path.join("/".join(prefix), "onetrack_multicluster", "pi0_files")
    os.makedirs(folder, exist_ok=True)
    npy_filename = os.path.join(folder, str(number)+".npy")
    # Close each ROOT file as soon as its branches are in memory rather than
    # accumulating open handles over the whole loop.
    with uproot.open(file) as root_file:
        a = root_file["EventTree"].arrays(variables, library = "np")
    df = apply_cuts(a)
    a_cuts = df.to_dict('list')
    # np.save pickles the dict into a 0-d object array; read it back with
    # np.load(path, allow_pickle=True).item().
    np.save(npy_filename, a_cuts)

 60%|██████    | 6/10 [00:14<00:09,  2.34s/it]

Error, track length is actually 2


100%|██████████| 10/10 [00:23<00:00,  2.32s/it]


In [137]:
# Apply the selection to every pion file and write one .npy file per input.
for file in tqdm(pion_files):
    # Output goes two levels above the ROOT file (next to the dataset
    # directory): <parent>/onetrack_multicluster/pion_files/
    prefix = file.split("/")[:-2]
    # File number = text after the last "000" in the path, minus ".root".
    number = file.split("000")[-1][:-5]
    folder = os.path.join("/".join(prefix), "onetrack_multicluster", "pion_files")
    os.makedirs(folder, exist_ok=True)
    npy_filename = os.path.join(folder, str(number)+".npy")
    # Close each ROOT file as soon as its branches are in memory rather than
    # accumulating hundreds of open handles over the loop.
    with uproot.open(file) as root_file:
        a = root_file["EventTree"].arrays(variables, library = "np")
    df = apply_cuts(a)
    a_cuts = df.to_dict('list')
    # np.save pickles the dict into a 0-d object array; read it back with
    # np.load(path, allow_pickle=True).item().
    np.save(npy_filename, a_cuts)

  2%|▏         | 11/500 [00:46<34:02,  4.18s/it]

Error, track length is actually 2


 68%|██████▊   | 340/500 [23:53<11:16,  4.23s/it]

Error, track length is actually 2


 71%|███████   | 353/500 [24:48<10:13,  4.17s/it]

Error, track length is actually 2


 80%|████████  | 402/500 [28:15<06:56,  4.25s/it]

Error, track length is actually 2


100%|██████████| 500/500 [35:07<00:00,  4.21s/it]
