In [111]:
# import uproot
import uproot
from glob import glob
import json 
import os
from tqdm import tqdm
import numpy as np
import pandas as pd

In [112]:
# Dataset directories (trailing slash kept: later cells build file paths by plain concatenation).
pi0_dir = '/clusterfs/ml4hep/mpettee/ml4pions/data/user.angerami.mc16_13TeV.900246.PG_singlepi0_logE0p2to2000.e8312_e7400_s3170_r12383.v01-45-gaa27bcb_OutputStream/'
pion_dir = '/clusterfs/ml4hep/mpettee/ml4pions/data/user.angerami.mc16_13TeV.900247.PG_singlepion_logE0p2to2000.e8312_e7400_s3170_r12383.v01-45-gaa27bcb_OutputStream/'

# Every *.root file directly inside each directory, in sorted order.
pi0_files, pion_files = (
    sorted(glob(os.path.join(directory, "*.root")))
    for directory in (pi0_dir, pion_dir)
)

In [113]:
# Branches to read from each tree: a reduced subset rather than every column.
variables = [
    # cluster_cell_* branches
    'cluster_cell_E',
    'cluster_cell_ID',
    # track* branches
    'trackPt',
    'trackD0',
    'trackZ0',
    'trackEta_EMB2',
    'trackPhi_EMB2',
    'trackEta',
    'trackPhi',
    # counters and truth energy
    'nCluster',
    'nTrack',
    'truthPartE',
    # cluster_* branches
    'cluster_ENG_CALIB_TOT',
    'cluster_E',
    'cluster_Eta',
    'cluster_Phi',
]
# Alternative: take every available column of an already-loaded array dict `a`.
# variables = [var for var in a.keys()]

In [114]:
def apply_cuts(arrays, min_cluster_E=0.5, max_dR=0.2):
    """Select single-cluster, single-track entries whose cluster matches the track.

    The cuts are applied in this order:

    1. ``nCluster == 1`` and ``nTrack == 1``;
    2. ``cluster_E > min_cluster_E``;
    3. ``dR < max_dR``, where ``dR`` is the eta-phi distance between
       (``cluster_Eta``, ``cluster_Phi``) and (``trackEta_EMB2``, ``trackPhi_EMB2``).

    Parameters
    ----------
    arrays : mapping of str -> array-like
        One array per branch, all of equal length. Must contain ``nCluster``,
        ``nTrack``, ``cluster_E``, ``cluster_Eta``, ``cluster_Phi``,
        ``trackEta_EMB2`` and ``trackPhi_EMB2``; other columns are carried through.
    min_cluster_E : float, optional
        Exclusive lower bound on ``cluster_E`` (default 0.5).
    max_dR : float, optional
        Exclusive upper bound on ``dR`` (default 0.2).

    Returns
    -------
    pandas.DataFrame
        The surviving rows with their original index and an added ``dR`` column.
    """
    df = pd.DataFrame(arrays)

    df = df[(df.nCluster == 1) & (df.nTrack == 1)]
    # The energy cut runs only after the single-cluster selection, as before
    # ("if you only have one cluster"). The explicit copy lets the dR column be
    # added without writing into a view of the unfiltered frame.
    df = df[df["cluster_E"] > min_cluster_E].copy()

    d_eta = df['cluster_Eta'].astype('float') - df['trackEta_EMB2'].astype('float')
    d_phi = (df['cluster_Phi'].astype('float') - df['trackPhi_EMB2'].astype('float')).abs()
    # phi is periodic: a separation larger than pi is really 2*pi minus that
    # separation. Differences far beyond 2*pi stay huge after this fold, so they
    # are still rejected by the dR cut.
    # NOTE(review): this assumes out-of-range phi values are placeholders that
    # should fail the cut -- confirm.
    d_phi = d_phi.where(d_phi <= np.pi, 2 * np.pi - d_phi)

    df['dR'] = np.sqrt(d_eta**2 + d_phi**2)
    return df[df.dR < max_dR]

In [118]:
# Apply the cuts to every pi0 file and store the result as one .npy per input
# file in a "pi0_files_cuts" folder placed next to the dataset directory.
for file in tqdm(pi0_files):
    folder = os.path.join(os.path.dirname(os.path.dirname(file)), "pi0_files_cuts")
    os.makedirs(folder, exist_ok=True)
    # Input names end in "_<digits>.root" (e.g. "..._000011.root"). Take the
    # digits after the last underscore and drop a leading "000", so "000011"
    # becomes "011". Unlike splitting the whole path on "000", this cannot yield
    # an empty name for numbers such as "001000".
    digits = os.path.splitext(os.path.basename(file))[0].rsplit("_", 1)[-1]
    number = digits[3:] if digits.startswith("000") else digits
    npy_filename = os.path.join(folder, number + ".npy")
    # The context manager closes the ROOT file as soon as the arrays are read.
    with uproot.open(file) as root_file:
        a = root_file["EventTree"].arrays(variables, library="np")
    df = apply_cuts(a)
    # Saved as a dict of column -> list; NumPy pickles it, so loading needs
    # allow_pickle=True followed by .item().
    a_cuts = df.to_dict('list')
    np.save(npy_filename, a_cuts)

100%|██████████| 500/500 [17:18<00:00,  2.08s/it]


In [98]:
# rows = []
# for row in df_dict:
#     delete_row = True
#     for var in df_cluster_E.keys(): 
#         if row[var] > 0.5: 
#             delete_row = False 
#         else: 
#             continue
#     rows.append(delete_row)

In [119]:
# Apply the cuts to every pion file and store the result as one .npy per input
# file in a "pion_files_cuts" folder placed next to the dataset directory.
for file in tqdm(pion_files):
    folder = os.path.join(os.path.dirname(os.path.dirname(file)), "pion_files_cuts")
    os.makedirs(folder, exist_ok=True)
    # Input names end in "_<digits>.root" (e.g. "..._000001.root"). Take the
    # digits after the last underscore and drop a leading "000", so "000001"
    # becomes "001". Unlike splitting the whole path on "000", this cannot yield
    # an empty name for numbers such as "001000".
    digits = os.path.splitext(os.path.basename(file))[0].rsplit("_", 1)[-1]
    number = digits[3:] if digits.startswith("000") else digits
    npy_filename = os.path.join(folder, number + ".npy")
    # The context manager closes the ROOT file as soon as the arrays are read.
    with uproot.open(file) as root_file:
        a = root_file["EventTree"].arrays(variables, library="np")
    df = apply_cuts(a)
    # Saved as a dict of column -> list; NumPy pickles it, so loading needs
    # allow_pickle=True followed by .item().
    a_cuts = df.to_dict('list')
    np.save(npy_filename, a_cuts)

100%|██████████| 500/500 [20:52<00:00,  2.51s/it]


# Scratch work

In [None]:
# uproot.open(pi0_dir+"user.angerami.24559740.OutputStream._000011.root")["CellGeo"].keys()

In [71]:
# Read every EventTree branch of one pi0 file into NumPy arrays; the ROOT file
# is closed as soon as the read completes.
with uproot.open(pi0_dir+"user.angerami.24559740.OutputStream._000011.root") as root_file:
    a = root_file["EventTree"].arrays(library = "np")

In [72]:
# Wrap the arrays held in `a` in a DataFrame for interactive inspection.
df = pd.DataFrame(a)

In [74]:
# Count how many rows carry each distinct nTrack value.
df.nTrack.value_counts()

0    18400
1      733
2      693
3      102
4       63
5        6
6        2
8        1
Name: nTrack, dtype: int64

In [None]:
# Show only the columns whose name contains "Eta".
df.filter(like="Eta")

In [None]:
# List every key available in `a`.
list(a.keys())

In [None]:
# a

In [None]:
# Dump `a` to a single .npy file.
# NOTE(review): `a` is presumably the dict returned by .arrays(library="np");
# np.save pickles non-array objects, so reading it back needs allow_pickle=True.
np.save("/clusterfs/ml4hep/mpettee/ml4pions/data/pi0_files/011.npy", a) 

In [None]:
# Read the saved object back: .item() unwraps the stored object, which is then indexed by key.
# allow_pickle=True unpickles arbitrary objects -- only use it on files you trust.
np.load("/clusterfs/ml4hep/mpettee/ml4pions/data/pi0_files/011.npy", allow_pickle=True).item()["cluster_E"]

In [None]:
# Write a small two-branch tree to test.root as a round-trip test.
# NOTE(review): `uproot3` is not imported in any cell visible here; it must be
# imported before this cell can run.
file = uproot3.recreate("test.root")
try:
    # np.float was removed in NumPy 1.24. It was an alias of the builtin float,
    # i.e. a 64-bit float dtype, so np.float64 is the equivalent spelling.
    file["tree"] = uproot3.newtree({"branch1": np.int32,
                                    "branch2": np.float64})
    file["tree"].extend({"branch1": [1,3,5], "branch2": [2,4,6],})
finally:
    # Close the file even if creating or filling the tree fails.
    file.close()

In [None]:
# Print a summary of the "tree" object stored in test.root.
uproot.open("test.root")["tree"].show()

In [None]:
# Read "branch1" of the tree in test.root back as an array.
uproot.open("test.root")["tree"]["branch1"].array()

# Pi0 files

In [None]:
# Record the pi0 train/validation split as JSON lists of file paths: the first
# 400 entries of the sorted file list go to training, the remainder to validation.
# `with` closes each file even if serialisation fails.
with open("pi0_training_data.json", "w") as json_file:
    json.dump(pi0_files[:400], json_file)

with open("pi0_validation_data.json", "w") as json_file:
    json.dump(pi0_files[400:], json_file)

In [None]:
# Sum nCluster over all entries of every file, separately for the training
# (first 400 files) and validation (remaining files) parts of the pi0 list.
for label, files in (("Pi0 training", pi0_files[:400]),
                     ("Pi0 validation", pi0_files[400:])):
    total_clusters = 0
    for file in files:
        # Close each ROOT file right after reading instead of leaving hundreds open.
        with uproot.open(file) as root_file:
            n_cluster = root_file["EventTree"].arrays(["nCluster"], library="np")["nCluster"]
        # Accumulate a plain int so the printed total is a number, not a pandas Series.
        total_clusters += int(n_cluster.sum())
    print(label, total_clusters)

# Pion files

In [None]:
# Record the pion train/validation split as JSON lists of file paths: the first
# 400 entries of the sorted file list go to training, the remainder to validation.
# `with` closes each file even if serialisation fails.
with open("pion_training_data.json", "w") as json_file:
    json.dump(pion_files[:400], json_file)

with open("pion_validation_data.json", "w") as json_file:
    json.dump(pion_files[400:], json_file)

In [None]:
# Sum nCluster over all entries of every file, separately for the training
# (first 400 files) and validation (remaining files) parts of the pion list.
for label, files in (("Pion training", pion_files[:400]),
                     ("Pion validation", pion_files[400:])):
    total_clusters = 0
    for file in files:
        # Close each ROOT file right after reading instead of leaving hundreds open.
        with uproot.open(file) as root_file:
            n_cluster = root_file["EventTree"].arrays(["nCluster"], library="np")["nCluster"]
        # Accumulate a plain int so the printed total is a number, not a pandas Series.
        total_clusters += int(n_cluster.sum())
    print(label, total_clusters)

# Experiment with adding cuts

In [None]:
# Read the first 10 entries of one pion file through the pandas backend; the
# ROOT file is closed as soon as the read completes.
with uproot.open(pion_dir+"user.angerami.24559744.OutputStream._000001.root") as root_file:
    df = root_file["EventTree"].arrays(
        library= "pd", entry_stop=10)

In [None]:
# `df` is indexed by position here, so the pandas read presumably returned a
# sequence of DataFrames rather than a single one -- TODO confirm.
df[0].shape

In [None]:
# Shape of the second element of `df`, for comparison with the first.
df[1].shape

In [None]:
# Column labels of the first element of `df`.
df[0].keys()

In [None]:
# Show the nCluster and nTrack columns of the first element side by side.
df[0][["nCluster","nTrack"]]

In [None]:
# Keep only the rows of the first element that have nCluster > 0.
df2 = df[0].loc[df[0].nCluster > 0]

In [None]:
# Display the nCluster > 0 selection.
df2

In [None]:
# Display the second element of `df`.
df[1]