In [1]:
# import uproot
import uproot
from glob import glob
import json 
import os
from tqdm import tqdm
import numpy as np
import pandas as pd

In [2]:
# Dataset directories (trailing slash included) for the single-pi0 and
# single-pion samples.
pi0_dir = '/clusterfs/ml4hep/mpettee/ml4pions/data/user.angerami.mc16_13TeV.900246.PG_singlepi0_logE0p2to2000.e8312_e7400_s3170_r12383.v01-45-gaa27bcb_OutputStream/'
pion_dir = '/clusterfs/ml4hep/mpettee/ml4pions/data/user.angerami.mc16_13TeV.900247.PG_singlepion_logE0p2to2000.e8312_e7400_s3170_r12383.v01-45-gaa27bcb_OutputStream/'

# Every ROOT file found directly inside each directory, in sorted order so that
# positional slices of these lists are reproducible between runs.
pi0_files, pion_files = (
    sorted(glob(sample_dir + "*.root")) for sample_dir in (pi0_dir, pion_dir)
)

In [3]:
# Use a pared-down list of EventTree branches: only these are read from each
# ROOT file by the conversion loops below. apply_cuts() needs at least
# 'nTrack', 'cluster_Eta', 'cluster_Phi', 'trackEta_EMB2' and 'trackPhi_EMB2'.
variables = ['cluster_cell_E', 'cluster_cell_ID',
             'trackPt','trackD0','trackZ0',
             'trackEta_EMB2','trackPhi_EMB2',
             'trackEta','trackPhi',
             'nCluster','nTrack','truthPartE',
             'cluster_ENG_CALIB_TOT','cluster_E','cluster_Eta','cluster_Phi',
             'cluster_EM_PROBABILITY','cluster_E_LCCalib']
# Alternative: use all available columns. NOTE: this needs `a` (a dict of
# arrays read from a file) to already exist in the kernel, which is not the
# case at this point on a fresh run.
# variables = [var for var in a.keys()]

In [4]:
def apply_cuts(arrays, dr_max=0.2):
    """Keep single-track events that have at least one cluster near the track.

    Parameters
    ----------
    arrays : mapping of column name -> per-event values
        Anything ``pd.DataFrame`` accepts, e.g. the dict returned by
        ``tree.arrays(variables, library="np")``. Must contain ``nTrack`` and
        the per-event array columns ``cluster_Eta``, ``cluster_Phi``,
        ``trackEta_EMB2`` and ``trackPhi_EMB2``.
    dr_max : float, optional
        Upper bound (exclusive) on the track-cluster distance
        ``sqrt(d_eta**2 + d_phi**2)``. Defaults to 0.2, the value that was
        previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The events with ``nTrack == 1`` for which at least one cluster passes
        the distance cut. Three columns are added to the input ones:
        ``dR_pass`` (boolean array, one flag per cluster), ``event_number``
        (row position of the event in ``arrays``) and ``index`` (the same
        value, left behind by ``reset_index``). Clusters are NOT removed
        individually; use ``dR_pass`` for that at the training stage.

    Notes
    -----
    An earlier, stricter single-track/single-cluster selection
    (``nCluster == 1``, ``cluster_E > 0.5``, ``dR < 0.2``) is superseded by
    this multi-cluster version.
    """
    df = pd.DataFrame(arrays)

    ### Single-track, multi-cluster
    # .copy() makes the filtered frame independent of the unfiltered one, so
    # the column assignments below do not trigger SettingWithCopyWarning.
    df = df[df.nTrack == 1].copy()

    # The per-event cluster arrays have different lengths, so the mask column
    # is filled element by element into an object array. This also keeps
    # pandas from trying to interpret equal-length masks as a 2-D block.
    dR_pass = np.empty(len(df), dtype=object)
    per_event = zip(df['cluster_Eta'], df['cluster_Phi'],
                    df['trackEta_EMB2'], df['trackPhi_EMB2'])
    for i, (cl_eta, cl_phi, trk_eta, trk_phi) in enumerate(per_event):
        d_eta = np.asarray(cl_eta, dtype=float) - np.asarray(trk_eta, dtype=float)
        d_phi = np.asarray(cl_phi, dtype=float) - np.asarray(trk_phi, dtype=float)
        # phi is periodic: map the difference into [-pi, pi) so that a cluster
        # at phi ~ -pi and a track at phi ~ +pi are treated as neighbours.
        d_phi = (d_phi + np.pi) % (2 * np.pi) - np.pi
        dR_pass[i] = np.hypot(d_eta, d_phi) < dr_max
    df["dR_pass"] = dR_pass
    df["event_number"] = df.index
    # Kept as-is for backward compatibility: besides renumbering the rows this
    # stores the old index in an 'index' column that saved files already have.
    df.reset_index(inplace=True)

    # Kill all events with no clusters passing the delta R cut; individual
    # clusters are dealt with at the training stage.
    has_match = np.array([np.any(mask) for mask in df["dR_pass"]], dtype=bool)
    return df[has_match]

In [5]:
# Convert the first 10 pi0 ROOT files: read the selected branches, apply the
# event selection and save the surviving events as a dict in a .npy file.
for file in tqdm(pi0_files[:10]):
    # Outputs go two levels up from the file, i.e. next to the dataset directory.
    prefix = os.path.dirname(os.path.dirname(file))
    # File names look like "..._000011.root". Parse the trailing number
    # explicitly: splitting on "000" yields an empty name for numbers that
    # themselves contain "000" (e.g. "_001000.root"). Numbers below 1000 keep
    # the same three-digit output name as before ("011").
    stem = os.path.splitext(os.path.basename(file))[0]
    number = "{:03d}".format(int(stem.rsplit("_", 1)[-1]))
    folder = os.path.join(prefix, "onetrack_multicluster", "pi0_files")
    os.makedirs(folder, exist_ok=True)
    npy_filename = os.path.join(folder, number + ".npy")
    # The context manager closes the ROOT file once its arrays are in memory.
    with uproot.open(file) as root_file:
        a = root_file["EventTree"].arrays(variables, library="np")
    df = apply_cuts(a)
    a_cuts = df.to_dict('list')
    # np.save pickles the dict; reload it with
    # np.load(path, allow_pickle=True).item()
    np.save(npy_filename, a_cuts)

100%|██████████| 10/10 [00:21<00:00,  2.11s/it]


In [6]:
# Convert the first 10 pion ROOT files: read the selected branches, apply the
# event selection and save the surviving events as a dict in a .npy file.
for file in tqdm(pion_files[:10]):
    # Outputs go two levels up from the file, i.e. next to the dataset directory.
    prefix = os.path.dirname(os.path.dirname(file))
    # File names look like "..._000001.root". Parse the trailing number
    # explicitly: splitting on "000" yields an empty name for numbers that
    # themselves contain "000" (e.g. "_001000.root"). Numbers below 1000 keep
    # the same three-digit output name as before ("001").
    stem = os.path.splitext(os.path.basename(file))[0]
    number = "{:03d}".format(int(stem.rsplit("_", 1)[-1]))
    folder = os.path.join(prefix, "onetrack_multicluster", "pion_files")
    os.makedirs(folder, exist_ok=True)
    npy_filename = os.path.join(folder, number + ".npy")
    # The context manager closes the ROOT file once its arrays are in memory.
    with uproot.open(file) as root_file:
        a = root_file["EventTree"].arrays(variables, library="np")
    df = apply_cuts(a)
    a_cuts = df.to_dict('list')
    # np.save pickles the dict; reload it with
    # np.load(path, allow_pickle=True).item()
    np.save(npy_filename, a_cuts)

100%|██████████| 10/10 [00:34<00:00,  3.48s/it]


# Scratch work

In [None]:
import compress_pickle as pickle

# Peek at element [3] of the first ten entries of one gzip-compressed pickle.
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# come from a trusted source.
with open("../data/no_cuts/preprocessed/val/data_000.p", 'rb') as fh:
    # `with` closes the handle; the bare open(...) used before never did.
    f = pickle.load(fh, compression='gzip')
for i in range(10):
    print(f[i][3])

In [8]:
# Reload one converted pion file. The dict was written with np.save, which
# stores it as a pickled object array, hence allow_pickle=True and .item()
# to get the dict back.
a = np.load('/clusterfs/ml4hep/mpettee/ml4pions/data/onetrack_multicluster/pion_files/001.npy', allow_pickle=True).item()

In [9]:
a.keys()

dict_keys(['index', 'cluster_cell_E', 'cluster_cell_ID', 'trackPt', 'trackD0', 'trackZ0', 'trackEta_EMB2', 'trackPhi_EMB2', 'trackEta', 'trackPhi', 'nCluster', 'nTrack', 'truthPartE', 'cluster_ENG_CALIB_TOT', 'cluster_E', 'cluster_Eta', 'cluster_Phi', 'cluster_EM_PROBABILITY', 'cluster_E_LCCalib', 'dR_pass', 'event_number'])

In [11]:
# Display the 'cluster_E_LCCalib' column: a list holding one variable-length
# float32 array per selected event. This prints the whole list; slice it
# (e.g. [:5]) to keep the cell output short.
a['cluster_E_LCCalib']

[array([655.79767], dtype=float32),
 array([5.422934 , 2.4060838], dtype=float32),
 array([38.971966, 10.388762], dtype=float32),
 array([ 4.6059566, 11.128962 ], dtype=float32),
 array([1.3670410e+03, 2.4247281e-01], dtype=float32),
 array([5.8442082, 3.0211298], dtype=float32),
 array([12.471335], dtype=float32),
 array([300.0803    ,   0.86859894], dtype=float32),
 array([221.47694  ,  60.7862   ,  21.552328 ,  18.437702 ,  18.27369  ,
          9.921601 ,   6.150375 ,   0.7286528], dtype=float32),
 array([41.769474, 17.543583], dtype=float32),
 array([33.499043], dtype=float32),
 array([13.332436 ,  3.5814538], dtype=float32),
 array([4.544989], dtype=float32),
 array([121.61934,  79.61879], dtype=float32),
 array([58.541805], dtype=float32),
 array([34.76368  , 16.434402 ,  1.4102234], dtype=float32),
 array([21.5263  , 14.877807], dtype=float32),
 array([20.1927], dtype=float32),
 array([0.8889026], dtype=float32),
 array([4.935636 , 1.2124783], dtype=float32),
 array([30.310356 

In [None]:
# uproot.open(pi0_dir+"user.angerami.24559740.OutputStream._000011.root")["CellGeo"].keys()

In [None]:
# Read EventTree from a single pi0 file into a dict of NumPy arrays. No branch
# list is passed, so every branch is loaded (unlike the `variables` subset).
a = uproot.open(pi0_dir+"user.angerami.24559740.OutputStream._000011.root")["EventTree"].arrays(library = "np")

In [None]:
a.keys()

In [None]:
df = pd.DataFrame(a)

In [None]:
# Names of all columns whose name starts with "truth".
[var for var in df.keys() if var.startswith("truth")]

In [None]:
df[['truthPartPdgId']]

In [None]:
# How many events have each value of nTrack.
df.nTrack.value_counts()

In [None]:
df[[var for var in df.keys() if "Eta" in var]]

In [None]:
# All column names of `a` as a plain list.
list(a.keys())

In [None]:
# a

In [None]:
# Save the uncut dict `a` as-is. np.save pickles non-array objects, so the
# file has to be read back with allow_pickle=True.
np.save("/clusterfs/ml4hep/mpettee/ml4pions/data/pi0_files/011.npy", a) 

In [None]:
# Round-trip check: reload the saved dict (.item() unwraps it from the object
# array) and show its "cluster_E" entry.
np.load("/clusterfs/ml4hep/mpettee/ml4pions/data/pi0_files/011.npy", allow_pickle=True).item()["cluster_E"]

In [None]:
# Write a tiny two-branch ROOT file ("test.root") with uproot3 so that the
# following cells can try reading it back with uproot.
import uproot3  # not imported at the top of the notebook; only this cell uses it

file = uproot3.recreate("test.root")
try:
    # np.float was removed in NumPy 1.24. It was an alias of the builtin
    # float, i.e. a 64-bit float, so np.float64 keeps the same branch type.
    file["tree"] = uproot3.newtree({"branch1": np.int32,
                                   "branch2": np.float64})
    file["tree"].extend({"branch1": [1,3,5], "branch2": [2,4,6],})
finally:
    # Close the file even if creating or filling the tree fails.
    file.close()

In [None]:
uproot.open("test.root")["tree"].show()

In [None]:
uproot.open("test.root")["tree"]["branch1"].array()

# Pi0 files

In [None]:
# Split the pi0 file list into the first 400 files (training) and the rest
# (validation), and write each list of paths as a JSON array.
# `with` guarantees the files are flushed and closed even if the write fails.
with open("pi0_training_data.json", "w") as json_file:
    json.dump(pi0_files[:400], json_file)

with open("pi0_validation_data.json", "w") as json_file:
    json.dump(pi0_files[400:], json_file)

In [None]:
# Total number of clusters (sum of the nCluster branch) in the training files
# (first 400) and in the validation files (the rest).
for label, files in (("Pi0 training", pi0_files[:400]),
                     ("Pi0 validation", pi0_files[400:])):
    total_clusters = 0
    for file in files:
        # Close each ROOT file after reading instead of leaving it open.
        with uproot.open(file) as root_file:
            n_cluster = root_file["EventTree"]["nCluster"].array(library="np")
        # Accumulate a plain int so the printed total is a number, not a
        # one-element pandas Series.
        total_clusters += int(n_cluster.sum())
    print(label, total_clusters)

# Pion files

In [None]:
# Split the pion file list into the first 400 files (training) and the rest
# (validation), and write each list of paths as a JSON array.
# `with` guarantees the files are flushed and closed even if the write fails.
with open("pion_training_data.json", "w") as json_file:
    json.dump(pion_files[:400], json_file)

with open("pion_validation_data.json", "w") as json_file:
    json.dump(pion_files[400:], json_file)

In [None]:
# Total number of clusters (sum of the nCluster branch) in the training files
# (first 400) and in the validation files (the rest).
for label, files in (("Pion training", pion_files[:400]),
                     ("Pion validation", pion_files[400:])):
    total_clusters = 0
    for file in files:
        # Close each ROOT file after reading instead of leaving it open.
        with uproot.open(file) as root_file:
            n_cluster = root_file["EventTree"]["nCluster"].array(library="np")
        # Accumulate a plain int so the printed total is a number, not a
        # one-element pandas Series.
        total_clusters += int(n_cluster.sum())
    print(label, total_clusters)

# Experiment with adding cuts

In [None]:
# Read only the first 10 entries of EventTree (all branches) via the pandas
# backend.
# NOTE(review): later cells index the result as df[0] / df[1], so this
# presumably returns a tuple of DataFrames rather than a single frame --
# confirm with the installed uproot version.
df = uproot.open(pion_dir+"user.angerami.24559744.OutputStream._000001.root")["EventTree"].arrays(
    library= "pd", entry_stop=10)

In [None]:
# Read EventTree from a single pion file into a dict of NumPy arrays; no
# branch list is passed, so every branch is loaded. This rebinds `a`.
a = uproot.open(pion_dir+"user.angerami.24559744.OutputStream._000001.root")["EventTree"].arrays(library = "np")

In [None]:
# All column names of `a` as a plain list.
list(a.keys())

In [None]:
df[0].keys()

In [None]:
df[1].keys()

In [None]:
df[0][["nCluster","nTrack"]]

In [None]:
# From the first element of `df`, keep only the rows with nCluster > 0.
df2 = df[0].loc[df[0].nCluster > 0]

In [None]:
df2

In [None]:
df[1]