In [61]:
# import uproot
import uproot
from glob import glob
import json 
import os
from tqdm import tqdm
import numpy as np
import pandas as pd

In [62]:
# Directories that hold the pi0 and pion ROOT files (trailing "/" is required,
# because the glob pattern below is built by plain string concatenation).
pi0_dir = '/clusterfs/ml4hep/mpettee/ml4pions/data/user.angerami.mc16_13TeV.900246.PG_singlepi0_logE0p2to2000.e8312_e7400_s3170_r12383.v01-45-gaa27bcb_OutputStream/'
pion_dir = '/clusterfs/ml4hep/mpettee/ml4pions/data/user.angerami.mc16_13TeV.900247.PG_singlepion_logE0p2to2000.e8312_e7400_s3170_r12383.v01-45-gaa27bcb_OutputStream/'

# Every *.root file in each directory, in lexicographic order so that the
# later [:400] / [400:] splits are stable from run to run.
pi0_files = glob(pi0_dir + "*.root")
pi0_files.sort()
pion_files = glob(pion_dir + "*.root")
pion_files.sort()

In [63]:
# Pared-down list of EventTree branches to read (order is preserved as-is).
variables = [
    # per-cell cluster content
    'cluster_cell_E',
    'cluster_cell_ID',
    # track quantities
    'trackPt',
    'trackD0',
    'trackZ0',
    'trackEta_EMB2',
    'trackPhi_EMB2',
    'trackEta',
    'trackPhi',
    # multiplicities and truth energy
    'nCluster',
    'nTrack',
    'truthPartE',
    # cluster-level quantities
    'cluster_ENG_CALIB_TOT',
    'cluster_E',
    'cluster_Eta',
    'cluster_Phi',
    'cluster_EM_PROBABILITY',
]
# Alternative: take every available column instead of the subset above.
# variables = [var for var in a.keys()]

In [59]:
def apply_cuts(arrays):
    """Turn a mapping of branch arrays into a DataFrame without filtering it.

    Parameters
    ----------
    arrays : any input accepted by ``pd.DataFrame``
        In this notebook it is the dict produced by
        ``arrays(variables, library="np")``.

    Returns
    -------
    pandas.DataFrame
        ``pd.DataFrame(arrays)`` with every row kept.
    """
    # All selection cuts are currently disabled. The previously drafted cuts
    # are kept here for reference only (none of this runs):
    #   l0 = len(df)
    #   df = df[(df.nCluster == 1) & (df.nTrack == 1)]
    #   df = df[df["cluster_E"] > 0.5]   # if you only have one cluster
    #   df['dR'] = np.sqrt((df['cluster_Eta'].astype('float') - df['trackEta_EMB2'].astype('float'))**2 +
    #                      (df['cluster_Phi'].astype('float') - df['trackPhi_EMB2'].astype('float'))**2)
    #   df = df[df.dR < 0.2]
    #   print("These cuts reduce the dataframe by {:.1f}%.".format(100*(l0-len(df))/l0))
    return pd.DataFrame(arrays)

In [60]:
# Convert every pi0 ROOT file into a .npy file under <parent>/no_cuts/pi0_files/.
for file in tqdm(pi0_files):
    # Drop the file name and its dataset directory: output goes next to the dataset directory.
    prefix = file.split("/")[:-2]
    # Text after the last "000", minus the 5-character ".root" suffix
    # (e.g. "..._000011.root" -> "011"). NOTE: relies on that naming pattern.
    number = file.split("000")[-1][:-5]
    folder = os.path.join("/".join(prefix), "no_cuts", "pi0_files")
    os.makedirs(folder, exist_ok=True)
    npy_filename = os.path.join(folder, str(number)+".npy")
    # Open inside a context manager so each ROOT file handle is released
    # before the next iteration instead of accumulating over the whole loop.
    with uproot.open(file) as root_file:
        a = root_file["EventTree"].arrays(variables, library = "np")
    df = apply_cuts(a)
    a_cuts = df.to_dict('list')
    # np.save stores the dict as a pickled object; it must be read back with
    # np.load(..., allow_pickle=True).item().
    np.save(npy_filename, a_cuts)

100%|██████████| 500/500 [30:24<00:00,  3.65s/it]


In [55]:
# Convert every pion ROOT file into a .npy file under <parent>/no_cuts/pion_files/.
for file in tqdm(pion_files):
    # Drop the file name and its dataset directory: output goes next to the dataset directory.
    prefix = file.split("/")[:-2]
    # Text after the last "000", minus the 5-character ".root" suffix
    # (e.g. "..._000001.root" -> "001"). NOTE: relies on that naming pattern.
    number = file.split("000")[-1][:-5]
    folder = os.path.join("/".join(prefix), "no_cuts", "pion_files")
    os.makedirs(folder, exist_ok=True)
    npy_filename = os.path.join(folder, str(number)+".npy")
    # Open inside a context manager so each ROOT file handle is released
    # before the next iteration instead of accumulating over the whole loop.
    with uproot.open(file) as root_file:
        a = root_file["EventTree"].arrays(variables, library = "np")
    df = apply_cuts(a)
    a_cuts = df.to_dict('list')
    # np.save stores the dict as a pickled object; it must be read back with
    # np.load(..., allow_pickle=True).item().
    np.save(npy_filename, a_cuts)

100%|██████████| 500/500 [34:05<00:00,  4.09s/it]


# Scratch work

In [None]:
# NOTE: despite the alias, this is compress_pickle, not the standard-library pickle.
import compress_pickle as pickle

# Load inside a context manager so the file handle is closed once the data is read.
# Unpickling can execute arbitrary code: only load files from a trusted source.
with open("../data/no_cuts/preprocessed/val/data_000.p", 'rb') as fh:
    f = pickle.load(fh, compression='gzip')

# Print element [3] of each of the first ten entries.
for i in range(10):
    print(f[i][3])

In [10]:
# Load one converted pion file. The conversion loop saved a dict with np.save, so
# allow_pickle=True is needed and .item() unwraps the stored object.
a = np.load('/clusterfs/ml4hep/mpettee/ml4pions/data/no_cuts/pion_files/001.npy', allow_pickle=True).item()

In [11]:
# Display the keys of the object currently bound to `a`.
a.keys()

dict_keys(['cluster_cell_E', 'cluster_cell_ID', 'trackPt', 'trackD0', 'trackZ0', 'trackEta_EMB2', 'trackPhi_EMB2', 'trackEta', 'trackPhi', 'nCluster', 'nTrack', 'truthPartE', 'cluster_ENG_CALIB_TOT', 'cluster_E', 'cluster_Eta', 'cluster_Phi'])

In [None]:
# Display the 'truthPartE' entry of `a`.
a['truthPartE']

In [None]:
# uproot.open(pi0_dir+"user.angerami.24559740.OutputStream._000011.root")["CellGeo"].keys()

In [13]:
# Read every EventTree branch of one pi0 file as NumPy arrays.
# The context manager closes the ROOT file once the arrays are in memory.
with uproot.open(pi0_dir+"user.angerami.24559740.OutputStream._000011.root") as root_file:
    a = root_file["EventTree"].arrays(library = "np")

In [14]:
# Display the keys of `a` (the branch names when `a` comes from the full-tree read).
a.keys()

dict_keys(['runNumber', 'eventNumber', 'lumiBlock', 'coreFlags', 'mcEventNumber', 'mcChannelNumber', 'mcEventWeight', 'nTruthPart', 'G4PreCalo_n_EM', 'G4PreCalo_E_EM', 'G4PreCalo_n_Had', 'G4PreCalo_E_Had', 'truthVertexX', 'truthVertexY', 'truthVertexZ', 'truthPartPdgId', 'truthPartStatus', 'truthPartBarcode', 'truthPartPt', 'truthPartE', 'truthPartMass', 'truthPartEta', 'truthPartPhi', 'nTrack', 'trackPt', 'trackP', 'trackMass', 'trackEta', 'trackPhi', 'trackNumberOfPixelHits', 'trackNumberOfSCTHits', 'trackNumberOfPixelDeadSensors', 'trackNumberOfSCTDeadSensors', 'trackNumberOfPixelSharedHits', 'trackNumberOfSCTSharedHits', 'trackNumberOfPixelHoles', 'trackNumberOfSCTHoles', 'trackNumberOfInnermostPixelLayerHits', 'trackNumberOfNextToInnermostPixelLayerHits', 'trackExpectInnermostPixelLayerHit', 'trackExpectNextToInnermostPixelLayerHit', 'trackNumberOfTRTHits', 'trackNumberOfTRTOutliers', 'trackChiSquared', 'trackNumberDOF', 'trackD0', 'trackZ0', 'trackEta_PreSamplerB', 'trackPhi_PreS

In [None]:
# Build a DataFrame from `a`; NOTE(review): which `a` this is depends on cell execution order.
df = pd.DataFrame(a)

In [None]:
# Column names that start with "truth".
[var for var in df.keys() if var.startswith("truth")]

In [None]:
# Display the 'truthPartPdgId' column as a one-column DataFrame.
df[['truthPartPdgId']]

In [None]:
# Number of rows for each distinct nTrack value.
df.nTrack.value_counts()

In [None]:
# Display only the columns whose name contains "Eta".
df[[var for var in df.keys() if "Eta" in var]]

In [None]:
# All keys of `a` as a plain list.
[var for var in a.keys()]

In [None]:
# a

In [None]:
# Save `a` to disk. NOTE(review): `a` is presumably the dict read from the
# matching ROOT file above; this depends on cell execution order, so confirm.
np.save("/clusterfs/ml4hep/mpettee/ml4pions/data/pi0_files/011.npy", a) 

In [None]:
# Read the saved object back (.item() unwraps it) and display its "cluster_E" entry.
np.load("/clusterfs/ml4hep/mpettee/ml4pions/data/pi0_files/011.npy", allow_pickle=True).item()["cluster_E"]

In [None]:
# Write a tiny two-branch ROOT file ("test.root") with the uproot3 writing API.
# uproot3 is not imported in any other visible cell, so import it here;
# without this the cell fails with NameError.
import uproot3

file = uproot3.recreate("test.root")
try:
    # np.float was only an alias for the builtin float and was removed in
    # NumPy 1.24; np.float64 is the same dtype spelled explicitly.
    file["tree"] = uproot3.newtree({"branch1": np.int32,
                                    "branch2": np.float64})
    file["tree"].extend({"branch1": [1,3,5], "branch2": [2,4,6],})
finally:
    # Always close the file, even if creating or filling the tree fails.
    file.close()

In [None]:
# Open the test file and call show() on its "tree" object (presumably prints a branch summary).
uproot.open("test.root")["tree"].show()

In [None]:
# Read "branch1" of the test tree back as an array.
uproot.open("test.root")["tree"]["branch1"].array()

# Pi0 files

In [None]:
# Write the pi0 file lists as JSON: the first 400 files form the training list,
# the remaining files form the validation list.
# `with` guarantees each file is flushed and closed even if the write fails.
jsonString = json.dumps(pi0_files[:400])
with open("pi0_training_data.json", "w") as jsonFile:
    jsonFile.write(jsonString)

jsonString = json.dumps(pi0_files[400:])
with open("pi0_validation_data.json", "w") as jsonFile:
    jsonFile.write(jsonString)

In [None]:
# Sum nCluster over the pi0 training files (first 400) and validation files (the rest).
for label, files in (("Pi0 training", pi0_files[:400]),
                     ("Pi0 validation", pi0_files[400:])):
    total_clusters = 0
    for file in files:
        # Close each ROOT file as soon as its nCluster branch has been read.
        with uproot.open(file) as root_file:
            # Read as NumPy and reduce to a plain int. Adding DataFrame.sum()
            # (a Series) to the counter made the print show a Series repr
            # instead of a single number.
            total_clusters += int(root_file["EventTree"]["nCluster"].array(library = "np").sum())
    print(label, total_clusters)

# Pion files

In [None]:
# Write the pion file lists as JSON: the first 400 files form the training list,
# the remaining files form the validation list.
# `with` guarantees each file is flushed and closed even if the write fails.
jsonString = json.dumps(pion_files[:400])
with open("pion_training_data.json", "w") as jsonFile:
    jsonFile.write(jsonString)

jsonString = json.dumps(pion_files[400:])
with open("pion_validation_data.json", "w") as jsonFile:
    jsonFile.write(jsonString)

In [None]:
# Sum nCluster over the pion training files (first 400) and validation files (the rest).
for label, files in (("Pion training", pion_files[:400]),
                     ("Pion validation", pion_files[400:])):
    total_clusters = 0
    for file in files:
        # Close each ROOT file as soon as its nCluster branch has been read.
        with uproot.open(file) as root_file:
            # Read as NumPy and reduce to a plain int. Adding DataFrame.sum()
            # (a Series) to the counter made the print show a Series repr
            # instead of a single number.
            total_clusters += int(root_file["EventTree"]["nCluster"].array(library = "np").sum())
    print(label, total_clusters)

# Experiment with adding cuts

In [None]:
# Read the first 10 entries of one pion file with the pandas backend.
# The context manager closes the ROOT file once the data is in memory.
# NOTE(review): the cells below index the result as df[0] / df[1], so it is
# used as a sequence of frames rather than a single DataFrame.
with uproot.open(pion_dir+"user.angerami.24559744.OutputStream._000001.root") as root_file:
    df = root_file["EventTree"].arrays(
        library= "pd", entry_stop=10)

In [None]:
# Shape of the first element of `df`.
df[0].shape

In [None]:
# Shape of the second element of `df`.
df[1].shape

In [None]:
# Keys (column labels) of the first element of `df`.
df[0].keys()

In [None]:
# Display only the nCluster and nTrack columns of the first element.
df[0][["nCluster","nTrack"]]

In [None]:
# Keep only the rows of the first element that have at least one cluster.
df2 = df[0].loc[df[0].nCluster > 0]

In [None]:
# Display the filtered rows.
df2

In [None]:
# Display the second element of `df`.
df[1]