In [16]:
import uproot
from glob import glob
import json 
import os
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [17]:
# Directories holding the single-pi0 (900246) and single-pion (900247) ROOT ntuples.
# These are absolute paths on the cluster filesystem; both end with a trailing "/",
# which the string concatenation in the glob patterns below relies on.
pi0_dir = '/clusterfs/ml4hep/mpettee/ml4pions/data/root_files/user.angerami.mc16_13TeV.900246.PG_singlepi0_logE0p2to2000.e8312_e7400_s3170_r12383.v01-45-gaa27bcb_OutputStream/'
pion_dir = '/clusterfs/ml4hep/mpettee/ml4pions/data/root_files/user.angerami.mc16_13TeV.900247.PG_singlepion_logE0p2to2000.e8312_e7400_s3170_r12383.v01-45-gaa27bcb_OutputStream/'
# sorted() gives a deterministic order; later cells slice these lists by position.
pi0_files = sorted(glob(pi0_dir+"*.root"))
pion_files = sorted(glob(pion_dir+"*.root"))

In [None]:
# Persist the positional train/val/test partition of the pion file list as JSON:
# the first 350 files -> train, the next 50 -> val, everything from index 400 on -> test.
pion_split_slices = (
    ('pion_train.json', slice(None, 350)),
    ('pion_val.json', slice(350, 400)),
    ('pion_test.json', slice(400, None)),
)
for split_filename, file_slice in pion_split_slices:
    with open(split_filename, 'w') as f:
        json.dump(pion_files[file_slice], f)

In [18]:
import math 
def delta_phi(phi1, phi2):
    """Return ``phi1 - phi2`` wrapped into the interval [-pi, pi).

    Works on plain floats and, through operator broadcasting, on numpy arrays.
    """
    full_turn = 2*math.pi
    # Shift by pi so the modulo maps onto [0, 2*pi), then shift back.
    shifted = phi1 - phi2 + math.pi
    return shifted % full_turn - math.pi

def apply_cuts(arrays, dr_max=1.2, track_pt_max=10**5):
    """Select single-track events with at least one cluster close to the track.

    Parameters
    ----------
    arrays : mapping of column name -> per-event values
        Anything ``pd.DataFrame`` accepts. The columns ``nTrack``, ``trackPt``,
        ``trackEta``, ``trackPhi``, ``cluster_Eta`` and ``cluster_Phi`` are read;
        the track/cluster columns are expected to hold one numpy array per event.
    dr_max : float, optional
        A cluster passes when its deltaR to the track is below this value.
    track_pt_max : float, optional
        Events whose track pT is not below this value are dropped.

    Returns
    -------
    pd.DataFrame
        The surviving events. Compared with the input there are four extra
        columns: ``index`` and ``event_number`` (both the row label the event had
        in ``pd.DataFrame(arrays)``), ``deltaR`` (per-cluster distance to the
        track) and ``dR_pass`` (per-cluster ``deltaR < dr_max``). The returned
        row labels are positions within the ``nTrack == 1`` subset.

    Notes
    -----
    Clusters are not removed individually: an event is kept with all of its
    clusters as soon as one of them passes the deltaR cut.
    """
    df = pd.DataFrame(arrays)

    ### Single-track, multi-cluster
    # .copy() so the column assignments below act on an independent frame
    # instead of a slice of the unfiltered one.
    df = df[df.nTrack == 1].copy()

    deltaR_list = []
    dR_pass = []
    for cl_eta, cl_phi, trk_eta, trk_phi in zip(df['cluster_Eta'], df['cluster_Phi'],
                                                 df['trackEta'], df['trackPhi']):
        try:
            d_eta = cl_eta.astype('float') - trk_eta.astype('float')
            d_phi = delta_phi(cl_phi.astype('float'), trk_phi.astype('float'))
            deltaR = np.sqrt(d_eta**2 + d_phi**2)
        except (AttributeError, TypeError, ValueError):
            # Malformed entry (not an array, non-numeric, or shapes that do not
            # broadcast): use a sentinel that can never pass the cut.
            deltaR = np.array(999)
        deltaR_list.append(deltaR)
        dR_pass.append(deltaR < dr_max)
    df["deltaR"] = deltaR_list
    df["dR_pass"] = dR_pass
    df["event_number"] = df.index
    df = df.reset_index()  # adds the old row label as an "index" column

    # Keep an event only if it has exactly one track pT value, at least one
    # cluster passing the deltaR cut, and a track pT below the limit.
    keep = np.array(
        [len(pt) == 1 and np.sum(passed) > 0 and pt[0] < track_pt_max
         for pt, passed in zip(df['trackPt'], df['dR_pass'])],
        dtype=bool,
    )
    return df[keep]

In [19]:
# Pared-down list of EventTree branches used for the default conversion.
variables = ['cluster_cell_E', 'cluster_cell_ID',
             'trackPt','trackD0','trackZ0',
             'trackEta_EMB2','trackPhi_EMB2',
             'trackEta_EME2','trackPhi_EME2',
             'trackEta','trackPhi',
             'nCluster','nTrack','truthPartE', 'truthPartPt',
             'cluster_ENG_CALIB_TOT','cluster_E','cluster_Eta','cluster_Phi',
             'cluster_EM_PROBABILITY','cluster_E_LCCalib','cluster_HAD_WEIGHT',
            ]

### SANMAY'S VARIABLES
# Extra cluster-level branches.
cluster_var = ['cluster_EM_PROBABILITY', 'cluster_HAD_WEIGHT', 'cluster_OOC_WEIGHT',
               'cluster_DM_WEIGHT', 'cluster_CENTER_MAG', 'cluster_FIRST_ENG_DENS',
               'cluster_CENTER_LAMBDA', 'cluster_ISOLATION'
              ]

# Extra track-level branches.
track_var = ['trackPt',
             'trackP',
             'trackMass',
             'trackEta',
             'trackPhi',
             'trackNumberOfPixelHits',
             'trackNumberOfSCTHits',
             'trackNumberOfPixelDeadSensors',
             'trackNumberOfSCTDeadSensors',
             'trackNumberOfInnermostPixelLayerHits',
             'trackNumberOfNextToInnermostPixelLayerHits',
             'trackExpectInnermostPixelLayerHit',
             'trackExpectNextToInnermostPixelLayerHit',
             'trackNumberOfTRTHits',
             'trackNumberOfTRTOutliers',
             'trackChiSquared',
             'trackNumberDOF',
             'trackD0',
             'trackZ0'
            ]

# The three lists overlap (e.g. trackPt, cluster_HAD_WEIGHT). dict.fromkeys drops
# the repeats while keeping first-occurrence order, so each branch is requested once.
sanmay_vars = list(dict.fromkeys(variables + cluster_var + track_var))

In [None]:
# Convert every pi0 ROOT file into one .npy file holding the events that pass apply_cuts.
for file in tqdm(pi0_files):
    # Output lives two directory levels above the ROOT file:
    #   <...>/onetrack_multicluster/pi0_files/<number>.npy
    folder = os.path.join(os.path.dirname(os.path.dirname(file)),
                          "onetrack_multicluster",
                          "pi0_files")
    os.makedirs(folder, exist_ok=True)
    # File tag = text after the last "_" of the basename ("..._000410.root" -> "000410"),
    # written as an integer zero-padded to three digits ("410", "010", ...).
    # The previous split("000") parsing produced an empty name for tags such as "001000".
    tag = os.path.splitext(os.path.basename(file))[0].rsplit("_", 1)[-1]
    number = str(int(tag)).zfill(3) if tag.isdigit() else tag
    npy_filename = os.path.join(folder, number+".npy")
    # The context manager closes the ROOT file handle before the next file is opened.
    with uproot.open(file) as root_file:
        a = root_file["EventTree"].arrays(variables, library = "np")
    df = apply_cuts(a)
    a_cuts = df.to_dict('list')
    # a_cuts is a dict, so np.save pickles it into an object array;
    # read it back with np.load(path, allow_pickle=True).item().
    np.save(npy_filename, a_cuts)

In [26]:
# Convert the pion files from index 400 on (the slice written to pion_test.json) using
# the extended branch list; output goes to a fixed directory instead of next to the input.
for file in tqdm(pion_files[400:]):
    prefix = "/clusterfs/ml4hep/mpettee/ml4pions/data/"
    folder = os.path.join(prefix,
                          "onetrack_multicluster_sanmay",
                          "pion_files")
    os.makedirs(folder, exist_ok=True)
    # File tag = text after the last "_" of the basename ("..._000410.root" -> "000410"),
    # written as an integer zero-padded to three digits ("410", "010", ...).
    # The previous split("000") parsing produced an empty name for tags such as "001000".
    tag = os.path.splitext(os.path.basename(file))[0].rsplit("_", 1)[-1]
    number = str(int(tag)).zfill(3) if tag.isdigit() else tag
    npy_filename = os.path.join(folder, number+".npy")
    # The context manager closes the ROOT file handle before the next file is opened.
    with uproot.open(file) as root_file:
        a = root_file["EventTree"].arrays(sanmay_vars, library = "np")
    df = apply_cuts(a)
    a_cuts = df.to_dict('list')
    # a_cuts is a dict, so np.save pickles it into an object array;
    # read it back with np.load(path, allow_pickle=True).item().
    np.save(npy_filename, a_cuts)

100%|██████████| 100/100 [09:35<00:00,  5.75s/it]


### Testing

In [4]:
# Load one file (index 400 of pion_files) and run the selection on it.
a = uproot.open(pion_files[400])["EventTree"].arrays(
    # variables,   <- branch filter disabled here, so no branch list is passed
    library = "np")
df = apply_cuts(a)

In [26]:
# Truth particle energy column of the selected events.
df.truthPartE

0         [5.482727]
2        [26.158798]
3        [14.467227]
4        [4.7492914]
5        [4.5480275]
            ...     
12143     [17.39683]
12145    [41.555126]
12146    [505.93658]
12148    [114.87165]
12149    [19.100674]
Name: truthPartE, Length: 10273, dtype: object

In [21]:
# Number of selected events whose truth energy lies within 0.001 of 1.638227.
len(df[np.abs(df.truthPartE.explode() - 1.638227) < 0.001].truthPartE)

0

##### Look for the indices to compare with Sanmay:

In [34]:
# Truth energies of the reference events searched for in the cells below.
# NOTE(review): units are not stated here — presumably GeV, confirm.
energies = [1.638227, 9.119415, 29.691723, 100.291687, 499.597717, 995.071655]

In [36]:
# Scan the pion files from index 400 on for selected events whose truth energy
# matches one of the reference `energies` to within 1e-5.
for file in tqdm(pion_files[400:]):
    # The context manager closes the ROOT file handle before the next file is opened.
    with uproot.open(file) as root_file:
        a = root_file["EventTree"].arrays(variables, library = "np")
    df = apply_cuts(a)
    # Unwrap the per-event arrays once per file instead of once per energy.
    truth_e = df.truthPartE.explode()
    for energy in energies:
        matches = df[np.abs(truth_e - energy) < 0.00001]
        if len(matches) != 0:
            print("Match found! File = {}".format(file))
            print(matches.truthPartE)

  7%|▋         | 7/100 [00:24<05:14,  3.38s/it]

Match found! File = /clusterfs/ml4hep/mpettee/ml4pions/data/root_files/user.angerami.mc16_13TeV.900247.PG_singlepion_logE0p2to2000.e8312_e7400_s3170_r12383.v01-45-gaa27bcb_OutputStream/user.angerami.24559744.OutputStream._000410.root
4109    [1.638227]
Name: truthPartE, dtype: object


 28%|██▊       | 28/100 [01:36<04:11,  3.50s/it]

Match found! File = /clusterfs/ml4hep/mpettee/ml4pions/data/root_files/user.angerami.mc16_13TeV.900247.PG_singlepion_logE0p2to2000.e8312_e7400_s3170_r12383.v01-45-gaa27bcb_OutputStream/user.angerami.24559744.OutputStream._000430.root
8254    [499.59772]
Name: truthPartE, dtype: object


 30%|███       | 30/100 [01:43<03:57,  3.39s/it]

Match found! File = /clusterfs/ml4hep/mpettee/ml4pions/data/root_files/user.angerami.mc16_13TeV.900247.PG_singlepion_logE0p2to2000.e8312_e7400_s3170_r12383.v01-45-gaa27bcb_OutputStream/user.angerami.24559744.OutputStream._000432.root
5861    [9.11941]
Name: truthPartE, dtype: object


 38%|███▊      | 38/100 [02:11<03:40,  3.55s/it]

Match found! File = /clusterfs/ml4hep/mpettee/ml4pions/data/root_files/user.angerami.mc16_13TeV.900247.PG_singlepion_logE0p2to2000.e8312_e7400_s3170_r12383.v01-45-gaa27bcb_OutputStream/user.angerami.24559744.OutputStream._000440.root
9082    [9.119415]
Name: truthPartE, dtype: object


 59%|█████▉    | 59/100 [03:23<02:20,  3.44s/it]

Match found! File = /clusterfs/ml4hep/mpettee/ml4pions/data/root_files/user.angerami.mc16_13TeV.900247.PG_singlepion_logE0p2to2000.e8312_e7400_s3170_r12383.v01-45-gaa27bcb_OutputStream/user.angerami.24559744.OutputStream._000461.root
3985    [29.691723]
Name: truthPartE, dtype: object


 97%|█████████▋| 97/100 [05:34<00:10,  3.41s/it]

Match found! File = /clusterfs/ml4hep/mpettee/ml4pions/data/root_files/user.angerami.mc16_13TeV.900247.PG_singlepion_logE0p2to2000.e8312_e7400_s3170_r12383.v01-45-gaa27bcb_OutputStream/user.angerami.24559744.OutputStream._000499.root
3592    [100.29169]
Name: truthPartE, dtype: object
Match found! File = /clusterfs/ml4hep/mpettee/ml4pions/data/root_files/user.angerami.mc16_13TeV.900247.PG_singlepion_logE0p2to2000.e8312_e7400_s3170_r12383.v01-45-gaa27bcb_OutputStream/user.angerami.24559744.OutputStream._000499.root
8425    [995.07166]
Name: truthPartE, dtype: object


100%|██████████| 100/100 [05:44<00:00,  3.45s/it]


In [37]:
# Files for which the search above printed a match: 410, 440, 461, 430 and 499.
# File 432 also printed a match (9.11941) but is not listed here.
selected_pion_files = ["/clusterfs/ml4hep/mpettee/ml4pions/data/root_files/user.angerami.mc16_13TeV.900247.PG_singlepion_logE0p2to2000.e8312_e7400_s3170_r12383.v01-45-gaa27bcb_OutputStream/user.angerami.24559744.OutputStream._000410.root",
"/clusterfs/ml4hep/mpettee/ml4pions/data/root_files/user.angerami.mc16_13TeV.900247.PG_singlepion_logE0p2to2000.e8312_e7400_s3170_r12383.v01-45-gaa27bcb_OutputStream/user.angerami.24559744.OutputStream._000440.root",
"/clusterfs/ml4hep/mpettee/ml4pions/data/root_files/user.angerami.mc16_13TeV.900247.PG_singlepion_logE0p2to2000.e8312_e7400_s3170_r12383.v01-45-gaa27bcb_OutputStream/user.angerami.24559744.OutputStream._000461.root",
"/clusterfs/ml4hep/mpettee/ml4pions/data/root_files/user.angerami.mc16_13TeV.900247.PG_singlepion_logE0p2to2000.e8312_e7400_s3170_r12383.v01-45-gaa27bcb_OutputStream/user.angerami.24559744.OutputStream._000430.root",
"/clusterfs/ml4hep/mpettee/ml4pions/data/root_files/user.angerami.mc16_13TeV.900247.PG_singlepion_logE0p2to2000.e8312_e7400_s3170_r12383.v01-45-gaa27bcb_OutputStream/user.angerami.24559744.OutputStream._000499.root",]

In [52]:
# Re-read the selected files with every branch and collect the rows whose truth
# energy matches one of the reference `energies` to within 1e-5.
events = []
for file in tqdm(selected_pion_files):
    # The context manager closes the ROOT file handle before the next file is opened.
    with uproot.open(file) as root_file:
        a = root_file["EventTree"].arrays(library = "np")
    df = apply_cuts(a)
    # Unwrap the per-event arrays once per file instead of once per energy.
    truth_e = df.truthPartE.explode()
    for energy in energies:
        matches = df[np.abs(truth_e - energy) < 0.00001]
        if len(matches) != 0:
            print("Match found! File = {}".format(file))
            print(matches.truthPartE)
            events.append(matches)

  0%|          | 0/5 [00:00<?, ?it/s]

Match found! File = /clusterfs/ml4hep/mpettee/ml4pions/data/root_files/user.angerami.mc16_13TeV.900247.PG_singlepion_logE0p2to2000.e8312_e7400_s3170_r12383.v01-45-gaa27bcb_OutputStream/user.angerami.24559744.OutputStream._000410.root
4109    [1.638227]
Name: truthPartE, dtype: object


 20%|██        | 1/5 [00:07<00:28,  7.25s/it]

Match found! File = /clusterfs/ml4hep/mpettee/ml4pions/data/root_files/user.angerami.mc16_13TeV.900247.PG_singlepion_logE0p2to2000.e8312_e7400_s3170_r12383.v01-45-gaa27bcb_OutputStream/user.angerami.24559744.OutputStream._000440.root
9082    [9.119415]
Name: truthPartE, dtype: object


 40%|████      | 2/5 [00:14<00:21,  7.20s/it]

Match found! File = /clusterfs/ml4hep/mpettee/ml4pions/data/root_files/user.angerami.mc16_13TeV.900247.PG_singlepion_logE0p2to2000.e8312_e7400_s3170_r12383.v01-45-gaa27bcb_OutputStream/user.angerami.24559744.OutputStream._000461.root
3985    [29.691723]
Name: truthPartE, dtype: object


 80%|████████  | 4/5 [00:28<00:07,  7.12s/it]

Match found! File = /clusterfs/ml4hep/mpettee/ml4pions/data/root_files/user.angerami.mc16_13TeV.900247.PG_singlepion_logE0p2to2000.e8312_e7400_s3170_r12383.v01-45-gaa27bcb_OutputStream/user.angerami.24559744.OutputStream._000430.root
8254    [499.59772]
Name: truthPartE, dtype: object
Match found! File = /clusterfs/ml4hep/mpettee/ml4pions/data/root_files/user.angerami.mc16_13TeV.900247.PG_singlepion_logE0p2to2000.e8312_e7400_s3170_r12383.v01-45-gaa27bcb_OutputStream/user.angerami.24559744.OutputStream._000499.root
3592    [100.29169]
Name: truthPartE, dtype: object


100%|██████████| 5/5 [00:35<00:00,  7.08s/it]

Match found! File = /clusterfs/ml4hep/mpettee/ml4pions/data/root_files/user.angerami.mc16_13TeV.900247.PG_singlepion_logE0p2to2000.e8312_e7400_s3170_r12383.v01-45-gaa27bcb_OutputStream/user.angerami.24559744.OutputStream._000499.root
8425    [995.07166]
Name: truthPartE, dtype: object





In [53]:
# Stack the matched rows collected in `events` into a single frame.
df = pd.concat(events)

In [54]:
# Display the matched reference events.
df

Unnamed: 0,index,runNumber,eventNumber,lumiBlock,coreFlags,mcEventNumber,mcChannelNumber,mcEventWeight,nTruthPart,G4PreCalo_n_EM,...,cluster_CENTER_LAMBDA,cluster_ISOLATION,cluster_ENERGY_DigiHSTruth,cluster_cell_ID,cluster_cell_E,cluster_hitsTruthIndex,cluster_hitsTruthE,deltaR,dR_pass,event_number
4109,6699,284500,8805468,1,0,8805468,900247,1.0,1,1,...,[16.759415],[1.0],[-999.0],"((754979952, 757113968, 757114480, 757115504, ...","((0.5277696, 0.058863826, 0.075782515, 0.01802...",((0)),((0.21558206)),[0.8515652624760468],[True],6699
9082,15185,284500,1744200,1,0,1744200,900247,1.0,1,4,...,[274.6379],[1.0],[-999.0],"((759182878, 759182880, 759182882, 759182366, ...","((0.19118226, 0.33541742, 0.03791072, 0.040082...",((0)),((0.4949144)),[0.8796527826676044],[True],15185
3985,6542,284500,6740038,1,0,6740038,900247,1.0,1,7,...,[44.152363],[1.0],[-999.0],"((749207614, 749207612, 749207616, 748946494, ...","((0.12675342, 0.005030082, 0.015049332, 0.1322...",((0)),((0.3957773)),[0.4413612757326127],[True],6542
8254,13421,284500,9453782,1,0,9453782,900247,1.0,1,0,...,"[1655.8729, 280.56177, 951.5224, 886.79816, 17...","[0.5583132, 0.6243533, 0.51502, 0.67100334, 0....","[-999.0, -999.0, -999.0, -999.0, -999.0, -999....","((1150091280, 1150107664, 1141702672, 11500915...","((59.199345, 0.426856, 6.5276375, 0.8904916, 7...","((0), (0), (0), (0), (0), (0), (0), (0))","((129.5406), (47.10336), (6.7947793), (2.45943...","[0.028639940728819663, 0.012352409796186377, 0...","[True, True, True, True, True, True, True, True]",13421
3592,5892,284500,6070536,1,0,6070536,900247,1.0,1,9,...,[652.2834],[1.0],[-999.0],"((752879606, 752879608, 752880118, 752879096, ...","((0.5086221, 0.051815987, 0.32129198, 0.112986...",((0)),((1.3566374)),[0.9779932768951003],[True],5892
8425,13863,284500,2149922,1,0,2149922,900247,1.0,1,0,...,"[3134.7249, 1667.5095, 2183.9583, 2426.0288]","[0.78668576, 0.6643609, 0.52810866, 0.75504273]","[-999.0, -999.0, -999.0, -999.0]","((1208846880, 1208830496, 1208863264, 12088473...","((101.894714, 3.6700416, 1.9256063, 0.15658991...","((0), (0), (0), (0))","((134.67303), (5.35034), (4.1942773), (0.33746...","[0.028517369896244267, 0.07940313518030324, 0....","[True, True, True, True]",13863


In [55]:
# Save the reference events for later debugging. `key` is passed by keyword because
# passing it positionally is deprecated in recent pandas releases. The array-valued
# (object) columns are pickled by PyTables, hence the performance warning.
df.to_hdf("df_transformer_debug.h5", key="df")

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block3_values] [items->Index(['truthPartPdgId', 'truthPartStatus', 'truthPartBarcode', 'truthPartPt',
       'truthPartE', 'truthPartMass', 'truthPartEta', 'truthPartPhi',
       'trackPt', 'trackP',
       ...
       'cluster_FIRST_ENG_DENS', 'cluster_CENTER_LAMBDA', 'cluster_ISOLATION',
       'cluster_ENERGY_DigiHSTruth', 'cluster_cell_ID', 'cluster_cell_E',
       'cluster_hitsTruthIndex', 'cluster_hitsTruthE', 'deltaR', 'dR_pass'],
      dtype='object', length=110)]

  pytables.to_hdf(


In [6]:
# Reload the frame from df_transformer_debug.h5 (the file name used by the to_hdf cell).
df = pd.read_hdf("df_transformer_debug.h5")

In [7]:
# List the column names of the reloaded frame.
df.keys()

Index(['index', 'runNumber', 'eventNumber', 'lumiBlock', 'coreFlags',
       'mcEventNumber', 'mcChannelNumber', 'mcEventWeight', 'nTruthPart',
       'G4PreCalo_n_EM',
       ...
       'cluster_CENTER_LAMBDA', 'cluster_ISOLATION',
       'cluster_ENERGY_DigiHSTruth', 'cluster_cell_ID', 'cluster_cell_E',
       'cluster_hitsTruthIndex', 'cluster_hitsTruthE', 'deltaR', 'dR_pass',
       'event_number'],
      dtype='object', length=129)

In [12]:
# Column names containing "Eta".
[column for column in df.columns if "Eta" in column]

['truthPartEta',
 'trackEta',
 'trackEta_PreSamplerB',
 'trackEta_PreSamplerE',
 'trackEta_EMB1',
 'trackEta_EMB2',
 'trackEta_EMB3',
 'trackEta_EME1',
 'trackEta_EME2',
 'trackEta_EME3',
 'trackEta_HEC0',
 'trackEta_HEC1',
 'trackEta_HEC2',
 'trackEta_HEC3',
 'trackEta_TileBar0',
 'trackEta_TileBar1',
 'trackEta_TileBar2',
 'trackEta_TileGap1',
 'trackEta_TileGap2',
 'trackEta_TileGap3',
 'trackEta_TileExt0',
 'trackEta_TileExt1',
 'trackEta_TileExt2',
 'AntiKt4EMTopoJetsEta',
 'AntiKt4LCTopoJetsEta',
 'AntiKt4TruthJetsEta',
 'cluster_Eta']

In [15]:
# Cluster kinematics next to track and truth eta/phi, plus the eight cluster
# variables of the `cluster_var` list, for the reference events.
df[['cluster_E','cluster_Eta','trackEta', 'truthPartEta', 'cluster_Phi','trackPhi', 'truthPartPhi','cluster_EM_PROBABILITY', 'cluster_HAD_WEIGHT', 'cluster_OOC_WEIGHT',
               'cluster_DM_WEIGHT', 'cluster_CENTER_MAG', 'cluster_FIRST_ENG_DENS', 
               'cluster_CENTER_LAMBDA', 'cluster_ISOLATION'
              ]]

Unnamed: 0,cluster_E,cluster_Eta,trackEta,truthPartEta,cluster_Phi,trackPhi,truthPartPhi,cluster_EM_PROBABILITY,cluster_HAD_WEIGHT,cluster_OOC_WEIGHT,cluster_DM_WEIGHT,cluster_CENTER_MAG,cluster_FIRST_ENG_DENS,cluster_CENTER_LAMBDA,cluster_ISOLATION
4109,[0.5934837],[-0.25973687],[-0.2805243],[1.607805],[-0.74060816],[-1.5919197],[-1.7966828],[0.49211332],[0.98009163],[1.1358624],[1.0],[1536.88],[4.742205e-07],[16.759415],[1.0]
9082,[0.82536364],[-0.6838584],[-0.628881],[-2.25457],[0.3976988],[-0.4802343],[-0.19429713],[0.06517576],[1.0400422],[1.9183998],[1.0502725],[2111.5806],[2.865492e-07],[274.6379],[1.0]
3985,[0.66128916],[-1.5136156],[-1.9086738],[-1.9151504],[3.1018682],[2.9050717],[2.8025804],[0.5116583],[0.97907114],[2.1411436],[1.288497],[4174.968],[8.01778e-07],[44.152363],[1.0]
8254,"[131.32864, 42.44467, 5.466173, 3.823332, 2.44...","[0.02523974, 0.032032937, 0.013351178, 0.01600...",[0.019860815],[0.019807491],"[-1.1544378, -1.1846708, -1.2848923, -0.943424...",[-1.1825681],[-1.1825013],"[0.000458727, 0.05385432, 0.0013848847, 0.0008...","[1.1211182, 1.0571781, 1.1965067, 1.2111669, 1...","[1.005254, 1.0529661, 1.0632713, 1.132284, 1.0...","[1.1125239, 1.0070257, 1.5859705, 1.2677855, 1...","[3159.362, 1778.8763, 2434.2268, 2389.474, 323...","[4.771859e-07, 9.329076e-06, 6.883655e-08, 5.0...","[1655.8729, 280.56177, 951.5224, 886.79816, 17...","[0.5583132, 0.6243533, 0.51502, 0.67100334, 0...."
3592,[1.7634326],[-1.6830969],[-2.182635],[-2.9042387],[-0.09693346],[0.7438593],[1.1400995],[0.001],[1.2155149],[1.8951625],[1.3286014],[4589.337],[5.8728745e-07],[652.2834],[1.0]
8425,"[119.363686, 7.9105496, 5.7105308, 3.256291]","[-1.0072887, -1.0556288, -1.0082572, -1.1504852]",[-0.97878176],[-0.9791123],"[-0.9378377, -0.9186243, -0.77520394, -1.1300832]",[-0.9386091],[-0.9387261],"[0.001, 0.0003290493, 0.0006894411, 0.02137577]","[1.1482576, 1.0881703, 1.2168978, 1.0106353]","[1.0178477, 1.1170146, 1.0841136, 1.0658295]","[1.0379997, 1.0457845, 1.065008, 1.1431478]","[5359.1133, 4091.796, 4526.9287, 4245.3477]","[3.2673833e-07, 7.6536173e-07, 1.10334085e-07,...","[3134.7249, 1667.5095, 2183.9583, 2426.0288]","[0.78668576, 0.6643609, 0.52810866, 0.75504273]"


In [10]:
# Truth energy next to the track-level variables for the reference events.
# The four commented-out names (shared hits / holes) are excluded from the selection.
df[[
    'truthPartE',
    'trackPt',
             'trackP',
             'trackMass',
             'trackEta',
             'trackPhi',
             'trackNumberOfPixelHits',
             'trackNumberOfSCTHits',
             'trackNumberOfPixelDeadSensors',
             'trackNumberOfSCTDeadSensors',
#              'trackNumberOfPixelSharedHits',
#              'trackNumberOfSCTSharedHits',
#              'trackNumberOfPixelHoles',
#              'trackNumberOfSCTHoles',
             'trackNumberOfInnermostPixelLayerHits',
             'trackNumberOfNextToInnermostPixelLayerHits',
             'trackExpectInnermostPixelLayerHit',
             'trackExpectNextToInnermostPixelLayerHit',
             'trackNumberOfTRTHits',
             'trackNumberOfTRTOutliers',
             'trackChiSquared',
             'trackNumberDOF',
             'trackD0',
             'trackZ0'
            ]]

Unnamed: 0,truthPartE,trackPt,trackP,trackMass,trackEta,trackPhi,trackNumberOfPixelHits,trackNumberOfSCTHits,trackNumberOfPixelDeadSensors,trackNumberOfSCTDeadSensors,trackNumberOfInnermostPixelLayerHits,trackNumberOfNextToInnermostPixelLayerHits,trackExpectInnermostPixelLayerHit,trackExpectNextToInnermostPixelLayerHit,trackNumberOfTRTHits,trackNumberOfTRTOutliers,trackChiSquared,trackNumberDOF,trackD0,trackZ0
4109,[1.638227],[0.56499124],[0.5873681],[0.13957007],[-0.2805243],[-1.5919197],[3],[7],[0],[0],[0],[1],[1],[1],[25],[0],[45.287254],[33],[-7.4432144],[87.2737]
9082,[9.119415],[0.59222865],[0.7132502],[0.13957018],[-0.628881],[-0.4802343],[5],[8],[0],[0],[2],[1],[1],[1],[28],[0],[48.939526],[41],[6.6680737],[27.934269]
3985,[29.691723],[0.7130422],[2.4572914],[0.13957112],[-1.9086738],[2.9050717],[3],[10],[0],[0],[0],[1],[1],[1],[0],[0],[17.507769],[11],[-3.952645],[22.175701]
8254,[499.59772],[436.06116],[436.14716],[0.16326419],[0.019860815],[-1.1825681],[5],[8],[0],[0],[2],[1],[1],[1],[31],[0],[40.224117],[44],[0.0017354734],[25.76988]
3592,[100.29169],[1.1556271],[5.1901484],[0.13957529],[-2.182635],[0.7438593],[5],[10],[0],[0],[2],[1],[1],[1],[0],[0],[12.896353],[15],[9.772532],[-44.83223]
8425,[995.07166],[1118.2161],[1698.0002],[0.36632022],[-0.97878176],[-0.9386091],[4],[9],[0],[0],[1],[1],[1],[1],[20],[0],[23.839802],[32],[-0.004356428],[-1.2269732]


In [None]:
# Build an uncut frame from `a`, i.e. whatever arrays the most recently executed
# loading cell left in the kernel (hidden notebook state). The single-track cut is disabled.
df = pd.DataFrame(a)
# df = df[(df.nTrack == 1)]

In [None]:
# Mean of the exploded cluster_nCells column. cluster_nCells is not part of the
# `variables` list, so `a` must have been loaded without that branch filter.
np.mean(df.cluster_nCells.explode())

In [None]:
# Evaluate the cluster-track deltaR expression for every row and discard the result.
# NOTE(review): presumably a debugging aid to find rows on which the expression
# raises (apply_cuts hides such errors behind its except) — confirm or delete.
for row in df.index:
    np.sqrt((df['cluster_Eta'][row].astype('float') - 
             df['trackEta'][row].astype('float'))**2 + 
                           delta_phi(df['cluster_Phi'][row].astype('float'), 
                                     df['trackPhi'][row].astype('float'))**2)

In [None]:
# Replace the per-event arrays in these columns by their exploded values.
for column in ("truthPartE", "trackPt", "trackChiSquared"):
    df[column] = df[column].explode()

In [None]:
# Read the first 50 pion files (all branches) and add per-event nCluster / pion_eta columns.
dfs = []
for file in tqdm(pion_files[:50]):
    a = uproot.open(file)["EventTree"].arrays(library = "np")
    dff = pd.DataFrame(a)
    # (target column, source column): both are filled from the exploded source.
    for target, source in (("nCluster", "nCluster"), ("pion_eta", "truthPartEta")):
        dff[target] = dff[source].explode()
    dfs.append(dff)
df_pion = pd.concat(dfs)

In [None]:
# Read the first 50 pi0 files (all branches) and add per-event nCluster / pion_eta columns.
dfs = []
for file in tqdm(pi0_files[:50]):
    a = uproot.open(file)["EventTree"].arrays(library = "np")
    dff = pd.DataFrame(a)
    dff["nCluster"] = dff.nCluster.explode()
    # Unlike the charged-pion cell, only the first truthPartEta entry of each event is used.
    dff["pion_eta"] = np.array([dff.truthPartEta[row][0] for row in dff.index])
    dfs.append(dff)
df_pi0 = pd.concat(dfs)

In [None]:
from scipy import stats

# Mean number of clusters per event in 5 equal-width bins of true |eta|,
# separately for charged and neutral pions.
binned_stats_pion = stats.binned_statistic(np.array(np.abs(df_pion.pion_eta), dtype=float),
                                           np.array(df_pion.nCluster, dtype=float),
                                           statistic='mean', bins=5)

binned_stats_pi0 = stats.binned_statistic(np.array(np.abs(df_pi0.pion_eta), dtype=float),
                                          np.array(df_pi0.nCluster, dtype=float),
                                          statistic='mean', bins=5)

plt.figure(dpi=200)
for binned, label in ((binned_stats_pion, r"$\pi^{\pm}$"),
                      (binned_stats_pi0, r"$\pi^{0}$")):
    # Draw each bin's value at the bin centre; bin_edges[:-1] would place it at the left edge.
    bin_centers = 0.5*(binned.bin_edges[:-1] + binned.bin_edges[1:])
    plt.plot(bin_centers, binned.statistic, label=label)
plt.xlabel(r"True Pion $|\eta|$");
# The statistic computed above is the mean, so the label says "Mean" (it used to say "Median").
plt.ylabel("Mean Number of Clusters per Pion");
plt.legend();

In [None]:
# Frame of the arrays currently held in `a`, restricted to single-track events.
# None of the later apply_cuts steps (deltaR, track pT) are applied here.
df = pd.DataFrame(a).loc[lambda frame: frame.nTrack == 1]

In [None]:
# Report every one of the first 10 pion files that contains a single-track event
# with truthPartE < 0.3.
for file in tqdm(pion_files[:10]):
    a = uproot.open(file)["EventTree"].arrays(variables, library = "np")
    df = pd.DataFrame(a)
    low_energy_single_track = (df.truthPartE < 0.3) & (df.nTrack == 1)
    if low_energy_single_track.any():
        print("Match found!")

In [None]:
# Single-track events with truthPartE < 0.3 in the pion files from index 400 on.
frames = [pd.DataFrame(uproot.open(file)["EventTree"].arrays(variables, library = "np"))
          for file in tqdm(pion_files[400:])]
df = pd.concat(frames).reset_index()
low_energy_single_track = df[(df.truthPartE < 0.3) & (df.nTrack == 1)]
print(len(low_energy_single_track))
print(low_energy_single_track.truthPartE)

In [None]:
# Truth energies of the single-track events with truthPartE < 0.3.
df[(df.truthPartE < 0.3) & (df.nTrack == 1)].truthPartE

In [None]:
# Single-track events with truthPartE < 0.3 in pion files 300-399.
frames = [pd.DataFrame(uproot.open(file)["EventTree"].arrays(variables, library = "np"))
          for file in tqdm(pion_files[300:400])]
df = pd.concat(frames).reset_index()
low_energy_single_track = df[(df.truthPartE < 0.3) & (df.nTrack == 1)]
print(len(low_energy_single_track))
print(low_energy_single_track.truthPartE)

In [None]:
# Single-track events with truthPartE < 0.3 in the first 300 pion files.
frames = [pd.DataFrame(uproot.open(file)["EventTree"].arrays(variables, library = "np"))
          for file in tqdm(pion_files[:300])]
df = pd.concat(frames).reset_index()
low_energy_single_track = df[(df.truthPartE < 0.3) & (df.nTrack == 1)]
print(len(low_energy_single_track))
print(low_energy_single_track.truthPartE)

In [None]:
# How many events with truthPartE < 0.25 have each nTrack value.
df[(df.truthPartE < 0.25)].nTrack.value_counts()

In [None]:
# Truth energies of the single-track events with truthPartE < 0.25.
df[(df.truthPartE < 0.25) & (df.nTrack == 1)].truthPartE

In [None]:
# Single-track events with truthPartE < 0.25 in the pion files from index 400 on.
frames = [pd.DataFrame(uproot.open(file)["EventTree"].arrays(variables, library = "np"))
          for file in tqdm(pion_files[400:])]
df_test = pd.concat(frames).reset_index()
low_energy_single_track = df_test[(df_test.truthPartE < 0.25) & (df_test.nTrack == 1)]
print(len(low_energy_single_track))
print(low_energy_single_track.truthPartE)

In [None]:
# Single-track events with truthPartE < 0.25 across every pion file.
frames = [pd.DataFrame(uproot.open(file)["EventTree"].arrays(variables, library = "np"))
          for file in tqdm(pion_files)]
df_all = pd.concat(frames).reset_index()
low_energy_single_track = df_all[(df_all.truthPartE < 0.25) & (df_all.nTrack == 1)]
print(len(low_energy_single_track))
print(low_energy_single_track.truthPartE)

In [None]:
# Number of single-track events with truthPartE < 0.3 in the current `df`.
len(df[(df.truthPartE < 0.3) & (df.nTrack == 1)])

In [None]:
# Percentage of the current `df` that 6 events correspond to.
# NOTE(review): the 6 is hard-coded — presumably the count shown by the previous cell; confirm.
100*6/len(df)

In [None]:
# Truth energies of the single-track events with truthPartE < 0.3.
df[(df.truthPartE < 0.3) & (df.nTrack == 1)].truthPartE

In [None]:
# One frame with the `variables` branches of every pion file, re-indexed 0..N-1
# (the per-file row labels are kept in the "index" column added by reset_index).
frames = [pd.DataFrame(uproot.open(file)["EventTree"].arrays(variables, library = "np"))
          for file in tqdm(pion_files)]
df_2 = pd.concat(frames).reset_index()

In [None]:
# Truth energies of the single-track events with truthPartE < 0.3 in df_2.
df_2[(df_2.truthPartE < 0.3) & (df_2.nTrack == 1)].truthPartE

In [None]:
# Load the first pion file, keep single-track events and print the smallest truthPartE.
a = uproot.open(pion_files[0])["EventTree"].arrays(variables, library = "np")
df = pd.DataFrame(a)
df = df[(df.nTrack == 1)]
print(df.truthPartE.min())

In [None]:
# Smallest truthPartE in the current `df`.
df.truthPartE.min()

In [None]:
# Run the full selection on the arrays currently held in `a`.
df = apply_cuts(a)

In [None]:
# Collect the cluster-track deltaR values of the first 10 pion files after cuts and
# histogram them between 0.5 and 1.5, the region around apply_cuts' default cut of 1.2.
# Nothing is written to disk here, so the output-path code copied from the conversion
# loop (including its os.makedirs call) has been removed.
dr_per_file = []
dfs = []
for file in tqdm(pion_files[:10]):
    # The context manager closes the ROOT file handle before the next file is opened.
    with uproot.open(file) as root_file:
        a = root_file["EventTree"].arrays(variables, library = "np")
    df = apply_cuts(a)
    # df.deltaR holds one array per event; join them into one 1-D array per file.
    # (The intermediate used to be named `list`, which shadowed the builtin.)
    dr_per_file.append(np.concatenate(df.deltaR.to_numpy().flatten(), axis=0))
    dfs.append(df)

all_drs = np.concatenate(dr_per_file, axis=0)

# Same histogram twice: linear and logarithmic y axis.
for yscale in ("linear", "log"):
    plt.figure(dpi=200)
    plt.hist(all_drs, bins=np.linspace(0.5,1.5,30));
    plt.yscale(yscale)
    plt.xlabel(r"$\Delta R$(cluster,track)")

In [None]:
# Merge the per-file selected frames collected in `dfs`.
df = pd.concat(dfs)

In [None]:
# Log-scale histogram of all track pT values in `df`.
plt.figure(dpi=150)
# trackPt holds one array per event; join them into a single 1-D array first.
track_pt_values = np.concatenate(df.trackPt.to_numpy().flatten(), axis=0)
plt.hist(track_pt_values, bins=10);
plt.xlabel(r"Track $p_T$")
plt.yscale("log")

### Scaling

In [None]:
# Means and standard deviations of the model inputs, measured on the first
# `n_files` pion files after cuts. Track pT, truth energy, cluster energy and cell
# energy are summarised in log10; z0, eta and phi are summarised as they are.
cluster_cell_e = []
cluster_e = []
cluster_eta = []
cluster_phi = []
track_pt = []
track_z0 = []
track_eta = []
track_phi = []
truth_part_e = []

n_files = 10

for file in tqdm(pion_files[:n_files]):
    # The context manager closes the ROOT file handle before the next file is opened.
    with uproot.open(file) as root_file:
        a = root_file["EventTree"].arrays(variables, library = "np")
    df = apply_cuts(a)
    track_pt.append(df.trackPt.explode())
    track_z0.append(df.trackZ0)
    track_eta.append(df.trackEta)
    track_phi.append(df.trackPhi)
    truth_part_e.append(df.truthPartE.explode())
    for i in range(len(df)):
        # Every cluster of a kept event is used, whether or not it passed the deltaR cut.
        for cluster in range(df.nCluster.iloc[i]):
            cluster_cell_e.append(np.array(np.log10(df.cluster_cell_E.iloc[i][cluster])))
            cluster_e.append(np.array(np.log10(df.cluster_E.iloc[i][cluster])))
            cluster_eta.append(np.array(df.cluster_Eta.iloc[i][cluster]))
            cluster_phi.append(np.array(df.cluster_Phi.iloc[i][cluster]))

# Build each flattened sample once and reuse it for both the mean and the std.
log_track_pt = [np.log10(x) for x in np.concatenate(track_pt)]
print("Track pT | mean: {} | std: {}".format(np.mean(log_track_pt), np.std(log_track_pt)))
all_track_z0 = np.concatenate(track_z0)
print("Track z0 | mean: {} | std: {}".format(np.mean(all_track_z0), np.std(all_track_z0)))
all_track_eta = np.concatenate(track_eta)
print("Track eta | mean: {} | std: {}".format(np.mean(all_track_eta), np.std(all_track_eta)))
all_track_phi = np.concatenate(track_phi)
print("Track phi | mean: {} | std: {}".format(np.mean(all_track_phi), np.std(all_track_phi)))

log_truth_part_e = [np.log10(x) for x in np.concatenate(truth_part_e)]
print("Truth particle E | mean: {} | std: {}".format(np.mean(log_truth_part_e), np.std(log_truth_part_e)))

all_cluster_cell_e = np.concatenate(cluster_cell_e)
print("Cluster cell e")
print(np.mean(all_cluster_cell_e))
print(np.std(all_cluster_cell_e))

print("Cluster e")
print(np.mean(cluster_e))
print(np.std(cluster_e))

print("Cluster eta")
print(np.mean(cluster_eta))
print(np.std(cluster_eta))

print("Cluster phi")
print(np.mean(cluster_phi))
print(np.std(cluster_phi))

In [None]:
# Normalisation constants (mean / std) for the track, truth and cluster inputs.
# NOTE(review): presumably copied by hand from the printout of the statistics cell
# above; there track pT, truth energy, cluster energy and cell energy are summarised
# in log10. Re-derive these values if the selection or the file list changes.
scales = {
    'track_pt_mean': 1.633278727,
    'track_pt_std': 0.8481947183,
    'track_z0_mean': 0.08022017,
    'track_z0_std': 42.53320004,    
    'track_eta_mean': -0.00563187,
    'track_eta_std': 1.35242735,    
    'track_phi_mean': 0.00206431,
    'track_phi_std': 1.81240248,   
    'truth_part_e_mean': 1.92469358, 
    'truth_part_e_std': 0.8289864, 
    'cluster_cell_e_mean': -1.0121697,
    'cluster_cell_e_std': 0.818357, 
    'cluster_e_mean': 0.89923394,
    'cluster_e_std': 1.0585934,
    'cluster_eta_mean': 0.016195267,
    'cluster_eta_std': 1.3400925,
    'cluster_phi_mean': 0.0050816955,
    'cluster_phi_std': 1.8100655,
         }

In [None]:
# Print the mean and standard deviation of each cell-geometry variable.
cell_geo_file = "/clusterfs/ml4hep/mpettee/ml4pions/data/cell_geo.root"
# The context manager closes the ROOT file once the tree has been read into pandas.
with uproot.open(cell_geo_file) as root_file:
    df = root_file['CellGeo'].arrays(library='pd')
df = df.reset_index()
# Named cell_geo_vars rather than `vars`, which would shadow the builtin vars().
cell_geo_vars = ['cell_geo_sampling', 'cell_geo_eta', 'cell_geo_phi', 'cell_geo_rPerp', 'cell_geo_deta', 'cell_geo_dphi']
for var in cell_geo_vars:
    print(var)
    print(np.mean(df[var]))
    print(np.std(df[var]))

In [None]:
# Add mean / std constants for the six cell-geometry variables.
# NOTE(review): presumably copied by hand from the printout of the CellGeo cell above — confirm.
scales.update({
    'cell_geo_sampling_mean': 3.8827391420197177,
    'cell_geo_sampling_std': 3.9588233603576204,
    'cell_geo_eta_mean': 0.0005979097,
    'cell_geo_eta_std': 1.4709069,
    'cell_geo_phi_mean': -2.8938382e-05,
    'cell_geo_phi_std': 1.813651,
    'cell_geo_rPerp_mean': 1478.9285,
    'cell_geo_rPerp_std': 434.60815,
    'cell_geo_deta_mean': 0.026611786,
    'cell_geo_deta_std': 0.03396141,
    'cell_geo_dphi_mean': 0.068693615,
    'cell_geo_dphi_std': 0.038586758,  
})

In [None]:
# Display the complete dictionary of normalisation constants.
scales

In [None]:
# Node features: 
# {
# np.log10(cluster_cell_E), 
# cell_geo_sampling, 
# cell_geo_eta, 
# cell_geo_phi, 
# cell_geo_rPerp, 
# cell_geo_deta, 
# cell_geo_dphi
#}

In [None]:
# Per-feature normalisation constants for the node inputs: the cluster cell energy
# entry first, then the six cell_geo_* entries, with both lists in the same order.
node_means = [scales[key] for key in (
    "cluster_cell_e_mean",
    "cell_geo_sampling_mean",
    "cell_geo_eta_mean",
    "cell_geo_phi_mean",
    "cell_geo_rPerp_mean",
    "cell_geo_deta_mean",
    "cell_geo_dphi_mean",
)]

node_stds = [scales[key] for key in (
    "cluster_cell_e_std",
    "cell_geo_sampling_std",
    "cell_geo_eta_std",
    "cell_geo_phi_std",
    "cell_geo_rPerp_std",
    "cell_geo_deta_std",
    "cell_geo_dphi_std",
)]

In [None]:
# Display the node-feature means.
node_means

In [None]:
# Display the node-feature standard deviations.
node_stds