In [1]:
import matplotlib.pyplot as plt
import uproot
import numpy as np
import pandas as pd

from pathlib import Path
from sklearn.model_selection import train_test_split


events_num_identifier = "_15000_events"

subdir = "wSim_wReco"
save_data = False
path = "/nfs/dust/belle2/user/axelheim/MC_studies/my6modes/"
root_path = path + "rootfiles/" + subdir + '/'
file_pi = uproot.open((root_path + "pi_nTuples_mode1_10_events.root"))

#file_raw = uproot.open(root_path + "mode0_17_events.root")

In [2]:
file_pi["variables"].keys()


['__experiment__',
 '__run__',
 '__event__',
 '__candidate__',
 '__ncandidates__',
 '__weight__',
 'M',
 'x',
 'y',
 'z',
 'px',
 'py',
 'pz',
 'pt',
 'p',
 'E',
 'kaonID',
 'pionID',
 'isSignal',
 'mcErrors',
 'mcPDG',
 'mcPhotos',
 'mcPrimary',
 'mcInitial',
 'charge',
 'uniqueParticleIdentifier',
 'genMotherID',
 'genMotherPDG',
 'genMotherPDG__bo0__bc',
 'genMotherPDG__bo1__bc',
 'genMotherPDG__bo2__bc',
 'genMotherPDG__bo3__bc']

In [3]:
def conditions(s):
    label = -1
    if int(s['genPDG0']) == 0:
        label = 0 # background, cause not related to MC Particles
    elif (int(s['genPDG0']) == Bs_pdg) or (int(s['genPDG1']) == Bs_pdg) or (int(s['genPDG2']) == Bs_pdg) or (int(s['genPDG3']) == Bs_pdg):
        label = 2 # Bs
    elif (int(s['genPDG0']) == Hc_pdg) or (int(s['genPDG1']) == Hc_pdg) or (int(s['genPDG2']) == Hc_pdg) or (int(s['genPDG3']) == Hc_pdg):    
        label = 3 # Hc
    else: 
        label = 1 # X
    
    return label

In [4]:
extraInput_list = [["charge"],["pionID","kaonID"],["dx","dy","dz"],["z0"],["tanlambda"],
                   ["nCDCHits"],["trackNECLClusters"]] #[""] for only 4mom

Bs_pdg_list = [-511, 511,-521,-521,521,521]
Hc_pdg_list = [-411, 411, -411, -421, 411, 423]
dfs =[]
for mode in range(6):
    print("mode:",mode,mode,mode,mode,mode,mode,mode,mode,mode,mode,mode,mode,mode,mode,mode,mode,mode,mode,mode,mode,mode)
    
    Bs_pdg = Bs_pdg_list[mode]
    Hc_pdg = Hc_pdg_list[mode]
    
    nm = f"_nTuples_mode{mode}" + events_num_identifier + ".root"
    print((root_path + "gamma" + nm))
    raw_nTuples = uproot.concatenate([(root_path + "gamma" + nm), (root_path + "pi" + nm), (root_path + "K" + nm)])
    df = pd.DataFrame({"event" : np.array(raw_nTuples["__event__"]),
       "mcPDG" : np.array(raw_nTuples["mcPDG"]),
       "mcPrimary":np.array(raw_nTuples["mcPrimary"]),
       "isSignal":np.array(raw_nTuples["isSignal"]),
       #"mcInitial":np.array(raw_nTuples["mcInitial"]),
       "px" :  np.array(raw_nTuples["px"]),
       "py" : np.array(raw_nTuples["py"]),
       "pz" : np.array(raw_nTuples["pz"]),
       "E" : np.array(raw_nTuples["E"]),
                   
       "dx" : np.array(raw_nTuples["dx"]),
       "dy" : np.array(raw_nTuples["dy"]),
       "dz" : np.array(raw_nTuples["dz"]),
       "prodVertexX" : np.array(raw_nTuples["prodVertexX"]),
       "prodVertexY" : np.array(raw_nTuples["prodVertexY"]),
       "prodVertexZ" : np.array(raw_nTuples["prodVertexZ"]),
       "z0" : np.array(raw_nTuples["z0"]),
       "d0" : np.array(raw_nTuples["d0"]),
       "tanlambda" : np.array(raw_nTuples["tanlambda"]),
       "nCDCHits" : np.array(raw_nTuples["nCDCHits"]),
       "trackNECLClusters" : np.array(raw_nTuples["trackNECLClusters"]),

       "pionID":np.array(raw_nTuples["pionID"]),   
       "kaonID": np.array(raw_nTuples["kaonID"]),
       "M" : np.array(raw_nTuples["M"]),
       "charge" : np.array(raw_nTuples["charge"]),
       #"fromY4S" : hasAncestor_300553,
       "genPDG0" : np.array(raw_nTuples["genMotherPDG__bo0__bc"]),
       "genPDG1" : np.array(raw_nTuples["genMotherPDG__bo1__bc"]),
       "genPDG2" : np.array(raw_nTuples["genMotherPDG__bo2__bc"]),
       "genPDG3" :  np.array(raw_nTuples["genMotherPDG__bo3__bc"]) })
    df['label'] = df.apply(conditions, axis=1)
    

    ## data preprocessing, dealing with NaN
    imputelist = [["pionID",-1.],["kaonID",-1.],["z0",10.],["tanlambda",10.],["nCDCHits",-1.],
                  ["trackNECLClusters",-1.]]
    for impute in imputelist:
        column_name = impute[0]
        impute_val = impute[1]
        mask = df[column_name].isna() == 1
        df.loc[mask, column_name] = impute_val

    
    #print(df)
    
    dfs.append(df)
    print('\n')
    
    # labels: bg=0 , X=1 , Bs=2 , Hc=3
    label_cut_name_list = [["BsHc",1],["XHc",2],["BsX",3],["BsXHc",-10]]
    for label_cut_names in label_cut_name_list:
        label_cut = label_cut_names[1]
        label_cut_name = label_cut_names[0]
        print(label_cut,label_cut_name,label_cut,label_cut_name,label_cut,label_cut_name)
        #print(df.describe())

        df_cut = df.copy()
        if label_cut_name != "BsXHc":
            df_cut = df_cut[df_cut.label != label_cut]
            
            #important to fix labels (no gap in label classes allowed for NN)
            # only first two cases need relabeling, for third BsX deletion is sufficient
            if label_cut<3: 
                mask = df_cut.label == 3
                column_name = 'label'
                df_cut.loc[mask, column_name] = label_cut
                #df_cut[df_cut.label == 2] = label_cut

       # print(df_cut.describe())
        
        # read out data to save it
        #event_tmp = df_cut.copy()        
        #event_tmp = event_tmp[event_tmp.event == 1]
        #print(event_tmp.describe())

        numFSPs_df = pd.DataFrame({'count' : df_cut.groupby( [ "event"] ).size()}).reset_index()
        minFSPs = numFSPs_df["count"].min()
        maxFSPs = numFSPs_df["count"].max()
        print("minFSPs:",minFSPs)
        print("maxFSPs:",maxFSPs,'\n')
        
        df_cut['numFSPs'] = df_cut.groupby('event')['event'].transform('count')
        
        
        #TODO!!
        for num_FSPs_toData in range(minFSPs, maxFSPs+1):
            #print("num_FSPs_toData:",num_FSPs_toData)

            df_num_subset = df_cut.copy()
            df_num_subset = df_num_subset[df_num_subset['numFSPs'] == num_FSPs_toData]
        

            numEvents = df_num_subset.event.nunique()
            print("numEvents:",numEvents)
            print("num_FSPs_toData:",num_FSPs_toData)  
            if numEvents == 0:
                print("skipped because empty \n")
                continue
            
            if numEvents < 10:
                print("skipped because <10 events \n")
                continue

            for extraInput in extraInput_list:
                #extraInput_names = extraInput[0]
                num_features = 4 + len(extraInput) 
                
                special_dataLabel = str(extraInput)
                
                leaves = np.zeros((numEvents, num_FSPs_toData,  num_features))  
                SA_target =  np.zeros((numEvents, num_FSPs_toData))
                global_tag = np.chararray((numEvents, num_FSPs_toData + 1), itemsize=30)

                event_list = df_num_subset[df_num_subset["numFSPs"] == num_FSPs_toData]["event"].unique()
                #print("len(event_list):",len(event_list))
                for i in range(numEvents):

                    event_iter = event_list[i]

                    global_tag_masterInfo = str(mode) + "_evt" + str(event_iter)
                    global_tag[i,-1] = global_tag_masterInfo
                    #print("global_tag[i,-1]:",global_tag[i,-1])
                    #print("i:",i,"event_iter:",event_iter)
                   
                    event_df = df_num_subset[df_num_subset.event == event_iter]

                    for j in range(num_FSPs_toData):
                        #print("numParticle:",j)
                        particle = event_df.iloc[j]

                        #print(particle["mcPDG"],particle["px"],particle["py"],particle["pz"],particle["E"])
                        leaves[i,j,0] = particle["px"]
                        leaves[i,j,1] = particle["py"]
                        leaves[i,j,2] = particle["pz"]
                        leaves[i,j,3] = particle["E"]
                        for l in range(4, 4 + len(extraInput)):
                            leaves[i,j,l] = particle[extraInput[l-4]]
                            

                        global_tag_Info = str((particle["mcPDG"])) + "_Sg:"
                        global_tag_Info += str((particle["isSignal"])) + "_mcP:" + str((particle["mcPrimary"]))
                        global_tag[i,j] = global_tag_Info
                        
                        SA_target[i,j] = int(particle["label"])

                    del event_df

                for idx in np.arange(leaves.shape[0]):   # arange is like range but gives ndarray instead of list
                    perms = np.random.permutation(leaves.shape[1])

                    leaves[idx,:] = leaves[idx,perms]
                    SA_target[idx,:] = SA_target[idx,perms]
                    global_tag[idx,0:-1] = global_tag[idx,perms]

                data_subdir = subdir + "_" + label_cut_name + events_num_identifier + special_dataLabel + "/"        
                data_dir = Path("/nfs/dust/belle2/user/axelheim/MC_studies/my6modes/data/" + data_subdir)    
                data_dir.mkdir(parents=True, exist_ok=True)

                print(global_tag)
                train_ratio = 0.75
                validation_ratio = 0.15
                test_ratio = 0.10

                #print("leaves.shape:",leaves.shape)
                #print("SA_target.shape:",leaves.shape)
                #print("global_tag.shape:",leaves.shape)

                x=leaves
                y=SA_target
                z=global_tag

                x_train, x_test, y_train, y_test, z_train, z_test = train_test_split(x, y, z, test_size=1 - train_ratio, shuffle=False)
                x_val, x_test, y_val, y_test, z_val, z_test = train_test_split(x_test, y_test, z_test, test_size=test_ratio/(test_ratio + validation_ratio), shuffle=False) 

                if save_data==True:
                    np.save(data_dir / "leaves_train_{}_FSP{}.npy".format(mode,num_FSPs_toData), x_train)
                    np.save(data_dir / "is_left_arr_train_{}_FSP{}.npy".format(mode,num_FSPs_toData), y_train)
                    np.save(data_dir / "global_tag_train_{}_FSP{}.npy".format(mode,num_FSPs_toData), z_train)

                    np.save(data_dir / "leaves_val_{}_FSP{}.npy".format(mode,num_FSPs_toData), x_val)
                    np.save(data_dir / "is_left_arr_val_{}_FSP{}.npy".format(mode,num_FSPs_toData), y_val)
                    np.save(data_dir / "global_tag_val_{}_FSP{}.npy".format(mode,num_FSPs_toData), z_val)

                    np.save(data_dir / "leaves_test_{}_FSP{}.npy".format(mode,num_FSPs_toData), x_test)
                    np.save(data_dir / "is_left_arr_test_{}_FSP{}.npy".format(mode,num_FSPs_toData), y_test)
                    np.save(data_dir / "global_tag_test_{}_FSP{}.npy".format(mode,num_FSPs_toData), z_test)

                print("Data saved to:", data_dir,'is', save_data ,'\n')
                print("")
                #del df_num_subset

        
        del df_cut
        
        

mode: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
/nfs/dust/belle2/user/axelheim/MC_studies/my6modes/rootfiles/wSim_wReco/gamma_nTuples_mode0_15000_events.root


1 BsHc 1 BsHc 1 BsHc
minFSPs: 1
maxFSPs: 19 

numEvents: 1
num_FSPs_toData: 1
skipped because <10 events 

numEvents: 29
num_FSPs_toData: 2
[[b'22.0_Sg:1.0_mcP:1.0' b'-321.0_Sg:1.0_mcP:1.0' b'0_evt182']
 [b'-211.0_Sg:0.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0' b'0_evt536']
 [b'22.0_Sg:1.0_mcP:1.0' b'-211.0_Sg:1.0_mcP:1.0' b'0_evt1295']
 [b'-211.0_Sg:0.0_mcP:0.0' b'-321.0_Sg:1.0_mcP:1.0' b'0_evt2813']
 [b'321.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0' b'0_evt4015']
 [b'211.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:0.0' b'0_evt4864']
 [b'22.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:0.0' b'0_evt5246']
 [b'22.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0' b'0_evt5878']
 [b'nan_Sg:nan_mcP:nan' b'-211.0_Sg:1.0_mcP:1.0' b'0_evt7121']
 [b'211.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0' b'0_evt7173']
 [b'22.0_Sg:1.0_mcP:1.0' b'nan_Sg:nan_mcP:nan' b'0_evt8957']
 [b'nan

[[b'-321.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0' b'0_evt182']
 [b'-211.0_Sg:0.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0' b'0_evt536']
 [b'-211.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0' b'0_evt1295']
 [b'-321.0_Sg:1.0_mcP:1.0' b'-211.0_Sg:0.0_mcP:0.0' b'0_evt2813']
 [b'321.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0' b'0_evt4015']
 [b'22.0_Sg:1.0_mcP:0.0' b'211.0_Sg:1.0_mcP:1.0' b'0_evt4864']
 [b'22.0_Sg:1.0_mcP:0.0' b'22.0_Sg:1.0_mcP:1.0' b'0_evt5246']
 [b'22.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0' b'0_evt5878']
 [b'-211.0_Sg:1.0_mcP:1.0' b'nan_Sg:nan_mcP:nan' b'0_evt7121']
 [b'211.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0' b'0_evt7173']
 [b'nan_Sg:nan_mcP:nan' b'22.0_Sg:1.0_mcP:1.0' b'0_evt8957']
 [b'-211.0_Sg:1.0_mcP:1.0' b'nan_Sg:nan_mcP:nan' b'0_evt9132']
 [b'22.0_Sg:1.0_mcP:1.0' b'321.0_Sg:1.0_mcP:1.0' b'0_evt10373']
 [b'-211.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0' b'0_evt11577']
 [b'22.0_Sg:1.0_mcP:1.0' b'-211.0_Sg:1.0_mcP:1.0' b'0_evt12128']
 [b'22.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0' b'

[[b'211.0_Sg:1.0_mcP:1.0' b'321.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0'
  b'0_evt103']
 [b'-211.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0' b'211.0_Sg:1.0_mcP:1.0'
  b'0_evt195']
 [b'22.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:0.0' b'22.0_Sg:1.0_mcP:1.0'
  b'0_evt260']
 [b'22.0_Sg:1.0_mcP:1.0' b'nan_Sg:nan_mcP:nan' b'211.0_Sg:1.0_mcP:1.0'
  b'0_evt265']
 [b'22.0_Sg:1.0_mcP:1.0' b'-211.0_Sg:1.0_mcP:1.0' b'211.0_Sg:1.0_mcP:1.0'
  b'0_evt436']
 [b'321.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0'
  b'0_evt542']
 [b'211.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0' b'-321.0_Sg:1.0_mcP:1.0'
  b'0_evt625']
 [b'22.0_Sg:1.0_mcP:1.0' b'321.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0'
  b'0_evt667']
 [b'22.0_Sg:1.0_mcP:1.0' b'nan_Sg:nan_mcP:nan' b'nan_Sg:nan_mcP:nan'
  b'0_evt707']
 [b'11.0_Sg:0.0_mcP:0.0' b'22.0_Sg:1.0_mcP:1.0' b'321.0_Sg:1.0_mcP:1.0'
  b'0_evt741']
 [b'211.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:0.0' b'-211.0_Sg:1.0_mcP:1.0'
  b'0_evt742']
 [b'211.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0

[[b'22.0_Sg:1.0_mcP:1.0' b'321.0_Sg:1.0_mcP:1.0' b'211.0_Sg:1.0_mcP:1.0'
  b'0_evt103']
 [b'-211.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0' b'211.0_Sg:1.0_mcP:1.0'
  b'0_evt195']
 [b'22.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:0.0' b'22.0_Sg:1.0_mcP:1.0'
  b'0_evt260']
 [b'211.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0' b'nan_Sg:nan_mcP:nan'
  b'0_evt265']
 [b'22.0_Sg:1.0_mcP:1.0' b'211.0_Sg:1.0_mcP:1.0' b'-211.0_Sg:1.0_mcP:1.0'
  b'0_evt436']
 [b'321.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0'
  b'0_evt542']
 [b'-321.0_Sg:1.0_mcP:1.0' b'211.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0'
  b'0_evt625']
 [b'321.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0'
  b'0_evt667']
 [b'nan_Sg:nan_mcP:nan' b'22.0_Sg:1.0_mcP:1.0' b'nan_Sg:nan_mcP:nan'
  b'0_evt707']
 [b'11.0_Sg:0.0_mcP:0.0' b'22.0_Sg:1.0_mcP:1.0' b'321.0_Sg:1.0_mcP:1.0'
  b'0_evt741']
 [b'211.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:0.0' b'-211.0_Sg:1.0_mcP:1.0'
  b'0_evt742']
 [b'22.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_

[[b'211.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0' b'321.0_Sg:1.0_mcP:1.0'
  b'0_evt103']
 [b'22.0_Sg:1.0_mcP:1.0' b'-211.0_Sg:1.0_mcP:1.0' b'211.0_Sg:1.0_mcP:1.0'
  b'0_evt195']
 [b'22.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:0.0'
  b'0_evt260']
 [b'211.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0' b'nan_Sg:nan_mcP:nan'
  b'0_evt265']
 [b'211.0_Sg:1.0_mcP:1.0' b'-211.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0'
  b'0_evt436']
 [b'22.0_Sg:1.0_mcP:1.0' b'321.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0'
  b'0_evt542']
 [b'211.0_Sg:1.0_mcP:1.0' b'-321.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0'
  b'0_evt625']
 [b'321.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0'
  b'0_evt667']
 [b'nan_Sg:nan_mcP:nan' b'nan_Sg:nan_mcP:nan' b'22.0_Sg:1.0_mcP:1.0'
  b'0_evt707']
 [b'22.0_Sg:1.0_mcP:1.0' b'11.0_Sg:0.0_mcP:0.0' b'321.0_Sg:1.0_mcP:1.0'
  b'0_evt741']
 [b'-211.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:0.0' b'211.0_Sg:1.0_mcP:1.0'
  b'0_evt742']
 [b'211.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0

[[b'-211.0_Sg:0.0_mcP:1.0' b'-211.0_Sg:1.0_mcP:1.0'
  b'321.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0' b'0_evt1']
 [b'211.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0' b'-211.0_Sg:1.0_mcP:1.0'
  b'321.0_Sg:1.0_mcP:1.0' b'0_evt42']
 [b'211.0_Sg:1.0_mcP:1.0' b'-211.0_Sg:1.0_mcP:1.0'
  b'211.0_Sg:1.0_mcP:1.0' b'nan_Sg:nan_mcP:nan' b'0_evt67']
 ...
 [b'211.0_Sg:1.0_mcP:1.0' b'211.0_Sg:1.0_mcP:1.0'
  b'-211.0_Sg:1.0_mcP:1.0' b'321.0_Sg:1.0_mcP:1.0' b'0_evt14317']
 [b'-211.0_Sg:1.0_mcP:1.0' b'-211.0_Sg:1.0_mcP:1.0'
  b'-211.0_Sg:1.0_mcP:1.0' b'321.0_Sg:1.0_mcP:1.0' b'0_evt14515']
 [b'211.0_Sg:1.0_mcP:1.0' b'-211.0_Sg:1.0_mcP:1.0'
  b'321.0_Sg:1.0_mcP:1.0' b'-321.0_Sg:1.0_mcP:1.0' b'0_evt14923']]
Data saved to: /nfs/dust/belle2/user/axelheim/MC_studies/my6modes/data/wSim_wReco_BsHc_15000_events['charge'] is False 


[[b'321.0_Sg:1.0_mcP:1.0' b'-211.0_Sg:0.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0'
  b'-211.0_Sg:1.0_mcP:1.0' b'0_evt1']
 [b'321.0_Sg:1.0_mcP:1.0' b'-211.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0'


[[b'-211.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:0.0' b'22.0_Sg:1.0_mcP:1.0'
  b'nan_Sg:nan_mcP:nan' b'22.0_Sg:1.0_mcP:1.0' b'0_evt7']
 [b'22.0_Sg:1.0_mcP:1.0' b'-321.0_Sg:0.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0'
  b'321.0_Sg:1.0_mcP:1.0' b'211.0_Sg:1.0_mcP:1.0' b'0_evt15']
 [b'nan_Sg:nan_mcP:nan' b'22.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0'
  b'211.0_Sg:1.0_mcP:1.0' b'-211.0_Sg:1.0_mcP:1.0' b'0_evt33']
 ...
 [b'-321.0_Sg:1.0_mcP:1.0' b'-211.0_Sg:1.0_mcP:1.0'
  b'321.0_Sg:1.0_mcP:1.0' b'-211.0_Sg:1.0_mcP:1.0'
  b'211.0_Sg:1.0_mcP:1.0' b'0_evt10554']
 [b'-211.0_Sg:1.0_mcP:1.0' b'-321.0_Sg:1.0_mcP:1.0'
  b'321.0_Sg:1.0_mcP:1.0' b'211.0_Sg:1.0_mcP:1.0'
  b'-211.0_Sg:0.0_mcP:1.0' b'0_evt12435']
 [b'-211.0_Sg:1.0_mcP:1.0' b'-321.0_Sg:1.0_mcP:1.0'
  b'321.0_Sg:1.0_mcP:1.0' b'-211.0_Sg:1.0_mcP:1.0'
  b'-211.0_Sg:1.0_mcP:1.0' b'0_evt14567']]
Data saved to: /nfs/dust/belle2/user/axelheim/MC_studies/my6modes/data/wSim_wReco_BsHc_15000_events['z0'] is False 


[[b'22.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:0

[[b'211.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0' b'-11.0_Sg:0.0_mcP:0.0'
  ... b'13.0_Sg:0.0_mcP:0.0' b'211.0_Sg:1.0_mcP:1.0' b'0_evt2']
 [b'22.0_Sg:1.0_mcP:1.0' b'-211.0_Sg:1.0_mcP:1.0'
  b'-321.0_Sg:1.0_mcP:1.0' ... b'321.0_Sg:1.0_mcP:1.0'
  b'22.0_Sg:1.0_mcP:1.0' b'0_evt27']
 [b'-211.0_Sg:1.0_mcP:1.0' b'211.0_Sg:1.0_mcP:1.0' b'22.0_Sg:1.0_mcP:1.0'
  ... b'211.0_Sg:1.0_mcP:1.0' b'nan_Sg:nan_mcP:nan' b'0_evt34']
 ...
 [b'321.0_Sg:1.0_mcP:1.0' b'-211.0_Sg:1.0_mcP:1.0'
  b'-211.0_Sg:1.0_mcP:1.0' ... b'211.0_Sg:1.0_mcP:1.0'
  b'-321.0_Sg:1.0_mcP:1.0' b'0_evt13130']
 [b'-211.0_Sg:1.0_mcP:1.0' b'-211.0_Sg:1.0_mcP:1.0'
  b'211.0_Sg:1.0_mcP:1.0' ... b'321.0_Sg:1.0_mcP:1.0'
  b'211.0_Sg:1.0_mcP:1.0' b'0_evt13168']
 [b'-321.0_Sg:1.0_mcP:1.0' b'-211.0_Sg:1.0_mcP:1.0'
  b'-211.0_Sg:1.0_mcP:1.0' ... b'211.0_Sg:1.0_mcP:1.0'
  b'-211.0_Sg:1.0_mcP:1.0' b'0_evt14257']]
Data saved to: /nfs/dust/belle2/user/axelheim/MC_studies/my6modes/data/wSim_wReco_BsHc_15000_events['nCDCHits'] is False 


[[b'22.0_

KeyboardInterrupt: 

In [None]:
Bs_pdg = 511
Hc_pdg = 411
nm = f"_nTuples_mode1" + events_num_identifier + ".root"
raw_nTuples = uproot.concatenate([(root_path + "gamma" + nm), (root_path + "pi" + nm), (root_path + "K" + nm)])

df = pd.DataFrame({"event" : np.array(raw_nTuples["__event__"]),
       "mcPDG" : np.array(raw_nTuples["mcPDG"]),
       "mcPrimary":np.array(raw_nTuples["mcPrimary"]),
       "isSignal":np.array(raw_nTuples["isSignal"]),
       #"mcInitial":np.array(raw_nTuples["mcInitial"]),
       "px" :  np.array(raw_nTuples["px"]),
       "py" : np.array(raw_nTuples["py"]),
       "pz" : np.array(raw_nTuples["pz"]),
       "E" : np.array(raw_nTuples["E"]),
                   
       "dx" : np.array(raw_nTuples["dx"]),
       "dy" : np.array(raw_nTuples["dy"]),
       "dz" : np.array(raw_nTuples["dz"]),
       "prodVertexX" : np.array(raw_nTuples["prodVertexX"]),
       "prodVertexY" : np.array(raw_nTuples["prodVertexY"]),
       "prodVertexZ" : np.array(raw_nTuples["prodVertexZ"]),
       "z0" : np.array(raw_nTuples["z0"]),
       "d0" : np.array(raw_nTuples["d0"]),
       "tanlambda" : np.array(raw_nTuples["tanlambda"]),
       "nCDCHits" : np.array(raw_nTuples["nCDCHits"]),
       "trackNECLClusters" : np.array(raw_nTuples["trackNECLClusters"]),

       "pionID":np.array(raw_nTuples["pionID"]),   
       "kaonID": np.array(raw_nTuples["kaonID"]),
       "M" : np.array(raw_nTuples["M"]),
       "charge" : np.array(raw_nTuples["charge"]),
       #"fromY4S" : hasAncestor_300553,
       "genPDG0" : np.array(raw_nTuples["genMotherPDG__bo0__bc"]),
       "genPDG1" : np.array(raw_nTuples["genMotherPDG__bo1__bc"]),
       "genPDG2" : np.array(raw_nTuples["genMotherPDG__bo2__bc"]),
       "genPDG3" :  np.array(raw_nTuples["genMotherPDG__bo3__bc"]) })
df['label'] = df.apply(conditions, axis=1)

## data preprocessing, dealing with NaN
imputelist = [["pionID",-1.],["kaonID",-1.],["z0",10.],["tanlambda",10.],["nCDCHits",-1.],
              ["trackNECLClusters",-1.]]
for impute in imputelist:
    column_name = impute[0]
    impute_val = impute[1]
    mask = df[column_name].isna() == 1
    df.loc[mask, column_name] = impute_val


In [None]:
df[df["tanlambda"].isna() == True].describe()

In [None]:
mask = df.label == 2
column_name = "name"
df.loc[mask, column_name] = label_cu

In [None]:
Bs_pdg_list = [-511, 511,-521,-521,521,521]
Hc_pdg_list = [-411, 411, -411, -421, 411, 423]
dfs =[]
for mode in range(6):
    print("mode:",mode,mode,mode,mode,mode,mode,mode,mode,mode,mode,mode,mode,mode,mode,mode,mode,mode,mode,mode,mode,mode)
    
    Bs_pdg = Bs_pdg_list[mode]
    Hc_pdg = Hc_pdg_list[mode]
    
    nm = f"_nTuples_mode{mode}" + events_num_identifier + ".root"
    raw_nTuples = uproot.concatenate([(root_path + "gamma" + nm), (root_path + "pi" + nm), (root_path + "K" + nm)])
    df = pd.DataFrame({"event" : np.array(raw_nTuples["__event__"]),

       "z0" : np.array(raw_nTuples["z0"]),
       "d0" : np.array(raw_nTuples["d0"]),
       "tanlambda" : np.array(raw_nTuples["tanlambda"])})
    print("z0:")
    print("min:",df["z0"].min())
    print("max:",df["z0"].max())
    print("d0:")
    print("min:",df["d0"].min())
    print("max:",df["d0"].max())
    print("tanlambda:")
    print("min:",df["tanlambda"].min())
    print("max:",df["tanlambda"].max())
    print("")

In [None]:
pd.set_option("display.max_columns", None)

In [None]:
df2 = df[["event","tanlambda","z0","pionID","kaonID","nCDCHits","trackNECLClusters"]]
#df2.describe()
df2[df2.event==1]

In [None]:
extraInput_list = [[],["charge"],["pionID","kaonID"],["dx","dy","dz"],["z0"],["tanlambda"],
                   ["nCDCHits"],["trackNECLClusters"]]
for extraInput in extraInput_list:
    print(extraInput, len(extraInput))
    for l in range(4, 4 + len(extraInput)):
        print(l,l-3)


In [None]:
(df[df["event"]==5])

In [None]:
dfs[3][dfs[3].event==1].sort_values("label")

In [None]:
bin_n = 20
bins = np.linspace(0, bin_n, bin_n)

new[(new.mcPDG==22.0) & (new.mcPrimary==1)]["count"].hist(bins=bins, label='gammas')
new[(new.mcPDG==211.0) & (new.mcPrimary==1)]["count"].hist(bins=bins, label='pions')
new[(new.mcPDG==321.0) & (new.mcPrimary==1)]["count"].hist(bins=bins, legend=True)


In [None]:
numFSPs_df = pd.DataFrame({'count' : df.groupby( [ "event"] ).size()}).reset_index()
minFSPs = numFSPs_df["count"].min()
maxFSPs = numFSPs_df["count"].max()


print("maxFSPs:",maxFSPs)
print("minFSPs:",minFSPs)

bins = np.linspace(0, maxFSPs, maxFSPs)
numFSPs_df["count"].hist(bins=bins, legend=True)


In [None]:
for i in range(3):
    evNum = i+1
    print("event",evNum)
    print("# FSPs:",len(df[df.event == evNum]))
    print("# pions:",len(df[(df.event == evNum) & (abs(df.mcPDG) == 211)]))
    print("# kaons:",len(df[(df.event == evNum) & (abs(df.mcPDG) == 321)]))
    print("# gammas:",len(df[(df.event == evNum) & (abs(df.mcPDG) == 22)]))
    print('\n')
    
    

In [None]:
print(len(df[df.event == 1]))
df[df.event == 1]