In [1]:
import matplotlib.pyplot as plt
import uproot
import numpy as np
import pandas as pd

from pathlib import Path
from sklearn.model_selection import train_test_split


data_subdir = "noSim_noReco_BsXHc_100events/"

path = "/nfs/dust/belle2/user/axelheim/MC_studies/my6modes/"
root_path = path + "rootfiles/noSim_noReco/"
#file_pi = uproot.open((root_path + "pi_nTuples_mode0.root"))

#file_raw = uproot.open(root_path + "mode0_17_events.root")

In [2]:
def conditions(s):
    label = -1
    if (int(s['genMotherPDG0']) == Bs_pdg) or (int(s['genMotherPDG1']) == Bs_pdg) or (int(s['genMotherPDG2']) == Bs_pdg) or (int(s['genMotherPDG3']) == Bs_pdg):
        label = 1
    elif (int(s['genMotherPDG0']) == Hc_pdg) or (int(s['genMotherPDG1']) == Hc_pdg) or (int(s['genMotherPDG2']) == Hc_pdg) or (int(s['genMotherPDG3']) == Hc_pdg):    
        label = 2
    else:
        label = 0
    
    return label

In [3]:
Bs_pdg_list = [-511, 511,-521,-521,521,521]
Hc_pdg_list = [-411, 411, -411, -421, 411, 423]
dfs =[]
for mode in range(6):
    print("mode:",mode,mode,mode,mode,mode,mode,mode,mode,mode,mode,mode,mode,mode,mode,mode,mode,mode,mode,mode,mode,mode)
    
    Bs_pdg = Bs_pdg_list[mode]
    Hc_pdg = Hc_pdg_list[mode]
    
    nm = "_nTuples_mode{}.root".format(mode)
    raw_nTuples = uproot.concatenate([(root_path + "gamma" + nm), (root_path + "pi" + nm), (root_path + "K" + nm)])
    event = np.array(raw_nTuples["__event__"])
    mcPDG = np.array(raw_nTuples["mcPDG"])
    px = np.array(raw_nTuples["px"])
    py = np.array(raw_nTuples["py"])
    pz = np.array(raw_nTuples["pz"])
    E = np.array(raw_nTuples["E"])
    M = np.array(raw_nTuples["M"])
    charge = np.array(raw_nTuples["charge"])
    #mcPDG = np.array(raw_nTuples["kaonID"])
    #mcPDG = np.array(raw_nTuples["pionID"])
    genMotherPDG0 = np.array(raw_nTuples["genMotherPDG__bo0__bc"])
    genMotherPDG1 = np.array(raw_nTuples["genMotherPDG__bo1__bc"])
    genMotherPDG2 = np.array(raw_nTuples["genMotherPDG__bo2__bc"])
    genMotherPDG3 = np.array(raw_nTuples["genMotherPDG__bo3__bc"])
    
    #u,c = np.unique(event,return_counts=True)
    #print('\n entries per event:',u,c)
    
    
    df = pd.DataFrame({"event" : event,
                   "mcPDG" : mcPDG,
                   "px" : px,
                   "py" : py,
                   "pz" : pz,
                   "E" : E,
                   #"M" : M,
                   "charge" : charge,
                   "genMotherPDG0" : genMotherPDG0,
                   "genMotherPDG1" : genMotherPDG1,
                   "genMotherPDG2" : genMotherPDG2,
                   "genMotherPDG3" : genMotherPDG3})
    df['label'] = df.apply(conditions, axis=1)
    
    #print(df)
    
    dfs.append(df)
    print('\n')
    
    
    # read out data to save it
    event_df = df[df.event == 1]

    num_FSPs_toData = len(event_df)
    numEvents = df.event.max()
    num_features = 4 


    leaves = np.zeros((numEvents, num_FSPs_toData,  num_features))  
    SA_target =  np.zeros((numEvents, num_FSPs_toData))
    global_tag = np.chararray((numEvents, num_FSPs_toData + 1), itemsize=30)
    global_tag[:,-1] = str(mode)

    for i in range(numEvents):
        event = i + 1

        event_df = df[df.event == event]

        for j in range(num_FSPs_toData):

            particle = event_df.iloc[j]

            #print(particle["mcPDG"],particle["px"],particle["py"],particle["pz"],particle["E"])
            leaves[i,j,0] = particle["px"]
            leaves[i,j,1] = particle["py"]
            leaves[i,j,2] = particle["pz"]
            leaves[i,j,3] = particle["E"]

            global_tag[i,j] = int(particle["mcPDG"])
            SA_target[i,j] = int(particle["label"])


    for idx in np.arange(leaves.shape[0]):   # arange is like range but gives ndarray instead of list
        perms = np.random.permutation(leaves.shape[1])

        leaves[idx,:] = leaves[idx,perms]
        SA_target[idx,:] = SA_target[idx,perms]
        global_tag[idx,0:-1] = global_tag[idx,perms]

    data_dir = Path("/nfs/dust/belle2/user/axelheim/MC_studies/my6modes/data/" + data_subdir)    
    data_dir.mkdir(parents=True, exist_ok=True)
    
    
    train_ratio = 0.75
    validation_ratio = 0.15
    test_ratio = 0.10
    
    x=leaves
    y=SA_target
    z=global_tag
    
    x_train, x_test, y_train, y_test, z_train, z_test = train_test_split(x, y, z, test_size=1 - train_ratio, shuffle=False)
    x_val, x_test, y_val, y_test, z_val, z_test = train_test_split(x_test, y_test, z_test, test_size=test_ratio/(test_ratio + validation_ratio), shuffle=False) 

    
    np.save(data_dir / "leaves_train_{}.npy".format(mode), x_train)
    np.save(data_dir / "is_left_arr_train_{}.npy".format(mode), y_train)
    np.save(data_dir / "global_tag_train_{}.npy".format(mode), z_train)
    
    np.save(data_dir / "leaves_val_{}.npy".format(mode), x_val)
    np.save(data_dir / "is_left_arr_val_{}.npy".format(mode), y_val)
    np.save(data_dir / "global_tag_val_{}.npy".format(mode), z_val)
    
    np.save(data_dir / "leaves_test_{}.npy".format(mode), x_test)
    np.save(data_dir / "is_left_arr_test_{}.npy".format(mode), y_test)
    np.save(data_dir / "global_tag_test_{}.npy".format(mode), z_test)


    print("Data saved to:", data_dir,'\n')

mode: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


Data saved to: /nfs/dust/belle2/user/axelheim/MC_studies/my6modes/data/noSim_noReco_BsXHc_100events 

mode: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1


Data saved to: /nfs/dust/belle2/user/axelheim/MC_studies/my6modes/data/noSim_noReco_BsXHc_100events 

mode: 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2


Data saved to: /nfs/dust/belle2/user/axelheim/MC_studies/my6modes/data/noSim_noReco_BsXHc_100events 

mode: 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3


Data saved to: /nfs/dust/belle2/user/axelheim/MC_studies/my6modes/data/noSim_noReco_BsXHc_100events 

mode: 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4


Data saved to: /nfs/dust/belle2/user/axelheim/MC_studies/my6modes/data/noSim_noReco_BsXHc_100events 

mode: 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5


Data saved to: /nfs/dust/belle2/user/axelheim/MC_studies/my6modes/data/noSim_noReco_BsXHc_100events 



In [4]:
l = 14
print(leaves[l])
print(SA_target[l])
print(global_tag[l])

[[ 7.09068254e-02  2.77826432e-02 -5.46788871e-02  9.37519805e-02]
 [ 2.46605501e-01  1.43007100e-01  1.18047333e+00  1.22240029e+00]
 [ 2.35207915e+00 -1.43704101e-01  1.59298703e-02  2.36064840e+00]
 [-5.70501924e-01 -1.02622175e+00  5.11988580e-01  1.28849359e+00]
 [ 5.78984439e-01 -1.13873042e-01 -9.75721061e-01  1.24255326e+00]
 [-2.72337377e-01  8.80848467e-01  1.74637452e-01  9.48704275e-01]
 [-2.89176702e-02  2.04854548e-01  5.14837086e-01  5.72135242e-01]
 [-5.18861890e-01  6.56866312e-01  1.67010248e-01  9.86052943e-01]
 [-1.35401559e+00 -6.61191463e-01  1.54681778e+00  2.16394489e+00]
 [ 1.49218952e-02  2.96231471e-02 -4.34967764e-02  5.47006705e-02]
 [-6.22111708e-02  2.00817268e-03 -3.89740281e-02  7.34386642e-02]]
[1. 1. 0. 1. 1. 1. 1. 2. 2. 2. 1.]
[b'22' b'211' b'-211' b'211' b'321' b'-211' b'-211' b'-321' b'211' b'22'
 b'22' b'5']


In [5]:
event_df = df[df.event == 15]
event_df.iloc[3]["mcPDG"]

211.0

In [6]:
event_df

Unnamed: 0,event,mcPDG,px,py,pz,E,charge,genMotherPDG0,genMotherPDG1,genMotherPDG2,genMotherPDG3,label
42,15,22.0,0.014922,0.029623,-0.043497,0.054701,0.0,423.0,-521.0,300553.0,0.0,2
43,15,22.0,0.070907,0.027783,-0.054679,0.093752,0.0,111.0,-423.0,521.0,300553.0,1
44,15,22.0,-0.062211,0.002008,-0.038974,0.073439,0.0,111.0,-423.0,521.0,300553.0,1
384,15,211.0,-0.570502,-1.026222,0.511989,1.288494,1.0,521.0,300553.0,0.0,0.0,1
385,15,211.0,0.246606,0.143007,1.180473,1.2224,1.0,521.0,300553.0,0.0,0.0,1
386,15,211.0,-1.354016,-0.661191,1.546818,2.163945,1.0,421.0,423.0,-521.0,300553.0,2
387,15,-211.0,2.352079,-0.143704,0.01593,2.360648,-1.0,-521.0,300553.0,0.0,0.0,0
388,15,-211.0,-0.028918,0.204855,0.514837,0.572135,-1.0,521.0,300553.0,0.0,0.0,1
389,15,-211.0,-0.272337,0.880848,0.174637,0.948704,-1.0,-421.0,-423.0,521.0,300553.0,1
928,15,321.0,0.578984,-0.113873,-0.975721,1.242553,1.0,-421.0,-423.0,521.0,300553.0,1


In [7]:
df = dfs[5]
event_df = df[df.event == 1]
event_df

Unnamed: 0,event,mcPDG,px,py,pz,E,charge,genMotherPDG0,genMotherPDG1,genMotherPDG2,genMotherPDG3,label
0,1,22.0,-0.034705,0.090788,-0.09086,0.13305,0.0,423.0,-521.0,300553.0,0.0,2
1,1,22.0,-0.040399,-0.06301,0.013847,0.076119,0.0,111.0,-423.0,521.0,300553.0,1
2,1,22.0,0.038123,0.014828,0.096376,0.104697,0.0,111.0,-423.0,521.0,300553.0,1
300,1,211.0,0.297317,0.165004,0.553185,0.664167,1.0,521.0,300553.0,0.0,0.0,1
301,1,211.0,0.621048,-0.58248,-0.665926,1.089918,1.0,521.0,300553.0,0.0,0.0,1
302,1,211.0,0.252327,-0.421614,-0.249754,0.568581,1.0,421.0,423.0,-521.0,300553.0,2
303,1,-211.0,1.66564,0.415261,2.049764,2.677277,-1.0,-521.0,300553.0,0.0,0.0,0
304,1,-211.0,0.30535,0.992387,0.126597,1.055261,-1.0,521.0,300553.0,0.0,0.0,1
305,1,-211.0,-0.784782,-0.461327,1.748364,1.976097,-1.0,-421.0,-423.0,521.0,300553.0,1
900,1,321.0,0.126552,-0.169258,-0.24157,0.588843,1.0,-421.0,-423.0,521.0,300553.0,1


In [8]:
u,c = np.unique(event,return_counts=True)
print('\n entries per event:',u,c)

print(mcPDG)

print(event)


 entries per event: [100] [1]
[  22.   22.   22. ... -321.  321. -321.]
100
