In [2]:
import matplotlib.pyplot as plt
import uproot
import numpy as np
import pandas as pd

from pathlib import Path
from sklearn.model_selection import train_test_split

from datetime import datetime
import sys
sys.path.insert(1, '/afs/desy.de/user/a/axelheim/private/MC_studies/Dstlnu_Bt_generic/util_funcs/')
from pandas_colFuncs import isBtoDstlnu, whichBisSig, customMCmatching, B_ID

In [3]:
now = datetime.now()
print("time at start =", now)

time at start = 2021-10-05 19:06:33.286270


In [4]:
HTCondorRun = str(sys.argv[1])
print("HTCondorRun:",HTCondorRun)

HTCondorRun: -f


In [5]:
sub = str(sys.argv[2])
if sub.find("sub") == -1:
    sub = "sub02"
print("used sub:", sub)

used sub: sub02


In [6]:
save_data = True
save_preprocessedDataframe = True
tmp_data = True
take_subset = False
subset_size = 100000

In [7]:
nfs_path = "/nfs/dust/belle2/user/axelheim/MC_studies/Dstlnu_Bt_generic/"

data_subdir = "Dstlnu_SHR_BsX_p100cut/"
root_subdir = "SHR_dataSteering_run1/"   

root_path = nfs_path + "SHR_Hc_correctReco_BsX/" + root_subdir + sub + "/"

In [8]:
merged = "merged_"
if tmp_data:
    merged += "tmp_"

In [8]:
fileY4S = uproot.open(root_path + merged + "DXtagDstl.root:variables")
#afsPath = "/afs/desy.de/user/a/axelheim/private/MC_studies/Dstlnu_Bt_generic/SHR_Hc_correctReco_BsX/SHR_dataSteering_run1/"
#fileY4S = uproot.open(afsPath + "DXtagDstl.root:variables")

In [9]:
names = ["gammas","electrons","pions","kaons","muons"]
dfs = []
for name in names:
    filename = root_path + merged + "{}.root:variables".format(name)
    #filename = afsPath + "{}.root:variables".format(name)
    print(filename)
    tmpFileFSPs = uproot.open(filename)
    df_tmp = tmpFileFSPs.arrays(library="pd")
    dfs.append(df_tmp)

/nfs/dust/belle2/user/axelheim/MC_studies/Dstlnu_Bt_generic/SHR_Hc_correctReco_BsX/SHR_dataSteering_run1/sub02/merged_tmp_gammas.root:variables
/nfs/dust/belle2/user/axelheim/MC_studies/Dstlnu_Bt_generic/SHR_Hc_correctReco_BsX/SHR_dataSteering_run1/sub02/merged_tmp_electrons.root:variables
/nfs/dust/belle2/user/axelheim/MC_studies/Dstlnu_Bt_generic/SHR_Hc_correctReco_BsX/SHR_dataSteering_run1/sub02/merged_tmp_pions.root:variables
/nfs/dust/belle2/user/axelheim/MC_studies/Dstlnu_Bt_generic/SHR_Hc_correctReco_BsX/SHR_dataSteering_run1/sub02/merged_tmp_kaons.root:variables
/nfs/dust/belle2/user/axelheim/MC_studies/Dstlnu_Bt_generic/SHR_Hc_correctReco_BsX/SHR_dataSteering_run1/sub02/merged_tmp_muons.root:variables


In [10]:
df_FSPs = pd.concat(dfs)

In [11]:
df_Y4S = fileY4S.arrays(library="pd")

In [12]:
print(df_FSPs.shape[0])
print(df_Y4S.shape[0])

46791043
1011345


In [13]:
# delete FSPs for which no Y4S file entry was found
df_FSPs = df_FSPs[df_FSPs['__event__'].isin(df_Y4S["__event__"])]

In [14]:
df_Y4S = df_Y4S[(df_Y4S['Hc_isSignalAcceptMissingGamma'] == 1.0)]

df_FSPs = df_FSPs[df_FSPs['__event__'].isin(df_Y4S["__event__"])]

In [15]:
print(df_FSPs.shape[0])

2669913


### delete particles which occur more than ones based on uniqueParticleIdentifier

In [46]:
groupsFSPs_uniqParID = pd.DataFrame({'count' : df_FSPs.groupby( ["__event__","uniqueParticleIdentifier"] ).size()}).reset_index()
#with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#    print(groupsFSPs_uniqParID.sort_values("count"))

In [None]:
print("df_FSPs.shape[0]:",df_FSPs.shape[0])
print("groupsFSPs_uniqParID.shape[0]:",groupsFSPs_uniqParID.shape[0])
print("df_Y4S.shape[0]:",df_Y4S.shape[0])

In [16]:
# delete particles which occur more than ones (keep first) and if possible keep the one with basf2_used==1
print("df_FSPs[basf2_used].value_counts():",df_FSPs["basf2_used"].value_counts())
df_FSPs = df_FSPs.sort_values("basf2_used",ascending=False).drop_duplicates(subset=("__event__","uniqueParticleIdentifier"), keep='first')
print("df_FSPs[basf2_used].value_counts():",df_FSPs["basf2_used"].value_counts())

df_FSPs[basf2_used].value_counts(): 0.0    1453461
1.0    1216452
Name: basf2_used, dtype: int64
df_FSPs[basf2_used].value_counts(): 1.0    1209487
0.0     727706
Name: basf2_used, dtype: int64


In [17]:
print("df_FSPs.shape[0]:",df_FSPs.shape[0])

df_FSPs.shape[0]: 1937193


## take a sample if used in notebook for faster processing

In [14]:
all_evt_nums = df_FSPs['__event__'].unique()
all_evt_nums.shape[0]

90

In [15]:
sample_evt_nums = np.random.choice(all_evt_nums, size=subset_size)
sample_evt_nums.shape[0]

100000

In [16]:
if take_subset:
    df_FSPssample = df_FSPs[df_FSPs['__event__'].isin(sample_evt_nums)]
    #df_Y4S=df_Y4Ssample
    df_FSPs=df_FSPssample

In [17]:
print("df_FSPs.shape[0]:",df_FSPs.shape[0])
print("numEvents:",df_FSPs['__event__'].unique().shape[0])

df_FSPs.shape[0]: 4215
numEvents: 90


## save sample 

In [22]:
if take_subset:
    df_FSPs.to_csv(root_path + "df_FSPs_sample_{}_evts.csv".format(subset_size))
    #df_Y4S.to_csv(root_path + "df_Y4S_sample__evts.csv")

In [23]:
load = False
if load:
    df_FSPs = pd.read_csv(root_path + "df_FSPs_sample_{}_evts.csv".format(subset_size))
    #df_Y4S = pd.read_csv(root_path + "df_Y4S_sample10evts.csv")

## filter the wanted D*lnu and Hc isSignal==1 events

In [18]:
df_Y4S['isBtoDstlnu'] = df_Y4S.apply(isBtoDstlnu, axis=1)

In [19]:
df_Y4S[(df_Y4S['isBtoDstlnu'] == 1) & (df_Y4S['Hc_isSignalAcceptMissingGamma'] == 1.0)].shape[0]

37354

In [20]:
df_Y4S = df_Y4S[(df_Y4S['isBtoDstlnu'] == 1) & (df_Y4S['Hc_isSignalAcceptMissingGamma'] == 1.0)]

In [21]:
# delete FSPs for which no Y4S file entry is left after filer
df_FSPs = df_FSPs[df_FSPs['__event__'].isin(df_Y4S["__event__"])]
df_FSPs.shape[0]

1131187

### add cols with extra info for data prod

In [22]:
df_Y4S['Bsig_uniqParID'] = df_Y4S.apply(whichBisSig, axis=1)

In [23]:
# function to create col with the particles mother B's uniqueParticleIdentifier
df_FSPs['B_ID'] = df_FSPs.apply(B_ID, axis=1)

### print one event

In [47]:
df_Y4S[(df_Y4S['isBtoDstlnu'] == 1) & (df_Y4S['Hc_isSignalAcceptMissingGamma'] == 1.0)][["__event__",'Bsig_uniqParID','isBtoDstlnu','Hc_isSignalAcceptMissingGamma','Hc_mcPDG','Hc_genMotherPDG','Hc_uniqParID','Hc_genMotherID']][:10]

Unnamed: 0,__event__,Bsig_uniqParID,isBtoDstlnu,Hc_isSignalAcceptMissingGamma,Hc_mcPDG,Hc_genMotherPDG,Hc_uniqParID,Hc_genMotherID
0,22239789,83886081.0,1,1.0,421.0,413.0,100663296.0,3.0
1,22242808,83886082.0,1,1.0,421.0,413.0,100663296.0,3.0
2,22243407,83886081.0,1,1.0,-411.0,-413.0,100663296.0,6.0
3,22246415,83886081.0,1,1.0,-421.0,-423.0,100663296.0,13.0
4,14790197,83886082.0,1,1.0,-411.0,511.0,100663296.0,1.0
5,14794866,83886082.0,1,1.0,-421.0,-413.0,100663296.0,3.0
6,26364865,83886082.0,1,1.0,-431.0,-511.0,100663296.0,1.0
7,26367433,83886082.0,1,1.0,-411.0,-413.0,100663296.0,3.0
8,26369939,83886081.0,1,1.0,411.0,-511.0,100663296.0,2.0
9,19881295,83886082.0,1,1.0,-4122.0,-4222.0,100663296.0,4.0


In [26]:
df_Y4S[(df_Y4S['isBtoDstlnu'] == 1) & (df_Y4S['Hc_isSignalAcceptMissingGamma'] == 1.0)]['Hc_genMotherPDG'].value_counts()

-413.0       11839
 413.0       11706
-511.0       10713
 511.0       10670
-423.0        1813
 423.0        1794
 433.0         859
-433.0         843
 10431.0       338
-10431.0       330
-415.0         210
 415.0         196
-10413.0       107
 10413.0        84
 30443.0        65
 10411.0        61
-4212.0         61
 4222.0         60
 4212.0         59
 100443.0       56
-10411.0        56
 4112.0         49
-4114.0         48
-4222.0         48
 20433.0        42
 4114.0         42
-20433.0        41
-4112.0         37
 4214.0         32
-4214.0         30
 20443.0        28
 4224.0         25
-4224.0         24
 20413.0        14
-20413.0        13
 445.0           5
Name: Hc_genMotherPDG, dtype: int64

In [50]:
df_FSPs[df_FSPs["__event__"] == 22246415].sort_values('B_ID',ascending=False)[['mcPDG',
'B_ID',"basf2_used","basf2_Bsig","basf2_X",'genMothPDG_0', 'mcMother0_uniqParID', 'genMotherID_0',
'genMothPDG_1', 'mcMother1_uniqParID', 'genMotherID_1',
'genMothPDG_2', 'mcMother2_uniqParID', 'genMotherID_2',
'genMothPDG_3', 'mcMother3_uniqParID', 'genMotherID_3',
'genMothPDG_4', 'mcMother4_uniqParID', 'genMotherID_4',
'genMothPDG_5', 'mcMother5_uniqParID', 'genMotherID_5',
'genMothPDG_6', 'mcMother6_uniqParID', 'genMotherID_6',
'genMothPDG_7', 'mcMother7_uniqParID', 'genMotherID_7',
'genMothPDG_8', 'mcMother8_uniqParID', 'genMotherID_8',
'genMothPDG_9', 'mcMother9_uniqParID', 'genMotherID_9']]

Unnamed: 0,mcPDG,B_ID,basf2_used,basf2_Bsig,basf2_X,genMothPDG_0,mcMother0_uniqParID,genMotherID_0,genMothPDG_1,mcMother1_uniqParID,...,genMotherID_6,genMothPDG_7,mcMother7_uniqParID,genMotherID_7,genMothPDG_8,mcMother8_uniqParID,genMotherID_8,genMothPDG_9,mcMother9_uniqParID,genMotherID_9
245465,22.0,83886082,1.0,0.0,1.0,111.0,83886096.0,16.0,213.0,83886087.0,...,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0
90510,321.0,83886082,1.0,0.0,0.0,-421.0,83886102.0,22.0,-423.0,83886093.0,...,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0
286686,22.0,83886082,0.0,0.0,0.0,211.0,83886095.0,15.0,213.0,83886087.0,...,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0
245464,22.0,83886082,1.0,0.0,1.0,111.0,83886096.0,16.0,213.0,83886087.0,...,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0
286687,22.0,83886082,0.0,0.0,0.0,111.0,83886103.0,23.0,-423.0,83886093.0,...,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0
90509,211.0,83886082,1.0,0.0,1.0,213.0,83886087.0,7.0,511.0,83886082.0,...,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0
1140,-211.0,83886082,1.0,0.0,0.0,-421.0,83886102.0,22.0,-423.0,83886093.0,...,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0
1139,-211.0,83886082,1.0,0.0,1.0,-415.0,83886086.0,6.0,511.0,83886082.0,...,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0
90511,-321.0,83886081,1.0,1.0,0.0,421.0,83886088.0,8.0,413.0,83886083.0,...,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0
112314,13.0,83886081,1.0,1.0,0.0,-511.0,83886081.0,1.0,300553.0,83886080.0,...,0.0,0.0,,0.0,0.0,,0.0,0.0,,0.0


## check if category combinations make sense

In [42]:
groupsAllFSPs = pd.DataFrame({'count' : df_FSPs.groupby( ["basf2_used","basf2_Bsig","basf2_X"] ).size()}).reset_index()
groupsAllFSPs

Unnamed: 0,basf2_used,basf2_Bsig,basf2_X,count
0,0.0,0.0,0.0,154089
1,0.0,1.0,0.0,54
2,1.0,0.0,0.0,48518
3,1.0,0.0,1.0,123958
4,1.0,1.0,0.0,90302


## delete the H_c FSPs (basf2_used==1 & basf2_Bsig==0 & basf2_X==0)

In [24]:
df_FSPs = df_FSPs[~((df_FSPs["basf2_used"]== 1) & (df_FSPs["basf2_Bsig"]== 0) & (df_FSPs["basf2_X"]== 0))]

In [None]:
groupsAllFSPs = pd.DataFrame({'count' : df_FSPs.groupby( ["basf2_used","basf2_Bsig","basf2_X"] ).size()}).reset_index()
groupsAllFSPs

## data Saving

### save dataframes on NFS

In [25]:
if HTCondorRun == "isHTCondorRun" or save_preprocessedDataframe == True:
    df_FSPs.to_csv(root_path + "df_FSPs_preProcessed_SHR.csv")
    df_Y4S.to_csv(root_path + "df_Y4S_preProcessed_SHR.csv")

### load dataframes from different subs and concat them

In [54]:
df_FSPs_final = df_FSPs
df_Y4S_final = df_Y4S

In [55]:
print(df_FSPs_final.shape[0])
print(df_Y4S_final.shape[0])

368403
14945


In [9]:
df_FSPs_list=[]
df_Y4S_list=[]
for sub in ["sub00","sub01","sub02"]:
    root_path = nfs_path + "SHR_Hc_correctReco_BsX/" + root_subdir + sub + "/"    
    df_FSPs = pd.read_csv(root_path + "df_FSPs_preProcessed_SHR.csv")
    df_FSPs_list.append(df_FSPs)
    df_Y4S = pd.read_csv(root_path + "df_Y4S_preProcessed_SHR.csv")
    df_Y4S_list.append(df_Y4S)
    
df_FSPs_final = pd.concat(df_FSPs_list)
df_Y4S_final = pd.concat(df_Y4S_list)

In [10]:
df_Y4S_final.shape[0]

132200

## keep only the ones where Hc genMotherPDG is 511

In [11]:
df_Y4S_final[(df_Y4S_final['isBtoDstlnu'] == 1) & (df_Y4S_final['Hc_isSignalAcceptMissingGamma'] == 1.0)]['Hc_genMotherPDG'].value_counts()

-413.0       29874
 413.0       29659
-511.0       26839
 511.0       26655
-423.0        4565
 423.0        4563
-433.0        2197
 433.0        2168
-10431.0       894
 10431.0       859
-415.0         562
 415.0         538
-10413.0       275
 10413.0       242
 30443.0       172
 100443.0      166
-10411.0       159
 4212.0        157
 10411.0       149
 4222.0        141
-4212.0        139
 4112.0        125
-4114.0        120
-4222.0        119
 4114.0        114
-4112.0        108
 20433.0       100
-20433.0        98
 4214.0         91
-4214.0         81
 20443.0        78
-4224.0         57
 4224.0         55
-20413.0        37
 20413.0        33
 445.0           8
-10433.0         2
 10441.0         1
Name: Hc_genMotherPDG, dtype: int64

In [12]:
df_Y4S_final = df_Y4S_final[(df_Y4S_final['Hc_genMotherPDG'] == 511.0) | (df_Y4S_final['Hc_genMotherPDG'] == -511.0)]

df_FSPs_final = df_FSPs_final[df_FSPs_final['__event__'].isin(df_Y4S_final["__event__"])]

In [13]:
print(df_FSPs_final.shape[0])
print(df_Y4S_final.shape[0])

1419934
53494


## cut on momenta for neutrals, p>100MeV

In [26]:
df_FSPs_final = df_FSPs_final[~((df_FSPs_final["p"]<0.100) & (df_FSPs_final["PDG"]==22.))]

In [27]:
print(df_FSPs_final.shape[0])
print(df_Y4S_final.shape[0])

812693
53494


## save the concated df as it will be used for data prod

In [None]:
root_path_finalDF = "/nfs/dust/belle2/user/axelheim/MC_studies/Dstlnu_Bt_generic/SHR_Hc_correctReco_BsX/SHR_dataSteering_run1/"

df_FSPs_final.to_csv(root_path_finalDF + "final_df_FSPs_preProcessed_SHR_gamma100cut.csv")
df_Y4S_final.to_csv(root_path_finalDF + "final_df_Y4S_preProcessed_SHR.csv")

# start of NN data creation

In [23]:
numFSPs = pd.DataFrame({'count' : df_FSPs_final.groupby( ["__event__"] ).size()}).reset_index()

minFSPs = numFSPs["count"].min()
maxFSPs = numFSPs["count"].max()
print("minFSPs:",minFSPs)
print("maxFSPs:",maxFSPs,'\n')
print("num Events:",numFSPs.shape[0],'\n')

df_FSPs_final['numFSPs'] = df_FSPs_final.groupby('__event__')['__event__'].transform('count')

minFSPs: 7
maxFSPs: 83 

num Events: 53473 



In [24]:
data_dir = Path(nfs_path + "data/" + data_subdir + root_subdir)    
if save_data:
    data_dir.mkdir(parents=True, exist_ok=True)
print("Will save data to:", data_dir,'is', save_data ,'\n')

Will save data to: /nfs/dust/belle2/user/axelheim/MC_studies/Dstlnu_Bt_generic/data/Dstlnu_SHR_BsX_p100cut/SHR_dataSteering_run1 is True 



In [None]:
#minFSPs = 5
for num_FSPs_toData in range(minFSPs, maxFSPs+1):
    df_num_subset = df_FSPs_final.copy()
    df_num_subset = df_num_subset[df_num_subset['numFSPs'] == num_FSPs_toData]
    
        
    numEvents = df_num_subset.__event__.nunique()
    print("numEvents:",numEvents)
    print("num_FSPs_toData:",num_FSPs_toData)  
    if numEvents == 0:
        print("skipped because empty \n")
        continue

    if numEvents < 10:
        print("skipped because <10 events \n")
        continue
    
    num_features = 4
    leaves = np.zeros((numEvents, num_FSPs_toData,  num_features))  
    SA_target =  np.zeros((numEvents, num_FSPs_toData))
    global_tag = np.chararray((numEvents, num_FSPs_toData + 1), itemsize=30)
    
    event_list = df_num_subset[df_num_subset["numFSPs"] == num_FSPs_toData]["__event__"].unique()
    #print("len(event_list):",len(event_list))
    for i in range(numEvents):

        event_iter = event_list[i]

        global_tag_masterInfo = "evt" + str(event_iter)
        global_tag[i,-1] = global_tag_masterInfo
        #print("global_tag[i,-1]:",global_tag[i,-1])
        #print("i:",i,"event_iter:",event_iter)

        event_df = df_num_subset[df_num_subset.__event__ == event_iter]

        for j in range(num_FSPs_toData):
            #print("numParticle:",j)
            particle = event_df.iloc[j]

            #print(particle["mcPDG"],particle["px"],particle["py"],particle["pz"],particle["E"])
            leaves[i,j,0] = particle["px"]
            leaves[i,j,1] = particle["py"]
            leaves[i,j,2] = particle["pz"]
            leaves[i,j,3] = particle["E"]
            
            basf2_usage = "basf2_NONE"
            if particle["basf2_Bsig"] == 1.0:
                basf2_usage = "basf2_Bsig"
            elif particle["basf2_X"] == 1.0:
                basf2_usage = "basf2_X"
            elif particle["basf2_used"] == 0:
                basf2_usage = "basf2_bg"

            global_tag_Info = str((particle["mcPDG"])) 
            global_tag_Info += "_" + basf2_usage
            global_tag[i,j] = global_tag_Info

            label = -10 # error code if assignment fails
            B_tag_uniqID = -10 # error code if assignment fails     
            
            B_sig_uniqID = df_Y4S_final[df_Y4S_final["__event__"] == event_iter].iloc[0]['Bsig_uniqParID']
            if B_sig_uniqID == 83886082.0:
                B_tag_uniqID = 83886081.0
            elif B_sig_uniqID == 83886081.0:
                B_tag_uniqID = 83886082.0
            
            if particle["B_ID"] == B_tag_uniqID:
                label = 1 # particle belongs to X (MC truth)
            elif particle["B_ID"] == B_sig_uniqID:
                label = 2 # particle belongs to Bsig (MC truth)
            elif particle["B_ID"] == 0:
                label = 0 # background (MC truth)
            
            
            SA_target[i,j] = label
            
        del event_df
        
        
    # shuffle the data    
    for idx in np.arange(leaves.shape[0]):   # arange is like range but gives ndarray instead of list
        perms = np.random.permutation(leaves.shape[1])

        leaves[idx,:] = leaves[idx,perms]
        SA_target[idx,:] = SA_target[idx,perms]
        global_tag[idx,0:-1] = global_tag[idx,perms]
        
        
         


    #print(global_tag)
    train_ratio = 0.82
    validation_ratio = 0.12
    test_ratio = 0.06

    print("leaves.shape:",leaves.shape)
    print("SA_target.shape:",SA_target.shape)
    print("global_tag.shape:",global_tag.shape)


    print("leaves[0]:",leaves[0])
    print("SA_target[0]:",SA_target[0])
    print("global_tag[0]:",global_tag[0])

    x=leaves
    y=SA_target
    z=global_tag

    x_train, x_test, y_train, y_test, z_train, z_test = train_test_split(x, y, z, test_size=1 - train_ratio, shuffle=False)
    x_val, x_test, y_val, y_test, z_val, z_test = train_test_split(x_test, y_test, z_test, test_size=test_ratio/(test_ratio + validation_ratio), shuffle=False) 

    if save_data==True:
        np.save(data_dir / "leaves_train_FSP{}.npy".format(num_FSPs_toData), x_train)
        np.save(data_dir / "is_left_arr_train_FSP{}.npy".format(num_FSPs_toData), y_train)
        np.save(data_dir / "global_tag_train_FSP{}.npy".format(num_FSPs_toData), z_train)

        np.save(data_dir / "leaves_val_FSP{}.npy".format(num_FSPs_toData), x_val)
        np.save(data_dir / "is_left_arr_val_FSP{}.npy".format(num_FSPs_toData), y_val)
        np.save(data_dir / "global_tag_val_FSP{}.npy".format(num_FSPs_toData), z_val)

        np.save(data_dir / "leaves_test_FSP{}.npy".format(num_FSPs_toData), x_test)
        np.save(data_dir / "is_left_arr_test_FSP{}.npy".format(num_FSPs_toData), y_test)
        np.save(data_dir / "global_tag_test_FSP{}.npy".format(num_FSPs_toData), z_test)

    
    print("")
    #del df_num_subset


    del df_num_subset
                                          

In [None]:
print("saving is done")
now = datetime.now()
print("time at end =", now)

In [None]:
#why only one FSP in so many events?

In [59]:
df_num_subset = df_FSPs_final.copy()
df_num_subset = df_num_subset[df_num_subset['numFSPs'] == 1]

In [51]:
df_FSPs_final.keys()

Index(['Unnamed: 0', '__experiment__', '__run__', '__event__', '__candidate__',
       '__ncandidates__', '__weight__', 'basf2_X', 'basf2_used', 'basf2_Bsig',
       'isSignal', 'uniqueParticleIdentifier', 'mcErrors', 'mcPDG',
       'genMotherID', 'genMotherP', 'genMotherPDG', 'px', 'py', 'pz', 'pt',
       'p', 'E', 'kaonID', 'pionID', 'genMothPDG_0', 'genMothPDG_1',
       'genMothPDG_2', 'genMothPDG_3', 'genMothPDG_4', 'genMothPDG_5',
       'genMothPDG_6', 'genMothPDG_7', 'genMothPDG_8', 'genMothPDG_9',
       'genMotherID_0', 'genMotherID_1', 'genMotherID_2', 'genMotherID_3',
       'genMotherID_4', 'genMotherID_5', 'genMotherID_6', 'genMotherID_7',
       'genMotherID_8', 'genMotherID_9', 'mcMother0_uniqParID',
       'mcMother1_uniqParID', 'mcMother2_uniqParID', 'mcMother3_uniqParID',
       'mcMother4_uniqParID', 'mcMother5_uniqParID', 'mcMother6_uniqParID',
       'mcMother7_uniqParID', 'mcMother8_uniqParID', 'mcMother9_uniqParID',
       'PDG', 'B_ID', 'numFSPs'],
      dtyp

In [52]:
groupsAllFSPs = pd.DataFrame({'count' : df_FSPs_final.groupby( ["basf2_used","basf2_Bsig","basf2_X"] ).size()}).reset_index()
groupsAllFSPs

Unnamed: 0,basf2_used,basf2_Bsig,basf2_X,count
0,0.0,0.0,0.0,591463
1,0.0,1.0,0.0,1
2,1.0,0.0,1.0,479138
3,1.0,1.0,0.0,349332


In [3]:
fileY4S = uproot.open("/afs/desy.de/user/a/axelheim/private/MC_studies/Dstlnu_Bt_generic/SHR_Hc_correctReco_BsX/SHR_wChargeLessFiles_run1/DXtagDstl.root:variables")
df_Y4S = fileY4S.arrays(library="pd")

In [4]:
pions = uproot.open("/afs/desy.de/user/a/axelheim/private/MC_studies/Dstlnu_Bt_generic/SHR_Hc_correctReco_BsX/SHR_wChargeLessFiles_run1/pions.root:variables")
df_pions = pions.arrays(library="pd")

In [5]:
df_Y4S.shape[0]

11

In [6]:
df_Y4S[(df_Y4S['Hc_isSignalAcceptMissingGamma'] == 1.0)].shape[0]

11