In [1]:
import matplotlib.pyplot as plt
import uproot
import numpy as np
import pandas as pd

from pathlib import Path
from sklearn.model_selection import train_test_split

from datetime import datetime
import sys

In [2]:
now = datetime.now()
print("time at start =", now)

time at start = 2021-09-20 22:15:04.273306


In [3]:
HTCondorRun = str(sys.argv[1])
print("HTCondorRun:",HTCondorRun)

HTCondorRun: -f


In [4]:
save_data = True
tmp_data = False

In [5]:
nfs_path = "/nfs/dust/belle2/user/axelheim/MC_studies/Dstlnu_Bt_generic/"

data_subdir = "Dstlnu_Hc_corr_BsigX_separation_dataRun1/"   
root_subdir = "axheim_data2_MC14_100kEvts/"   

root_path = nfs_path + "createBranchSeparatorData/" + root_subdir

In [6]:
merged = "merged_"
if tmp_data:
    merged += "tmp_"

In [7]:
fileY4S = uproot.open(root_path + merged + "DXtagDstl.root:variables")

In [8]:
names = ["gammas","electrons","pions","kaons","muons"]
dfs = []
for name in names:
    filename = root_path + merged + "{}.root:variables".format(name)
    print(filename)
    tmpFileFSPs = uproot.open(filename)
    df_tmp = tmpFileFSPs.arrays(library="pd")
    dfs.append(df_tmp)

/nfs/dust/belle2/user/axelheim/MC_studies/Dstlnu_Bt_generic/createBranchSeparatorData/axheim_data2_MC14_100kEvts/merged_gammas.root:variables
/nfs/dust/belle2/user/axelheim/MC_studies/Dstlnu_Bt_generic/createBranchSeparatorData/axheim_data2_MC14_100kEvts/merged_electrons.root:variables
/nfs/dust/belle2/user/axelheim/MC_studies/Dstlnu_Bt_generic/createBranchSeparatorData/axheim_data2_MC14_100kEvts/merged_pions.root:variables
/nfs/dust/belle2/user/axelheim/MC_studies/Dstlnu_Bt_generic/createBranchSeparatorData/axheim_data2_MC14_100kEvts/merged_kaons.root:variables
/nfs/dust/belle2/user/axelheim/MC_studies/Dstlnu_Bt_generic/createBranchSeparatorData/axheim_data2_MC14_100kEvts/merged_muons.root:variables


In [9]:
df_FSPs = pd.concat(dfs)

In [10]:
df_Y4S = fileY4S.arrays(library="pd")

In [11]:
print(df_FSPs.shape[0])
print(df_Y4S.shape[0])

9865292
211319


In [12]:
# delete FSPs for which no Y4S file entry was found
df_FSPs = df_FSPs[df_FSPs['__event__'].isin(df_Y4S["__event__"])]

In [13]:
print(df_FSPs.shape[0])
print(df_Y4S.shape[0])

9754944
211319


## take a sample if used in notebook for faster processing

In [14]:
#df_Y4Ssample = df_Y4S.sample(n=100)
#df_FSPssample = df_FSPs[df_FSPs['__event__'].isin(df_Y4Ssample["__event__"])]

In [15]:
#df_Y4S=df_Y4Ssample
#df_FSPs=df_FSPssample

In [16]:
print("df_FSPs.shape[0]:",df_FSPs.shape[0])
print("df_Y4S.shape[0]:",df_Y4S.shape[0])

df_FSPs.shape[0]: 4665
df_Y4S.shape[0]: 100


### delete particles which occur more than ones based on uniqueParticleIdentifier

In [17]:
groupsFSPs_uniqParID = pd.DataFrame({'count' : df_FSPs.groupby( ["__event__","uniqueParticleIdentifier"] ).size()}).reset_index()
#with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#    print(groupsFSPs_uniqParID.sort_values("count"))

In [18]:
print("df_FSPs.shape[0]:",df_FSPs.shape[0])
print("groupsFSPs_uniqParID.shape[0]:",groupsFSPs_uniqParID.shape[0])
print("df_Y4S.shape[0]:",df_Y4S.shape[0])

df_FSPs.shape[0]: 4665
groupsFSPs_uniqParID.shape[0]: 3476
df_Y4S.shape[0]: 100


In [19]:
# delete particles which occur more than ones (keep first) and if possible keep the one with basf2_used==1
print("df_FSPs[basf2_used].value_counts():",df_FSPs["basf2_used"].value_counts())
df_FSPs = df_FSPs.sort_values("basf2_used",ascending=False).drop_duplicates(subset=("__event__","uniqueParticleIdentifier"), keep='first')
print("df_FSPs[basf2_used].value_counts():",df_FSPs["basf2_used"].value_counts())

df_FSPs[basf2_used].value_counts(): 0.0    2456
1.0    2209
Name: basf2_used, dtype: int64
df_FSPs[basf2_used].value_counts(): 1.0    2209
0.0    1267
Name: basf2_used, dtype: int64


In [20]:
print("df_FSPs.shape[0]:",df_FSPs.shape[0])

df_FSPs.shape[0]: 3476


## check if category combinations make sense

In [21]:
groupsAllFSPs = pd.DataFrame({'count' : df_FSPs.groupby( ["basf2_used","basf2_Bsig","basf2_X"] ).size()}).reset_index()
groupsAllFSPs

Unnamed: 0,basf2_used,basf2_Bsig,basf2_X,count
0,0.0,0.0,0.0,1267
1,1.0,0.0,0.0,391
2,1.0,0.0,1.0,1114
3,1.0,1.0,0.0,704


### add two cols with extra info

In [22]:
# function to create col with the particles mother B's uniqueParticleIdentifier
def B_ID(s):
    label = 0
    for i in range(10): 
        mcMotheri_uniqParID = "mcMother{}_uniqParID".format(i)
        if ((s[mcMotheri_uniqParID]) == 83886082.0):
            label = 83886082   
        elif ((s[mcMotheri_uniqParID]) == 83886081.0):
            label = 83886081   
    return label
df_FSPs['B_ID'] = df_FSPs.apply(B_ID, axis=1)


In [23]:
# if particle was used by basf2 but neither for B-sig or X it is from the Hc
def Hc(s):
    label = 0
    if ((s["basf2_used"] == 1.0) & (s["basf2_Bsig"] == 0.0) & (s["basf2_X"] == 0.0)):
            label = 1   
    
    return label
df_FSPs['Hc'] = df_FSPs.apply(Hc, axis=1)


In [24]:
# this shows that Hc is sometimes combined from both B's which is of course wrong
#groupsAllFSPs = pd.DataFrame({'count' : df_FSPs.groupby( ["__event__","B_ID","Hc","basf2_used","basf2_Bsig","basf2_X"] ).size()}).reset_index()
#with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#    print(groupsAllFSPs.sort_values("Hc"))

## data preprocessing

### create df with per event info about which B is sig and which is tag

In [25]:
groupsAllFSPs = pd.DataFrame({'count' : df_FSPs.groupby(["__event__","B_ID","Hc"]).size(),
                             'sum_p': df_FSPs.groupby(["__event__","B_ID","Hc"])["p"].sum()}).reset_index()
#with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#    print(groupsAllFSPs[(groupsAllFSPs["__event__"] == 3183239)].sort_values("__event__"))

In [26]:
#print(groupsAllFSPs[groupsAllFSPs["Hc"] == 1].sort_values("__event__"))

In [27]:
events=[]
B_tag_IDs=[]
B_sig_IDs=[]

In [28]:
unclearHc = 0
for evt in pd.unique(groupsAllFSPs[groupsAllFSPs["Hc"] == 1]["__event__"]):
    # sort by sum_p to take "max_count_idx" from the H_c particles with more momentum if two categories have the same particle count
    singleEvt_Hcs = groupsAllFSPs[(groupsAllFSPs["Hc"] == 1) & (groupsAllFSPs["__event__"] == evt)].sort_values("sum_p",ascending=False)
    
    # B_ID=0 => background, so take the other one if available
    singleEvt_Hcs = singleEvt_Hcs[(singleEvt_Hcs["B_ID"] != 0)]
    
    if singleEvt_Hcs.empty:
        unclearHc += 1
        continue
    
    max_count_idx = singleEvt_Hcs["count"].idxmax()
    
    #print((max_count_idx))
    max_count_row = singleEvt_Hcs.loc[[max_count_idx]]
       
    
    #print(max_count_row,'\n\n\n\n')

        
    events.append(max_count_row.iloc[0]['__event__'])
    B_tag_IDs.append(max_count_row.iloc[0]['B_ID']) # this is Btag because it is Hc's mother B
    Bsig_tmp = 0
    if B_tag_IDs[-1] == 83886082:
        Bsig_tmp = 83886081.0
    elif B_tag_IDs[-1] == 83886081:
        Bsig_tmp = 83886082.0
    #else:
    #    unclearHc +=1
    #    print(singleEvt_Hcs)
    #    print(events[-1],B_tag_IDs[-1],max_count_idx)
    #    raise ValueError('Btag/Bsig assignment unclear')
    B_sig_IDs.append(Bsig_tmp)
    
print("in",unclearHc,"cases, Btag (B mother of Hc) was unclear")
print("equals to",round(unclearHc/(len(B_sig_IDs)+unclearHc) , 4)*100,"%")
    

in 0 cases, Btag (B mother of Hc) was unclear
equals to 0.0 %


In [29]:
event_Bs = pd.DataFrame(
{"__event__" : events,
"B_tag_ID" : B_tag_IDs,
"B_sig_ID" : B_sig_IDs})

In [30]:
# throw away events with unclear Btag
print("df_FSPs.shape[0]:",df_FSPs.shape[0])
df_FSPs = df_FSPs[df_FSPs['__event__'].isin(event_Bs["__event__"])]
print("df_FSPs.shape[0]:",df_FSPs.shape[0])

df_FSPs.shape[0]: 3476
df_FSPs.shape[0]: 3476


In [31]:
event_Bs[:10]

Unnamed: 0,__event__,B_tag_ID,B_sig_ID
0,190398.0,83886082.0,83886081.0
1,1281176.0,83886082.0,83886081.0
2,2126042.0,83886082.0,83886081.0
3,2653187.0,83886082.0,83886081.0
4,2686695.0,83886082.0,83886081.0
5,3765558.0,83886082.0,83886081.0
6,3988810.0,83886082.0,83886081.0
7,4222628.0,83886081.0,83886082.0
8,5241158.0,83886081.0,83886082.0
9,5837467.0,83886082.0,83886081.0


In [32]:
# check that B-tag and B-sig are not equal for any event -> only 2 rows shall appear here
pd.DataFrame({'count' : event_Bs.groupby(["B_tag_ID","B_sig_ID"]).size()}).reset_index()

Unnamed: 0,B_tag_ID,B_sig_ID,count
0,83886081.0,83886082.0,55
1,83886082.0,83886081.0,45


### save dataframes on NFS

In [33]:
if HTCondorRun == "isHTCondorRun":
    df_FSPs.to_csv(root_path + "df_FSPs_preProcessed.csv")
    event_Bs.to_csv(root_path + "event_Bs.csv")

### load dataframes

In [34]:
#df_FSPs = pd.read_csv(root_path + "df_FSPs_preProcessed.csv")
#event_Bs = pd.read_csv(root_path + "event_Bs.csv")

### delete Hc particles, after loading data, so saved df's cotain the Hc particles as well

In [35]:
print("df_FSPs.shape[0]:",df_FSPs.shape[0])
df_FSPs_final = df_FSPs[df_FSPs["Hc"] == 0]
print("df_FSPs_final.shape[0]:",df_FSPs_final.shape[0])

df_FSPs.shape[0]: 3476
df_FSPs_final.shape[0]: 3085


In [36]:
event_Bs.keys()

Index(['__event__', 'B_tag_ID', 'B_sig_ID'], dtype='object')

In [37]:
df_FSPs_final.keys()

Index(['__experiment__', '__run__', '__event__', '__candidate__',
       '__ncandidates__', '__weight__', 'basf2_X', 'basf2_used', 'basf2_Bsig',
       'isSignal', 'uniqueParticleIdentifier', 'mcErrors', 'mcPDG',
       'genMotherID', 'genMotherP', 'genMotherPDG', 'px', 'py', 'pz', 'pt',
       'p', 'E', 'kaonID', 'pionID', 'genMothPDG_0', 'genMothPDG_1',
       'genMothPDG_2', 'genMothPDG_3', 'genMothPDG_4', 'genMothPDG_5',
       'genMothPDG_6', 'genMothPDG_7', 'genMothPDG_8', 'genMothPDG_9',
       'mcMother0_uniqParID', 'mcMother1_uniqParID', 'mcMother2_uniqParID',
       'mcMother3_uniqParID', 'mcMother4_uniqParID', 'mcMother5_uniqParID',
       'mcMother6_uniqParID', 'mcMother7_uniqParID', 'mcMother8_uniqParID',
       'mcMother9_uniqParID', 'PDG', 'B_ID', 'Hc'],
      dtype='object')

# start of NN data creation

In [38]:
numFSPs = pd.DataFrame({'count' : df_FSPs_final.groupby( ["__event__"] ).size()}).reset_index()

minFSPs = numFSPs["count"].min()
maxFSPs = numFSPs["count"].max()
print("minFSPs:",minFSPs)
print("maxFSPs:",maxFSPs,'\n')

df_FSPs_final['numFSPs'] = df_FSPs_final.groupby('__event__')['__event__'].transform('count')

minFSPs: 17
maxFSPs: 47 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_FSPs_final['numFSPs'] = df_FSPs_final.groupby('__event__')['__event__'].transform('count')


In [39]:
data_dir = Path(nfs_path + "NNdata/" + data_subdir + root_subdir)    
data_dir.mkdir(parents=True, exist_ok=True)
print("Will save data to:", data_dir,'is', save_data ,'\n')

Will save data to: /nfs/dust/belle2/user/axelheim/MC_studies/Dstlnu_Bt_generic/NNdata/Dstlnu_Hc_corr_BsigX_separation_dataRun1/axheim_data2_MC14_100kEvts is True 



In [40]:
for num_FSPs_toData in range(minFSPs, maxFSPs+1):
    df_num_subset = df_FSPs_final.copy()
    df_num_subset = df_num_subset[df_num_subset['numFSPs'] == num_FSPs_toData]
    
        
    numEvents = df_num_subset.__event__.nunique()
    print("numEvents:",numEvents)
    print("num_FSPs_toData:",num_FSPs_toData)  
    if numEvents == 0:
        print("skipped because empty \n")
        continue

    if numEvents < 10:
        print("skipped because <10 events \n")
        continue
    
    num_features = 4
    leaves = np.zeros((numEvents, num_FSPs_toData,  num_features))  
    SA_target =  np.zeros((numEvents, num_FSPs_toData))
    global_tag = np.chararray((numEvents, num_FSPs_toData + 1), itemsize=30)
    
    event_list = df_num_subset[df_num_subset["numFSPs"] == num_FSPs_toData]["__event__"].unique()
    #print("len(event_list):",len(event_list))
    for i in range(numEvents):

        event_iter = event_list[i]

        global_tag_masterInfo = "evt" + str(event_iter)
        global_tag[i,-1] = global_tag_masterInfo
        #print("global_tag[i,-1]:",global_tag[i,-1])
        #print("i:",i,"event_iter:",event_iter)

        event_df = df_num_subset[df_num_subset.__event__ == event_iter]

        for j in range(num_FSPs_toData):
            #print("numParticle:",j)
            particle = event_df.iloc[j]

            #print(particle["mcPDG"],particle["px"],particle["py"],particle["pz"],particle["E"])
            leaves[i,j,0] = particle["px"]
            leaves[i,j,1] = particle["py"]
            leaves[i,j,2] = particle["pz"]
            leaves[i,j,3] = particle["E"]
            
            basf2_usage = "basf2_NONE"
            if particle["basf2_Bsig"] == 1.0:
                basf2_usage = "basf2_Bsig"
            elif particle["basf2_X"] == 1.0:
                basf2_usage = "basf2_X"
            elif particle["basf2_used"] == 0:
                basf2_usage = "basf2_bg"

            global_tag_Info = str((particle["mcPDG"])) 
            global_tag_Info += "_" + basf2_usage
            global_tag[i,j] = global_tag_Info

            label = -10 # error code if assignment fails
            B_tag_uniqID = event_Bs[event_Bs.__event__ == event_iter].iloc[0]['B_tag_ID']
            B_sig_uniqID = event_Bs[event_Bs.__event__ == event_iter].iloc[0]['B_sig_ID']
            if particle["B_ID"] == B_tag_uniqID:
                label = 1 # particle belongs to X (MC truth)
            elif particle["B_ID"] == B_sig_uniqID:
                label = 2 # particle belongs to Bsig (MC truth)
            elif particle["B_ID"] == 0:
                label = 0 # background
            
            
            SA_target[i,j] = label
            
        del event_df
        
        
    # shuffle the data    
    for idx in np.arange(leaves.shape[0]):   # arange is like range but gives ndarray instead of list
        perms = np.random.permutation(leaves.shape[1])

        leaves[idx,:] = leaves[idx,perms]
        SA_target[idx,:] = SA_target[idx,perms]
        global_tag[idx,0:-1] = global_tag[idx,perms]
        
        
         


    #print(global_tag)
    train_ratio = 0.75
    validation_ratio = 0.15
    test_ratio = 0.10

    print("leaves.shape:",leaves.shape)
    print("SA_target.shape:",SA_target.shape)
    print("global_tag.shape:",global_tag.shape)


    print("leaves[0]:",leaves[0])
    print("SA_target[0]:",SA_target[0])
    print("global_tag[0]:",global_tag[0])

    x=leaves
    y=SA_target
    z=global_tag

    x_train, x_test, y_train, y_test, z_train, z_test = train_test_split(x, y, z, test_size=1 - train_ratio, shuffle=False)
    x_val, x_test, y_val, y_test, z_val, z_test = train_test_split(x_test, y_test, z_test, test_size=test_ratio/(test_ratio + validation_ratio), shuffle=False) 

    if save_data==True:
        np.save(data_dir / "leaves_train_FSP{}.npy".format(num_FSPs_toData), x_train)
        np.save(data_dir / "is_left_arr_train_FSP{}.npy".format(num_FSPs_toData), y_train)
        np.save(data_dir / "global_tag_train_FSP{}.npy".format(num_FSPs_toData), z_train)

        np.save(data_dir / "leaves_val_FSP{}.npy".format(num_FSPs_toData), x_val)
        np.save(data_dir / "is_left_arr_val_FSP{}.npy".format(num_FSPs_toData), y_val)
        np.save(data_dir / "global_tag_val_FSP{}.npy".format(num_FSPs_toData), z_val)

        np.save(data_dir / "leaves_test_FSP{}.npy".format(num_FSPs_toData), x_test)
        np.save(data_dir / "is_left_arr_test_FSP{}.npy".format(num_FSPs_toData), y_test)
        np.save(data_dir / "global_tag_test_FSP{}.npy".format(num_FSPs_toData), z_test)

    
    print("")
    #del df_num_subset


    del df_num_subset
                                          

numEvents: 1
num_FSPs_toData: 17
skipped because <10 events 

numEvents: 0
num_FSPs_toData: 18
skipped because empty 

numEvents: 0
num_FSPs_toData: 19
skipped because empty 

numEvents: 2
num_FSPs_toData: 20
skipped because <10 events 

numEvents: 0
num_FSPs_toData: 21
skipped because empty 

numEvents: 2
num_FSPs_toData: 22
skipped because <10 events 

numEvents: 2
num_FSPs_toData: 23
skipped because <10 events 

numEvents: 6
num_FSPs_toData: 24
skipped because <10 events 

numEvents: 4
num_FSPs_toData: 25
skipped because <10 events 

numEvents: 4
num_FSPs_toData: 26
skipped because <10 events 

numEvents: 6
num_FSPs_toData: 27
skipped because <10 events 

numEvents: 5
num_FSPs_toData: 28
skipped because <10 events 

numEvents: 6
num_FSPs_toData: 29
skipped because <10 events 

numEvents: 12
num_FSPs_toData: 30
leaves.shape: (12, 30, 4)
SA_target.shape: (12, 30)
global_tag.shape: (12, 31)
leaves[0]: [[-1.12887910e-02 -3.29260081e-02  6.54010428e-03  3.54165466e-02]
 [-5.90942008e-03 

In [41]:
print("saving is done")
now = datetime.now()
print("time at end =", now)

saving is done
time at end = 2021-09-20 22:18:45.767296
