This notebook creates the `./data/trainData.h5` and `./data/testData.h5` files, which contain the training and testing data for the classifiers.

In [1]:
import pandas as pd
import numpy as np
import time
from os import path

In [2]:
# Destination folder and file names for the generated training/testing sets.
outputDirectory = './data'
outputTrain = 'trainData.h5'
outputTest = 'testData.h5'

# Stop right away when the destination folder is missing, rather than
# discovering it only when the files are written at the end of the notebook.
outputDirectoryExists = path.isdir(outputDirectory)
if not outputDirectoryExists:
    missingDirectoryMessage = "Directory " + outputDirectory + " does not exist."
    raise NotADirectoryError(missingDirectoryMessage)

In [3]:
import pandas as pd
import numpy as np
import time

In [4]:
def MomentumModulus(px, py, pz):
    """Return the modulus of a three-vector given its Cartesian components.

    The arguments may be scalars or array-likes supporting element-wise
    arithmetic (e.g. NumPy arrays or pandas Series); the result is
    sqrt(px^2 + py^2 + pz^2), computed element-wise.
    """
    squaredModulus = px**2 + py**2 + pz**2
    return np.sqrt(squaredModulus)

def TransverseMomentumModulus(px, py):
    """Return the modulus of the (x, y) projection of a vector.

    The arguments may be scalars or array-likes supporting element-wise
    arithmetic (e.g. NumPy arrays or pandas Series); the result is
    sqrt(px^2 + py^2), computed element-wise.
    """
    squaredTransverseModulus = px**2 + py**2
    return np.sqrt(squaredTransverseModulus)

Folder for all data files. The notebook will search for the HDF5 files here.

In [5]:
# Prefix prepended to the input file names when they are read. It is combined
# with the file name by plain string concatenation, so it must end with a
# path separator.
pathToHDF5Files = '~/data/'

The Monte Carlo HDF5 file should contain two $n$-tuples, one named `LHCbMC_Lb` (for reconstructed events) and one named `LHCbMCTruth_Lb` (for "truth" events).

The "real" data HDF5 file only needs one tree, named `LHCbData`.

In [6]:
# Monte Carlo input file; read below with the keys 'LHCbMC_Lb' and 'LHCbMCTruth_Lb'.
inputMC = 'LHCbMC_2016-2017-2018_MagUpDown_Lb2JPsiL_Ttracks_v12.h5'
# Real-data input file; read below with the key 'LHCbData'.
inputData = 'Custom_Shuffled5e5_LHCbData_2016_MagUpDown_Dimuon_Ttracks.h5'

# Monte Carlo data
## Reconstructed events

In [7]:
# Load the reconstructed-event table from the Monte Carlo file and report
# how long the read took.
mcRecoFilePath = pathToHDF5Files + inputMC
tickMC = time.perf_counter()
df_reco = pd.read_hdf(mcRecoFilePath, key='LHCbMC_Lb')
tockMC = time.perf_counter()
elapsedMC = tockMC - tickMC
print(f"Monte Carlo imported in {elapsedMC:0.4f} seconds.")

Monte Carlo imported in 0.9165 seconds.


## Truth events

In [8]:
# Load the truth-event table from the same Monte Carlo file and report
# how long the read took.
mcTruthFilePath = pathToHDF5Files + inputMC
tickMCTruth = time.perf_counter()
df_truth = pd.read_hdf(mcTruthFilePath, key='LHCbMCTruth_Lb')
tockMCTruth = time.perf_counter()
elapsedMCTruth = tockMCTruth - tickMCTruth
print(f"Monte Carlo Truth imported in {elapsedMCTruth:0.4f} seconds.")

Monte Carlo Truth imported in 12.6776 seconds.


## Merge the dataframes

In [9]:
tickMerge = time.perf_counter()

# Only truth rows with a non-negative 'Rec_key' take part in the merge
# (presumably negative keys mark truth events without a reconstructed
# candidate -- confirm against the n-tuple definition).
truthWithRecKey = df_truth.loc[df_truth['Rec_key'] >= 0]

# Join each reconstructed row to the truth row whose index equals its 'MC_key'.
df_MC = truthWithRecKey.merge(df_reco, left_index=True, right_on='MC_key')

# Keep only rows where both keys are non-negative.
hasValidKeys = (df_MC['MC_key'] >= 0) & (df_MC['Rec_key'] >= 0)
df_MC = df_MC.loc[hasValidKeys]

tockMerge = time.perf_counter()
elapsedMerge = tockMerge - tickMerge
print(f"Monte Carlo merged in {elapsedMerge:0.4f} seconds.")

Monte Carlo merged in 2.0133 seconds.


## Apply pre-selection cuts matching data

In [10]:
# Reference J/psi(1S) mass around which the dimuon mass window is centred
# (presumably the PDG value in MeV -- confirm).
JPsi1SPDGMass = 3096.900

# Momentum moduli used by the cuts below, evaluated once per particle.
pionMomentum = MomentumModulus(df_MC['pim_PX'], df_MC['pim_PY'], df_MC['pim_PZ'])
protonMomentum = MomentumModulus(df_MC['p_PX'], df_MC['p_PY'], df_MC['p_PZ'])
protonTransverseMomentum = TransverseMomentumModulus(df_MC['p_PX'], df_MC['p_PY'])
lambdaTransverseMomentum = TransverseMomentumModulus(df_MC['L_PX'], df_MC['L_PY'])

# Each cut is a boolean mask aligned with df_MC; all bounds are exclusive.
PionPCuts = pionMomentum.gt(2000) & pionMomentum.lt(5e5)
ProtonPCuts = protonMomentum.gt(10000) & protonMomentum.lt(5e5)
ProtonPTCuts = protonTransverseMomentum.gt(400)
## Combined m(p-pi)? Seems to be "AM" in the DaVinci opt file
LambdaMCuts = df_MC['L_M'].gt(600) & df_MC['L_M'].lt(1500)
LambdaMMCuts = df_MC['L_MM'].lt(1500)
LambdaZCuts = df_MC['L_ENDVERTEX_Z'].gt(5500) & df_MC['L_ENDVERTEX_Z'].lt(8500)
LambdaDiraCuts = df_MC['L_BPVDIRA'].gt(0.9999)
LambdaBPVIPCHI2Cuts = df_MC['L_BPVIPCHI2'].lt(200)
LambdaBPVVDCHI2Cuts = df_MC['L_BPVVDCHI2'].lt(2e7)
LambdaChi2Cuts = df_MC['L_VFASPF_CHI2_VDOF'].lt(750)
JPsiMCuts = (df_MC['Jpsi_M'] - JPsi1SPDGMass).abs().lt(90)
LambdaPTCuts = lambdaTransverseMomentum.gt(450)
## Combined m(JpsiLambda)? See comment above
LambdabMCuts = df_MC['Lb_M'].lt(8500)
LambdabDiraCuts = df_MC['Lb_BPVDIRA'].abs().gt(0.99)
LambdabBPVIPCHI2Cuts = df_MC['Lb_BPVIPCHI2'].lt(1750)
LambdabChi2Cuts = df_MC['Lb_VFASPF_CHI2_VDOF'].lt(150)

# A row survives only if it passes every individual cut (logical AND of all masks).
preselectionCuts = [
    PionPCuts,
    ProtonPCuts,
    ProtonPTCuts,
    LambdaMCuts,
    LambdaMMCuts,
    LambdaZCuts,
    LambdaDiraCuts,
    LambdaBPVIPCHI2Cuts,
    LambdaBPVVDCHI2Cuts,
    LambdaChi2Cuts,
    JPsiMCuts,
    LambdaPTCuts,
    LambdabMCuts,
    LambdabDiraCuts,
    LambdabBPVIPCHI2Cuts,
    LambdabChi2Cuts,
]
passesPreselection = preselectionCuts[0]
for singleCut in preselectionCuts[1:]:
    passesPreselection = passesPreselection & singleCut

df_MC_Filtered = df_MC.loc[passesPreselection]

df_MC_Filtered

Unnamed: 0_level_0,MCTRUTH_Lb_ENDVERTEX_X,MCTRUTH_Lb_ENDVERTEX_Y,MCTRUTH_Lb_ENDVERTEX_Z,MCTRUTH_L_ENDVERTEX_X,MCTRUTH_L_ENDVERTEX_Y,MCTRUTH_L_ENDVERTEX_Z,MCTRUTH_p_PX,MCTRUTH_p_PY,MCTRUTH_p_PZ,MCTRUTH_pim_PX,...,DTF_FixJPsiLambda_PIDSubs_p_PY,DTF_FixJPsiLambda_PIDSubs_p_PZ,DTF_FixJPsiLambda_PIDSubs_mum_PX,DTF_FixJPsiLambda_PIDSubs_mum_PY,DTF_FixJPsiLambda_PIDSubs_mum_PZ,DTF_FixJPsiLambda_PIDSubs_mup_PX,DTF_FixJPsiLambda_PIDSubs_mup_PY,DTF_FixJPsiLambda_PIDSubs_mup_PZ,TrackType_y,MC_key
Rec_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.9512,-0.1669,26.401600,343.742889,-192.429398,4751.548828,2031.640015,-1218.119995,29336.890625,445.510010,...,-186.566345,5411.963379,-804.469727,-535.944397,22858.960938,1438.071777,1619.347290,23634.302734,T,0
1,0.8237,-0.2123,3.514200,140.238602,325.837311,7463.102539,367.250000,1035.959961,24602.169922,174.279999,...,222.666214,4137.976074,-605.608093,594.454895,11170.495117,71.893692,-2873.631348,73489.484375,T,29
8,0.5442,-0.4049,-30.064199,-237.463104,-161.438004,5029.397461,-1999.180054,-1447.800049,43812.398438,-599.280029,...,-151.918823,7139.517578,100.262436,999.515259,50033.699219,-120.737198,-995.344788,6822.337402,T,277
14,0.9904,-0.2075,-9.769600,224.561996,56.081600,5921.048828,2894.560059,699.140015,74302.851562,293.929993,...,104.645416,10220.954102,-884.967468,195.843964,34903.839844,1645.699585,-1113.117432,24484.212891,T,367
17,0.8236,-0.2738,28.986401,-296.374695,-100.123802,5963.561523,-1268.760010,-522.340027,25478.310547,-184.460007,...,20.153040,3543.737305,-829.315674,-926.117676,37815.328125,2614.833252,-1553.520630,63936.839844,T,493
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
366374,1.9930,-0.9682,33.883499,454.162109,-444.944794,6047.499512,7053.950195,-6800.229980,92633.132812,949.369995,...,-1059.302368,13806.107422,2310.392578,-2437.838867,42988.765625,2057.492432,-877.707703,10357.686523,T,8921400
366378,1.2187,-0.0757,-10.080700,120.471603,-69.038300,7221.361328,1256.689941,-628.059998,71476.546875,189.229996,...,-259.237457,19192.976562,1663.037476,477.534760,14450.500000,-812.013245,-604.744568,27734.162109,T,8921503
366382,1.1575,-0.3940,43.910599,4.075100,-146.877396,7409.937500,133.949997,-2934.969971,147684.812500,-69.339996,...,-295.752747,15403.834961,196.509079,-679.625732,6668.261719,3547.472900,-3286.872803,190998.375000,T,8921603
366391,0.9741,0.2091,-11.659000,294.090698,39.875000,7125.287109,2412.750000,405.109985,60216.011719,395.200012,...,-18.306223,7767.518555,1504.345581,715.472656,49351.718750,-112.221489,2933.706299,37641.914062,T,8921673


# Real data

In [11]:
# Load the real-data table and report how long the read took.
dataFilePath = pathToHDF5Files + inputData
tickData = time.perf_counter()
df_Data = pd.read_hdf(dataFilePath, key='LHCbData')
tockData = time.perf_counter()
elapsedData = tockData - tickData
print(f"Data imported found in {elapsedData:0.4f} seconds.")

Data imported found in 0.9617 seconds.


## Select side bands

In [12]:
# Keep only candidates whose 'DTF_FixJPsiLambda_Lb_M' is strictly above the
# threshold. NOTE(review): only a region above the threshold is selected
# here; confirm that no lower side band is intended.
sidebandLowerEdge = 5803
inSideband = df_Data['DTF_FixJPsiLambda_Lb_M'] > sidebandLowerEdge
df_Data_Sideband = df_Data.loc[inSideband]
df_Data_Sideband

Unnamed: 0,Lb_ENDVERTEX_X,Lb_ENDVERTEX_Y,Lb_ENDVERTEX_Z,L_ENDVERTEX_X,L_ENDVERTEX_Y,L_ENDVERTEX_Z,Jpsi_ENDVERTEX_X,Jpsi_ENDVERTEX_Y,Jpsi_ENDVERTEX_Z,L_PX,...,DTF_FixJPsiLambda_PIDSubs_PV_key,DTF_FixJPsiLambda_PIDSubs_PV_X,DTF_FixJPsiLambda_PIDSubs_PV_Y,DTF_FixJPsiLambda_PIDSubs_PV_Z,DTF_FixJPsiLambda_PIDSubs_mum_PX,DTF_FixJPsiLambda_PIDSubs_mum_PY,DTF_FixJPsiLambda_PIDSubs_mum_PZ,DTF_FixJPsiLambda_PIDSubs_mup_PX,DTF_FixJPsiLambda_PIDSubs_mup_PY,DTF_FixJPsiLambda_PIDSubs_mup_PZ
9,0.954203,-0.344464,-53.680211,-262.045608,0.442579,5510.941369,0.9542,-0.3443,-53.6797,-1782.133090,...,0.0,0.8468,-0.2622,-54.893501,437.180298,2096.835693,11598.041992,1090.466187,-844.338867,13521.172852
10,0.594506,1.091681,4.248750,-130.962451,-12.215099,7696.425393,0.5944,1.0918,4.2498,-542.173222,...,1.0,0.7767,-0.1859,-8.075300,246.036743,-837.566833,18804.574219,-324.927582,2296.204346,21652.119141
11,0.510916,-0.100533,2.469290,-115.172428,-15.312432,7416.775760,0.5115,-0.1005,2.4713,-1604.083225,...,0.0,0.8078,-0.1900,-11.636300,-1915.078979,397.245697,40416.109375,1053.967896,-473.800598,14331.053711
12,-3.018767,-1.877033,116.476465,228.699716,60.309666,7438.445918,-3.0203,-1.8776,116.4960,470.992288,...,1.0,0.8418,-0.2003,56.337399,-4871.875977,-1851.765503,51975.703125,-3758.903076,-2457.851807,81130.992188
15,0.821776,-0.194724,-98.650781,704.486275,-114.239457,6479.514135,0.8206,-0.1960,-98.6433,2103.247630,...,0.0,0.8000,-0.2197,-98.998703,601.174988,459.194885,9058.416992,-953.844238,-2465.735107,11614.083984
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502096,0.879724,-0.135972,-41.057133,93.670514,7.303664,5570.798979,0.8800,-0.1366,-41.0625,464.499500,...,0.0,0.8832,-0.1605,-41.536201,-1089.839722,486.843903,8835.775391,-1582.450806,4219.804199,93952.031250
502104,0.765221,-0.124892,17.330386,216.976241,-2.940979,7724.294733,0.7632,-0.1235,17.3599,1791.506655,...,0.0,0.8249,-0.1796,16.311600,-1585.558838,3039.507324,45262.421875,-297.524719,-777.621033,12218.561523
502109,0.846524,-0.134417,32.394226,-64.169224,6.479162,5770.951530,0.8465,-0.1340,32.3945,-584.392519,...,0.0,0.8459,-0.1883,32.926998,556.332214,-1148.651367,10566.268555,512.927612,1943.064453,10574.164062
502110,0.871183,-0.286051,41.685312,34.181050,-103.878831,6986.884965,0.8709,-0.2864,41.6887,421.292007,...,0.0,0.8218,-0.2298,36.771099,445.235138,-514.793945,17956.791016,-2666.400391,-2992.653564,30364.205078


# Add missing features

In [13]:
# New transverse-momentum column -> the (x, y) momentum columns it is built from.
# The insertion order of this mapping is the order in which columns are added.
mcTransverseMomentumInputs = {
    'p_PT': ('p_PX', 'p_PY'),
    'DTF_FixJPsi_p_PT': ('DTF_FixJPsi_p_PX', 'DTF_FixJPsi_p_PY'),
    'DTF_FixJPsiLambda_p_PT': ('DTF_FixJPsiLambda_p_PX', 'DTF_FixJPsiLambda_p_PY'),
    'pim_PT': ('pim_PX', 'pim_PY'),
    'DTF_FixJPsi_pim_PT': ('DTF_FixJPsi_pim_PX', 'DTF_FixJPsi_pim_PY'),
    'DTF_FixJPsiLambda_pim_PT': ('DTF_FixJPsiLambda_pim_PX', 'DTF_FixJPsiLambda_pim_PY'),
    'Jpsi_PT': ('Jpsi_PX', 'Jpsi_PY'),
}

# assign() returns a new frame, so the filtered Monte Carlo frame is rebound
# rather than modified in place.
df_MC_Filtered = df_MC_Filtered.assign(**{
    ptColumn: TransverseMomentumModulus(df_MC_Filtered[xColumn], df_MC_Filtered[yColumn])
    for ptColumn, (xColumn, yColumn) in mcTransverseMomentumInputs.items()
})

In [14]:
# Numeric codes substituted for the textual fit-status labels found in the
# Monte Carlo status columns.
successDictionaryReverse = {
    'Success': 0.0,
    'Failed': 1.0,
    'NonConverged': 3.0
}

# Apply the same label -> code mapping to both status columns in one pass;
# other columns are left untouched.
statusColumns = ['DTF_FixJPsi_status', 'DTF_FixJPsiLambda_status']
statusReplacements = {statusColumn: successDictionaryReverse for statusColumn in statusColumns}
df_MC_Filtered = df_MC_Filtered.replace(statusReplacements)
df_MC_Filtered

Unnamed: 0_level_0,MCTRUTH_Lb_ENDVERTEX_X,MCTRUTH_Lb_ENDVERTEX_Y,MCTRUTH_Lb_ENDVERTEX_Z,MCTRUTH_L_ENDVERTEX_X,MCTRUTH_L_ENDVERTEX_Y,MCTRUTH_L_ENDVERTEX_Z,MCTRUTH_p_PX,MCTRUTH_p_PY,MCTRUTH_p_PZ,MCTRUTH_pim_PX,...,DTF_FixJPsiLambda_PIDSubs_mup_PZ,TrackType_y,MC_key,p_PT,DTF_FixJPsi_p_PT,DTF_FixJPsiLambda_p_PT,pim_PT,DTF_FixJPsi_pim_PT,DTF_FixJPsiLambda_pim_PT,Jpsi_PT
Rec_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.9512,-0.1669,26.401600,343.742889,-192.429398,4751.548828,2031.640015,-1218.119995,29336.890625,445.510010,...,23634.302734,T,0,3670.964844,2624.076172,2733.329834,311.089020,260.727386,334.741455,1259.644897
1,0.8237,-0.2123,3.514200,140.238602,325.837311,7463.102539,367.250000,1035.959961,24602.169922,174.279999,...,73489.484375,T,29,962.454834,1039.160889,1095.172607,292.199097,262.192261,278.820038,2350.277832
8,0.5442,-0.4049,-30.064199,-237.463104,-161.438004,5029.397461,-1999.180054,-1447.800049,43812.398438,-599.280029,...,6822.337402,T,277,6069.714355,2933.882812,354.068542,313.657623,496.833252,456.626129,16.075375
14,0.9904,-0.2075,-9.769600,224.561996,56.081600,5921.048828,2894.560059,699.140015,74302.851562,293.929993,...,24484.212891,T,367,3138.222168,2606.361572,2835.955811,563.971436,397.024719,499.553711,1183.876953
17,0.8236,-0.2738,28.986401,-296.374695,-100.123802,5963.561523,-1268.760010,-522.340027,25478.310547,-184.460007,...,63936.839844,T,493,1088.444824,1386.584595,1326.820923,503.365540,128.678238,192.515366,3060.318604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
366374,1.9930,-0.9682,33.883499,454.162109,-444.944794,6047.499512,7053.950195,-6800.229980,92633.132812,949.369995,...,10357.686523,T,8921400,9575.651367,11972.828125,9594.823242,1461.105103,1673.396362,1428.191162,5488.212402
366378,1.2187,-0.0757,-10.080700,120.471603,-69.038300,7221.361328,1256.689941,-628.059998,71476.546875,189.229996,...,27734.162109,T,8921503,1055.863892,1285.636230,1963.163696,429.001678,345.132141,352.585876,855.027405
366382,1.1575,-0.3940,43.910599,4.075100,-146.877396,7409.937500,133.949997,-2934.969971,147684.812500,-69.339996,...,190998.375000,T,8921603,4872.735840,3059.736816,1022.928284,265.291748,270.466003,294.877197,5446.465820
366391,0.9741,0.2091,-11.659000,294.090698,39.875000,7125.287109,2412.750000,405.109985,60216.011719,395.200012,...,37641.914062,T,8921673,2909.400391,2303.910400,2447.316895,426.464935,355.111664,397.183228,3911.406494


In [15]:
# New transverse-momentum column -> the (x, y) momentum columns it is built from.
# The insertion order of this mapping is the order in which columns are added.
dataTransverseMomentumInputs = {
    'p_PT': ('p_PX', 'p_PY'),
    'DTF_FixJPsi_p_PT': ('DTF_FixJPsi_p_PX', 'DTF_FixJPsi_p_PY'),
    'DTF_FixJPsiLambda_p_PT': ('DTF_FixJPsiLambda_p_PX', 'DTF_FixJPsiLambda_p_PY'),
    'pim_PT': ('pim_PX', 'pim_PY'),
    'DTF_FixJPsi_pim_PT': ('DTF_FixJPsi_pim_PX', 'DTF_FixJPsi_pim_PY'),
    'DTF_FixJPsiLambda_pim_PT': ('DTF_FixJPsiLambda_pim_PX', 'DTF_FixJPsiLambda_pim_PY'),
    'Jpsi_PT': ('Jpsi_PX', 'Jpsi_PY'),
}

# assign() returns a new frame, so the side-band frame is rebound rather than
# modified in place.
df_Data_Sideband = df_Data_Sideband.assign(**{
    ptColumn: TransverseMomentumModulus(df_Data_Sideband[xColumn], df_Data_Sideband[yColumn])
    for ptColumn, (xColumn, yColumn) in dataTransverseMomentumInputs.items()
})

df_Data_Sideband

Unnamed: 0,Lb_ENDVERTEX_X,Lb_ENDVERTEX_Y,Lb_ENDVERTEX_Z,L_ENDVERTEX_X,L_ENDVERTEX_Y,L_ENDVERTEX_Z,Jpsi_ENDVERTEX_X,Jpsi_ENDVERTEX_Y,Jpsi_ENDVERTEX_Z,L_PX,...,DTF_FixJPsiLambda_PIDSubs_mup_PX,DTF_FixJPsiLambda_PIDSubs_mup_PY,DTF_FixJPsiLambda_PIDSubs_mup_PZ,p_PT,DTF_FixJPsi_p_PT,DTF_FixJPsiLambda_p_PT,pim_PT,DTF_FixJPsi_pim_PT,DTF_FixJPsiLambda_pim_PT,Jpsi_PT
9,0.954203,-0.344464,-53.680211,-262.045608,0.442579,5510.941369,0.9542,-0.3443,-53.6797,-1782.133090,...,1090.466187,-844.338867,13521.172852,1904.555341,1.523500e+03,1646.064087,206.724964,217.829208,304.202637,2016.683654
10,0.594506,1.091681,4.248750,-130.962451,-12.215099,7696.425393,0.5944,1.0918,4.2498,-542.173222,...,-324.927582,2296.204346,21652.119141,454.513704,4.384777e+02,5698.880371,78.775462,77.422356,313.766510,1249.589483
11,0.510916,-0.100533,2.469290,-115.172428,-15.312432,7416.775760,0.5115,-0.1005,2.4713,-1604.083225,...,1053.967896,-473.800598,14331.053711,1017.301236,2.521960e+03,1832.192627,690.887812,444.030945,334.951111,854.123039
12,-3.018767,-1.877033,116.476465,228.699716,60.309666,7438.445918,-3.0203,-1.8776,116.4960,470.992288,...,-3758.903076,-2457.851807,81130.992188,456.500463,5.046539e+02,956.089355,38.721189,62.913738,44.764168,9530.580227
15,0.821776,-0.194724,-98.650781,704.486275,-114.239457,6479.514135,0.8206,-0.1960,-98.6433,2103.247630,...,-953.844238,-2465.735107,11614.083984,2083.186475,2.785406e+07,45111.359375,354.375256,539.114441,421.553406,2025.621167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502096,0.879724,-0.135972,-41.057133,93.670514,7.303664,5570.798979,0.8800,-0.1366,-41.0625,464.499500,...,-1582.450806,4219.804199,93952.031250,440.130983,6.244629e+02,6789.666016,506.169549,128.224396,1137.612183,5443.543081
502104,0.765221,-0.124892,17.330386,216.976241,-2.940979,7724.294733,0.7632,-0.1235,17.3599,1791.506655,...,-297.524719,-777.621033,12218.561523,1430.647395,9.440099e+02,3637.269775,414.150864,462.258026,1282.703369,2924.919771
502109,0.846524,-0.134417,32.394226,-64.169224,6.479162,5770.951530,0.8465,-0.1340,32.3945,-584.392519,...,512.927612,1943.064453,10574.164062,1423.279621,1.261785e+03,396.024902,327.216236,527.466553,185.857224,1345.985452
502110,0.871183,-0.286051,41.685312,34.181050,-103.878831,6986.884965,0.8709,-0.2864,41.6887,421.292007,...,-2666.400391,-2992.653564,30364.205078,792.898343,8.286541e+02,820.907776,630.490371,606.935486,411.936798,4155.462143


In [16]:
# Momentum columns: transverse (*_PT) and z (*_PZ) components of the proton,
# pion and J/psi, including the DTF_* variants of the proton and pion.
momentumFeatures = [
    'p_PT',
    'p_PZ',
    'DTF_FixJPsi_p_PT',
    'DTF_FixJPsi_p_PZ',
    'DTF_FixJPsiLambda_p_PT',
    'DTF_FixJPsiLambda_p_PZ',
    'pim_PT',
    'pim_PZ',
    'DTF_FixJPsi_pim_PT',
    'DTF_FixJPsi_pim_PZ',
    'DTF_FixJPsiLambda_pim_PT',
    'DTF_FixJPsiLambda_pim_PZ',
    'Jpsi_PT',
    'Jpsi_PZ',
]

# Lambda end-vertex coordinates.
vertexFeatures = [
    'L_ENDVERTEX_X',
    'L_ENDVERTEX_Y',
    'L_ENDVERTEX_Z',
]

# DIRA, vertex chi2/ndof, IP chi2 and VD chi2 columns of the Lambda (L_*)
# and Lambda_b (Lb_*).
qualityFeatures = [
    'L_BPVDIRA',
    'Lb_BPVDIRA',
    'L_VFASPF_CHI2_VDOF',
    'Lb_VFASPF_CHI2_VDOF',
    'L_BPVIPCHI2',
    'Lb_BPVIPCHI2',
    'L_BPVVDCHI2',
    'Lb_BPVVDCHI2',
]

# Status columns of the two DTF_* fits.
fitStatusFeatures = [
    'DTF_FixJPsi_status',
    'DTF_FixJPsiLambda_status',
]

# Full, ordered list of input columns written to the output files.
features = momentumFeatures + vertexFeatures + qualityFeatures + fitStatusFeatures

# Select training and testing data
Take all the data (assuming the signal and sideband samples are of comparable size). Shuffle it with `pandas.DataFrame.sample`.

## Signal data (type 1)

In [17]:
# Shuffle every filtered Monte Carlo row (frac=1) with a fixed seed so the
# order is reproducible, then label the rows as signal (TYPE = 1).
shuffledSignal = df_MC_Filtered.sample(frac=1, random_state=98)
df_signal = shuffledSignal.assign(TYPE=1)
df_signal

Unnamed: 0_level_0,MCTRUTH_Lb_ENDVERTEX_X,MCTRUTH_Lb_ENDVERTEX_Y,MCTRUTH_Lb_ENDVERTEX_Z,MCTRUTH_L_ENDVERTEX_X,MCTRUTH_L_ENDVERTEX_Y,MCTRUTH_L_ENDVERTEX_Z,MCTRUTH_p_PX,MCTRUTH_p_PY,MCTRUTH_p_PZ,MCTRUTH_pim_PX,...,TrackType_y,MC_key,p_PT,DTF_FixJPsi_p_PT,DTF_FixJPsiLambda_p_PT,pim_PT,DTF_FixJPsi_pim_PT,DTF_FixJPsiLambda_pim_PT,Jpsi_PT,TYPE
Rec_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
352003,0.9900,0.0169,3.300000,116.388496,30.117701,6282.886719,1290.660034,252.080002,66415.468750,197.050003,...,T,8575751,1499.261230,1449.125122,1339.061157,142.576462,222.223770,239.662476,3483.928467,1
265235,1.0184,0.1609,39.636501,962.438599,463.805511,7189.831543,8050.549805,3783.229980,59438.179688,1006.159973,...,T,6484636,6531.696777,7950.545410,7250.666016,1098.382568,1173.632935,1083.654663,8100.985840,1
148181,0.7204,-0.3519,57.459000,-193.378693,-403.258698,6942.271973,-1085.930054,-2357.689941,41237.300781,-240.050003,...,T,3663879,1195.592285,2846.242432,2590.605225,600.855042,531.966919,474.126373,314.042480,1
142519,3.3923,0.6250,-4.199400,938.192383,191.425995,5742.935547,10825.129883,2139.020020,66201.109375,1162.140015,...,T,3529217,8357.870117,13019.551758,10886.081055,954.039429,1251.309082,1106.244019,6472.403320,1
242204,0.8681,1.4937,8.098100,-151.000793,112.281601,4932.158691,-1066.640015,852.979980,33873.550781,-219.490005,...,T,5933505,931.300659,1355.522217,1152.498901,812.275085,194.003128,289.336945,4190.266602,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
254053,0.8669,0.1186,-20.466700,-109.663101,-47.713299,7491.895508,-818.690002,-339.369995,49225.160156,-17.280001,...,T,6216172,736.067566,905.189636,866.862000,41.733448,28.566000,29.015322,2405.740723,1
82685,1.2139,-0.9886,80.126602,256.224915,-405.731506,6392.206543,2266.000000,-3699.520020,58149.429688,662.109985,...,T,2048916,4044.346680,4247.795410,4354.338379,1076.822876,1121.902100,1156.929077,2567.075439,1
325358,1.3912,0.3859,82.847298,543.844421,-61.370899,5650.757324,2838.750000,-257.529999,29827.970703,657.669983,...,T,7930482,3212.848877,2960.609863,2977.199707,300.460205,564.119141,571.568848,4692.720215,1
300097,1.1159,0.2578,-50.207901,101.410599,128.787003,6285.319336,1082.810059,1308.699951,65906.679688,45.439999,...,T,7324779,2020.588989,1700.565674,1814.599121,230.976440,145.316101,162.045761,8384.406250,1


## Sidebands (type 0)

In [18]:
# Shuffle every side-band data row (frac=1) with a fixed seed so the order is
# reproducible, then label the rows as background (TYPE = 0).
shuffledBackground = df_Data_Sideband.sample(frac=1, random_state=98)
df_background = shuffledBackground.assign(TYPE=0)
df_background

Unnamed: 0,Lb_ENDVERTEX_X,Lb_ENDVERTEX_Y,Lb_ENDVERTEX_Z,L_ENDVERTEX_X,L_ENDVERTEX_Y,L_ENDVERTEX_Z,Jpsi_ENDVERTEX_X,Jpsi_ENDVERTEX_Y,Jpsi_ENDVERTEX_Z,L_PX,...,DTF_FixJPsiLambda_PIDSubs_mup_PY,DTF_FixJPsiLambda_PIDSubs_mup_PZ,p_PT,DTF_FixJPsi_p_PT,DTF_FixJPsiLambda_p_PT,pim_PT,DTF_FixJPsi_pim_PT,DTF_FixJPsiLambda_pim_PT,Jpsi_PT,TYPE
28897,0.642704,-0.484829,-19.538637,-66.521678,-41.245603,5718.943484,0.6430,-0.4841,-19.5435,-509.996756,...,-4024.802979,47781.570312,1163.196581,470.725647,155.039932,410.491535,202.408279,9232.249023,4941.192695,0
459044,0.949720,0.141961,-17.777687,-58.294443,-161.454433,7014.824834,0.9509,0.1431,-17.7690,-957.007934,...,1839.569092,12046.143555,1919.734870,4941.097168,7097.678711,1166.447101,1179.657837,1349.979004,1101.150685,0
185001,0.770393,0.028391,1.206817,-1261.291922,2.754168,7526.806847,0.7705,0.0283,1.2093,-2661.591285,...,-2603.352783,42431.714844,2032.556586,3639.465576,3546.886475,269.843432,168.569916,97.409195,2028.476589,0
142856,0.829848,-0.246675,16.307852,98.091390,7.922506,6592.827375,0.8307,-0.2465,16.3120,1215.891286,...,-498.062378,34485.742188,564.124739,1412.951172,1476.258301,498.342746,463.623413,58.872524,1176.263813,0
488176,1.000577,-0.004293,59.323659,152.458419,56.572256,7907.576215,0.9890,0.0010,59.2685,475.039922,...,543.355103,6826.803223,451.150732,614.410950,827.602844,60.362110,46.151928,48.976830,3820.855353,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53960,0.769625,-0.137157,-1.921042,333.144757,-38.314677,7726.183021,0.7695,-0.1370,-1.9239,1414.586463,...,-1948.986938,38782.527344,645.364567,658.586609,12712.377930,757.699363,759.258667,5040.844727,2990.624457,0
192992,0.874200,-0.307568,8.344550,26.918351,102.189184,7123.010733,0.8658,-0.3103,8.3867,-25.430545,...,-446.432861,17866.273438,592.379896,920.779602,1599.084717,315.389950,180.996719,677.035889,3759.275156,0
105515,1.507326,-1.055930,53.047628,-57.905745,182.407189,7814.761008,1.5095,-1.0582,53.0698,-441.493965,...,-2071.187500,18043.048828,636.473898,505.863983,7521.579102,799.192766,711.814575,5866.992188,3322.858089,0
358893,0.774382,-0.002810,58.849712,-75.592934,149.248888,7464.667601,0.7751,-0.0029,58.8506,-527.174437,...,259.170074,10938.573242,681.055386,581.269653,986.649902,643.814391,1090.433594,1912.016602,613.784354,0


## Merge signal & background
Use 90% of the data for training, 10% for testing.

In [19]:
## Splitting points at 90% of the DF length
signalSplitPoint = int(len(df_signal) * 0.9)
backgroundSplitPoint = int(len(df_background) * 0.9)

In [20]:
# Columns written to the training file: the input features, the class label
# and the DTF_FixJPsiLambda_Lb_M mass column.
trainColumns = features + ['TYPE', 'DTF_FixJPsiLambda_Lb_M']

# Stack the first part of the shuffled signal and background frames,
# discarding their original indices, and keep only the output columns.
# Columns containing any missing value are then dropped. `axis` is passed by
# keyword because pandas >= 2.0 no longer accepts it positionally, and the
# call is chained instead of using inplace=True so that no intermediate
# frame is mutated.
# NOTE(review): the testing set drops its NaN columns independently; confirm
# that both sets always end up with the same columns.
df_train = pd.concat(
    [df_signal.iloc[:signalSplitPoint], df_background.iloc[:backgroundSplitPoint]],
    ignore_index=True
)[trainColumns].dropna(axis='columns')
df_train

Unnamed: 0,p_PT,p_PZ,DTF_FixJPsi_p_PT,DTF_FixJPsi_p_PZ,DTF_FixJPsiLambda_p_PT,DTF_FixJPsiLambda_p_PZ,pim_PT,pim_PZ,DTF_FixJPsi_pim_PT,DTF_FixJPsi_pim_PZ,...,L_VFASPF_CHI2_VDOF,Lb_VFASPF_CHI2_VDOF,L_BPVIPCHI2,Lb_BPVIPCHI2,L_BPVVDCHI2,Lb_BPVVDCHI2,DTF_FixJPsi_status,DTF_FixJPsiLambda_status,TYPE,DTF_FixJPsiLambda_Lb_M
0,1499.261230,64090.687500,1449.125122,71109.312500,1339.061157,67585.578125,142.576462,16002.674805,222.223770,15295.935547,...,12.397956,1.006837,3.118125,4.247471,8394.141602,839.898499,0.0,0.0,1,5649.782715
1,6531.696777,43770.664062,7950.545410,53713.472656,7250.666016,48946.476562,1098.382568,7154.821289,1173.632935,7339.285645,...,72.018669,4.932845,9.285525,0.700478,316576.343750,28.328979,0.0,0.0,1,5346.287598
2,1195.592285,25513.001953,2846.242432,45061.835938,2590.605225,40850.550781,600.855042,6536.190430,531.966919,6502.588379,...,58.929020,38.498478,109.721619,0.822907,48769.886719,969.064880,0.0,0.0,1,5625.960449
3,8357.870117,48524.460938,13019.551758,77204.125000,10886.081055,65169.863281,954.039429,8363.340820,1251.309082,8721.902344,...,4.634126,2.500692,5.822659,0.760508,2613.000244,11152.960938,0.0,0.0,1,5480.049316
4,931.300659,24994.496094,1355.522217,32809.832031,1152.498901,30213.080078,812.275085,7415.243652,194.003128,7669.465332,...,2.344827,9.562636,28.231209,3.574740,1273.676514,22431.037109,0.0,1.0,1,5446.454102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228493,1220.226567,72937.663691,1427.535278,85209.820312,1239.802612,67344.187500,1089.563873,38845.872752,961.556641,37909.199219,...,0.152025,0.212239,0.582464,8.037920,2371.744690,126.815938,0.0,1.0,0,6832.868652
228494,1083.955904,55949.470793,1105.869141,60395.519531,892.092041,48183.257812,450.382581,27721.281600,485.050476,26523.736328,...,0.300803,1.908871,0.620846,1.462356,74049.033084,19.377282,0.0,1.0,0,6612.175781
228495,567.501520,37682.446927,5925.211914,436724.312500,1658.590454,121800.218750,398.880250,12929.648726,275.511749,10452.941406,...,257.919474,67.560202,197.839502,175.666290,368006.797335,212.409329,0.0,1.0,0,8911.302734
228496,758.780272,33543.168911,941.613098,42739.785156,7304.916016,323111.187500,705.825113,29903.281153,768.507690,34001.078125,...,0.161039,2.624455,7.957362,99.010018,4205.262286,418.366858,0.0,1.0,0,9774.763672


In [21]:
# Columns written to the testing file: the input features, the class label
# and the DTF_FixJPsiLambda_Lb_M mass column.
testColumns = features + ['TYPE', 'DTF_FixJPsiLambda_Lb_M']

# Stack the remaining part of the shuffled signal and background frames,
# discarding their original indices, and keep only the output columns.
# Columns containing any missing value are then dropped. `axis` is passed by
# keyword because pandas >= 2.0 no longer accepts it positionally, and the
# call is chained instead of using inplace=True so that no intermediate
# frame is mutated.
# NOTE(review): the training set drops its NaN columns independently; confirm
# that both sets always end up with the same columns.
df_test = pd.concat(
    [df_signal.iloc[signalSplitPoint:], df_background.iloc[backgroundSplitPoint:]],
    ignore_index=True
)[testColumns].dropna(axis='columns')
df_test

Unnamed: 0,p_PT,p_PZ,DTF_FixJPsi_p_PT,DTF_FixJPsi_p_PZ,DTF_FixJPsiLambda_p_PT,DTF_FixJPsiLambda_p_PZ,pim_PT,pim_PZ,DTF_FixJPsi_pim_PT,DTF_FixJPsi_pim_PZ,...,L_VFASPF_CHI2_VDOF,Lb_VFASPF_CHI2_VDOF,L_BPVIPCHI2,Lb_BPVIPCHI2,L_BPVVDCHI2,Lb_BPVVDCHI2,DTF_FixJPsi_status,DTF_FixJPsiLambda_status,TYPE,DTF_FixJPsiLambda_Lb_M
0,2504.376465,27455.556641,2435.769775,27417.642578,2798.346191,31182.896484,416.471710,5826.424316,492.247925,5925.514648,...,0.116805,0.334404,0.975463,2.368801,8.063968e+03,1031.350098,0.0,0.0,1,5678.738770
1,3678.204834,28957.984375,8009.698242,63035.414062,4010.383301,31700.259766,511.451996,4495.157227,606.036377,5295.169922,...,129.013321,2.213622,3.035450,4.987368,1.139099e+06,132.812775,0.0,0.0,1,5152.405273
2,6151.153320,62528.351562,3390.550049,34433.933594,3191.249023,32347.107422,908.855103,8341.050781,954.033325,8586.155273,...,3.052457,1.626008,3.995702,3.794894,5.994128e+05,1733.996338,0.0,0.0,1,5610.329102
3,2215.048828,16164.611328,3509.922363,25586.275391,3299.109375,24183.917969,1217.576172,8981.918945,1129.961060,8099.306641,...,3.684547,6.168911,7.633062,1.222096,6.435177e+03,1848.107422,0.0,0.0,1,5655.911133
4,1299.116333,50831.921875,1585.503662,49852.496094,1661.818237,48686.039062,1164.087158,16436.048828,707.395447,15532.739258,...,3.191066,0.135195,0.518211,1.912595,1.069037e+03,329.424927,0.0,1.0,1,5623.918457
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25385,645.364567,14948.375359,658.586609,15131.017578,12712.377930,261919.109375,757.699363,17478.384328,759.258667,17625.787109,...,1.038850,0.812649,0.076899,11.266728,1.439183e+03,61.443946,0.0,3.0,0,10345.851562
25386,592.379896,28444.692313,920.779602,47065.058594,1599.084717,115516.109375,315.389950,22296.099434,180.996719,18559.517578,...,0.067244,21.352331,56.091567,94.749679,2.668843e+05,448.338019,0.0,3.0,0,11415.735352
25387,636.473898,27506.147195,505.863983,20288.097656,7521.579102,311244.218750,799.192766,31655.233084,711.814575,28853.048828,...,2.411407,2.617817,1.098988,24.841964,1.060404e+04,59949.689634,0.0,3.0,0,18485.015625
25388,681.055386,28981.688872,581.269653,24546.640625,986.649902,44888.769531,643.814391,28848.166781,1090.433594,46704.347656,...,0.841319,14.690842,32.489640,125.641237,1.039346e+06,1235.642132,0.0,1.0,0,8497.440430


# Save HDF5 files

In [22]:
tickOutput = time.perf_counter()
df_train.to_hdf(outputDirectory + '/' + outputTrain, 'LHCb_Train', 'w')
df_test.to_hdf(outputDirectory + '/' + outputTest, 'LHCb_Test', 'w')
tockOutput = time.perf_counter()
print(f"Output saved in {tockOutput - tickOutput:0.4f} seconds.")

Output saved in 0.0832 seconds.
