This notebook creates the `./data/trainData.h5` and `./data/testData.h5` files, which contain the training and testing data for the classifiers.

In [6]:
import pandas as pd
import numpy as np
import time
from os import path
import pathlib

In [7]:
# Destination folder for the generated datasets and the file name of each split.
outputDirectory = './data'
outputTrain, outputTest = 'trainData.h5', 'testData.h5'

# Create the destination folder (and any missing parents) before anything is
# written to it; an already existing folder is left as it is.
pathlib.Path(outputDirectory).mkdir(exist_ok=True, parents=True)

In [8]:
import pandas as pd
import numpy as np
import time

In [9]:
def MomentumModulus(px, py, pz):
    """Return the magnitude of the three-vector (px, py, pz).

    The arithmetic is element-wise, so the components can be plain numbers or
    array-like objects such as the DataFrame columns this notebook passes in.
    The result is sqrt(px^2 + py^2 + pz^2).
    """
    squaredModulus = px**2 + py**2 + pz**2
    return np.sqrt(squaredModulus)

def TransverseMomentumModulus(px, py):
    """Return the magnitude of the two-vector (px, py).

    The arithmetic is element-wise, so the components can be plain numbers or
    array-like objects such as the DataFrame columns this notebook passes in.
    The result is sqrt(px^2 + py^2).
    """
    squaredModulus = px**2 + py**2
    return np.sqrt(squaredModulus)

Folder for all data files. The notebook will search for the HDF5 files here.

In [10]:
# Folder that holds the input HDF5 files. The leading `~` is expanded to the
# current user's home directory right here, so the value is a real filesystem
# path for every consumer rather than only for readers that expand `~` themselves.
# The trailing slash is kept.
pathToHDF5Files = path.expanduser('~/data/')

The Monte Carlo HDF5 file should contain two $n$-tuples, one named `LHCbMC_Lb` (for reconstructed events) and one named `LHCbMCTruth_Lb` (for "truth" events).

The "real" data HDF5 file only needs one tree, named `LHCbData`.

In [11]:
# Monte Carlo file; read further down with the keys 'LHCbMC_Lb' and 'LHCbMCTruth_Lb'.
inputMC = 'LHCbMC_2016-2017-2018_MagUpDown_Lb2JPsiL_Ttracks_v12.h5'
# Recorded-data file; read further down with the key 'LHCbData'.
inputData = 'Custom_Shuffled5e6_LHCbData_2016_MagUpDown_Dimuon_Ttracks.h5'

# Monte Carlo data
## Reconstructed events

In [12]:
# Load the reconstructed-event table (key 'LHCbMC_Lb') from the Monte Carlo file
# and report how long the read took.
tickMC = time.perf_counter()
# path.join only inserts a separator when one is missing, so the folder setting
# works with or without a trailing slash (plain `+` needed the slash).
df_reco = pd.read_hdf(path.join(pathToHDF5Files, inputMC), key='LHCbMC_Lb')
tockMC = time.perf_counter()
print(f"Monte Carlo imported in {tockMC - tickMC:0.4f} seconds.")

Monte Carlo imported in 1.1349 seconds.


## Truth events

In [13]:
# Load the truth-level table (key 'LHCbMCTruth_Lb') from the same Monte Carlo
# file and report how long the read took.
tickMCTruth = time.perf_counter()
# path.join only inserts a separator when one is missing, so the folder setting
# works with or without a trailing slash (plain `+` needed the slash).
df_truth = pd.read_hdf(path.join(pathToHDF5Files, inputMC), key='LHCbMCTruth_Lb')
tockMCTruth = time.perf_counter()
print(f"Monte Carlo Truth imported in {tockMCTruth - tickMCTruth:0.4f} seconds.")

Monte Carlo Truth imported in 14.4109 seconds.


## Merge the dataframes

In [37]:
tickMerge = time.perf_counter()
# Keep only the truth rows whose 'Rec_key' is non-negative, then attach the
# reconstructed rows whose 'MC_key' equals the truth row's index label.
truthWithRecKey = df_truth.loc[df_truth['Rec_key'] >= 0]
df_MC = truthWithRecKey.merge(df_reco, left_index=True, right_on='MC_key')
# After the merge, keep only rows where both link keys are non-negative.
hasBothKeys = (df_MC['MC_key'] >= 0) & (df_MC['Rec_key'] >= 0)
df_MC = df_MC.loc[hasBothKeys]
tockMerge = time.perf_counter()
print(f"Monte Carlo merged in {tockMerge - tickMerge:0.4f} seconds.")

Monte Carlo merged in 3.0456 seconds.


## Apply pre-selection cuts matching data

In [38]:
# Reference mass used to centre the J/psi mass window below.
JPsi1SPDGMass = 3096.900

# Momentum magnitudes that several cuts share, computed once.
pionP = MomentumModulus(df_MC['pim_PX'], df_MC['pim_PY'], df_MC['pim_PZ'])
protonP = MomentumModulus(df_MC['p_PX'], df_MC['p_PY'], df_MC['p_PZ'])
protonPT = TransverseMomentumModulus(df_MC['p_PX'], df_MC['p_PY'])
lambdaPT = TransverseMomentumModulus(df_MC['L_PX'], df_MC['L_PY'])

# --- Pion and proton cuts ---
PionPCuts = (pionP > 2000) & (pionP < 5e5)
ProtonPCuts = (protonP > 10000) & (protonP < 5e5)
ProtonPTCuts = protonPT > 400

# --- Lambda cuts ---
## Combined m(p-pi)? Seems to be "AM" in the DaVinci opt file
LambdaMCuts = (df_MC['L_M'] > 600) & (df_MC['L_M'] < 1500)
LambdaMMCuts = df_MC['L_MM'] < 1500
LambdaZCuts = (df_MC['L_ENDVERTEX_Z'] > 5500) & (df_MC['L_ENDVERTEX_Z'] < 8500)
LambdaDiraCuts = df_MC['L_BPVDIRA'] > 0.9999
LambdaBPVIPCHI2Cuts = df_MC['L_BPVIPCHI2'] < 200
LambdaBPVVDCHI2Cuts = df_MC['L_BPVVDCHI2'] < 2e7
LambdaChi2Cuts = df_MC['L_VFASPF_CHI2_VDOF'] < 750
LambdaPTCuts = lambdaPT > 450

# --- J/psi cut: mass within 90 of the reference value ---
JPsiMCuts = (df_MC['Jpsi_M'] - JPsi1SPDGMass).abs() < 90

# --- Lambda_b cuts ---
## Combined m(JpsiLambda)? See comment above
LambdabMCuts = df_MC['Lb_M'] < 8500
LambdabDiraCuts = df_MC['Lb_BPVDIRA'].abs() > 0.99
LambdabBPVIPCHI2Cuts = df_MC['Lb_BPVIPCHI2'] < 1750
LambdabChi2Cuts = df_MC['Lb_VFASPF_CHI2_VDOF'] < 150

# A row survives only if it passes every cut: AND the masks together one by one.
preselectionCuts = [
    PionPCuts,
    ProtonPCuts,
    ProtonPTCuts,
    LambdaMCuts,
    LambdaMMCuts,
    LambdaZCuts,
    LambdaDiraCuts,
    LambdaBPVIPCHI2Cuts,
    LambdaBPVVDCHI2Cuts,
    LambdaChi2Cuts,
    JPsiMCuts,
    LambdaPTCuts,
    LambdabMCuts,
    LambdabDiraCuts,
    LambdabBPVIPCHI2Cuts,
    LambdabChi2Cuts,
]
passesPreselection = preselectionCuts[0]
for cut in preselectionCuts[1:]:
    passesPreselection = passesPreselection & cut

df_MC_Filtered = df_MC.loc[passesPreselection]

df_MC_Filtered

Unnamed: 0_level_0,MCTRUTH_Lb_ENDVERTEX_X,MCTRUTH_Lb_ENDVERTEX_Y,MCTRUTH_Lb_ENDVERTEX_Z,MCTRUTH_L_ENDVERTEX_X,MCTRUTH_L_ENDVERTEX_Y,MCTRUTH_L_ENDVERTEX_Z,MCTRUTH_p_PX,MCTRUTH_p_PY,MCTRUTH_p_PZ,MCTRUTH_pim_PX,...,DTF_FixJPsiLambda_PIDSubs_p_PY,DTF_FixJPsiLambda_PIDSubs_p_PZ,DTF_FixJPsiLambda_PIDSubs_mum_PX,DTF_FixJPsiLambda_PIDSubs_mum_PY,DTF_FixJPsiLambda_PIDSubs_mum_PZ,DTF_FixJPsiLambda_PIDSubs_mup_PX,DTF_FixJPsiLambda_PIDSubs_mup_PY,DTF_FixJPsiLambda_PIDSubs_mup_PZ,TrackType_y,MC_key
Rec_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.9512,-0.1669,26.401600,343.742889,-192.429398,4751.548828,2031.640015,-1218.119995,29336.890625,445.510010,...,-186.566345,5411.963379,-804.469727,-535.944397,22858.960938,1438.071777,1619.347290,23634.302734,T,0
1,0.8237,-0.2123,3.514200,140.238602,325.837311,7463.102539,367.250000,1035.959961,24602.169922,174.279999,...,222.666214,4137.976074,-605.608093,594.454895,11170.495117,71.893692,-2873.631348,73489.484375,T,29
8,0.5442,-0.4049,-30.064199,-237.463104,-161.438004,5029.397461,-1999.180054,-1447.800049,43812.398438,-599.280029,...,-151.918823,7139.517578,100.262436,999.515259,50033.699219,-120.737198,-995.344788,6822.337402,T,277
14,0.9904,-0.2075,-9.769600,224.561996,56.081600,5921.048828,2894.560059,699.140015,74302.851562,293.929993,...,104.645416,10220.954102,-884.967468,195.843964,34903.839844,1645.699585,-1113.117432,24484.212891,T,367
17,0.8236,-0.2738,28.986401,-296.374695,-100.123802,5963.561523,-1268.760010,-522.340027,25478.310547,-184.460007,...,20.153040,3543.737305,-829.315674,-926.117676,37815.328125,2614.833252,-1553.520630,63936.839844,T,493
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
366374,1.9930,-0.9682,33.883499,454.162109,-444.944794,6047.499512,7053.950195,-6800.229980,92633.132812,949.369995,...,-1059.302368,13806.107422,2310.392578,-2437.838867,42988.765625,2057.492432,-877.707703,10357.686523,T,8921400
366378,1.2187,-0.0757,-10.080700,120.471603,-69.038300,7221.361328,1256.689941,-628.059998,71476.546875,189.229996,...,-259.237457,19192.976562,1663.037476,477.534760,14450.500000,-812.013245,-604.744568,27734.162109,T,8921503
366382,1.1575,-0.3940,43.910599,4.075100,-146.877396,7409.937500,133.949997,-2934.969971,147684.812500,-69.339996,...,-295.752747,15403.834961,196.509079,-679.625732,6668.261719,3547.472900,-3286.872803,190998.375000,T,8921603
366391,0.9741,0.2091,-11.659000,294.090698,39.875000,7125.287109,2412.750000,405.109985,60216.011719,395.200012,...,-18.306223,7767.518555,1504.345581,715.472656,49351.718750,-112.221489,2933.706299,37641.914062,T,8921673


# Real data

In [39]:
# Load the recorded-data table (key 'LHCbData') and report how long the read took.
tickData = time.perf_counter()
# path.join only inserts a separator when one is missing, so the folder setting
# works with or without a trailing slash (plain `+` needed the slash).
df_Data = pd.read_hdf(path.join(pathToHDF5Files, inputData), key='LHCbData')
tockData = time.perf_counter()
print(f"Data imported in {tockData - tickData:0.4f} seconds.")

Data imported found in 10.9338 seconds.


# Add missing features

In [40]:
# New transverse-momentum column -> the (x, y) component columns it is built from.
# The dictionary order is the order in which the columns are appended.
transverseInputsMC = {
    'p_PT': ('p_PX', 'p_PY'),
    'DTF_FixJPsi_p_PT': ('DTF_FixJPsi_p_PX', 'DTF_FixJPsi_p_PY'),
    'DTF_FixJPsiLambda_p_PT': ('DTF_FixJPsiLambda_p_PX', 'DTF_FixJPsiLambda_p_PY'),
    'pim_PT': ('pim_PX', 'pim_PY'),
    'DTF_FixJPsi_pim_PT': ('DTF_FixJPsi_pim_PX', 'DTF_FixJPsi_pim_PY'),
    'DTF_FixJPsiLambda_pim_PT': ('DTF_FixJPsiLambda_pim_PX', 'DTF_FixJPsiLambda_pim_PY'),
    'Jpsi_PT': ('Jpsi_PX', 'Jpsi_PY'),
}

df_MC_Filtered = df_MC_Filtered.assign(**{
    ptColumn: TransverseMomentumModulus(df_MC_Filtered[xColumn], df_MC_Filtered[yColumn])
    for ptColumn, (xColumn, yColumn) in transverseInputsMC.items()
})

In [41]:
# Numeric code substituted for each textual fit status.
successDictionaryReverse = {
    'Success': 0.0,
    'Failed': 1.0,
    'NonConverged': 3.0,
}

# Apply the same substitution to both status columns, modifying the frame in place.
for statusColumn in ('DTF_FixJPsi_status', 'DTF_FixJPsiLambda_status'):
    df_MC_Filtered.replace({statusColumn: successDictionaryReverse}, inplace=True)
df_MC_Filtered

Unnamed: 0_level_0,MCTRUTH_Lb_ENDVERTEX_X,MCTRUTH_Lb_ENDVERTEX_Y,MCTRUTH_Lb_ENDVERTEX_Z,MCTRUTH_L_ENDVERTEX_X,MCTRUTH_L_ENDVERTEX_Y,MCTRUTH_L_ENDVERTEX_Z,MCTRUTH_p_PX,MCTRUTH_p_PY,MCTRUTH_p_PZ,MCTRUTH_pim_PX,...,DTF_FixJPsiLambda_PIDSubs_mup_PZ,TrackType_y,MC_key,p_PT,DTF_FixJPsi_p_PT,DTF_FixJPsiLambda_p_PT,pim_PT,DTF_FixJPsi_pim_PT,DTF_FixJPsiLambda_pim_PT,Jpsi_PT
Rec_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.9512,-0.1669,26.401600,343.742889,-192.429398,4751.548828,2031.640015,-1218.119995,29336.890625,445.510010,...,23634.302734,T,0,3670.964844,2624.076172,2733.329834,311.089020,260.727386,334.741455,1259.644897
1,0.8237,-0.2123,3.514200,140.238602,325.837311,7463.102539,367.250000,1035.959961,24602.169922,174.279999,...,73489.484375,T,29,962.454834,1039.160889,1095.172607,292.199097,262.192261,278.820038,2350.277832
8,0.5442,-0.4049,-30.064199,-237.463104,-161.438004,5029.397461,-1999.180054,-1447.800049,43812.398438,-599.280029,...,6822.337402,T,277,6069.714355,2933.882812,354.068542,313.657623,496.833252,456.626129,16.075375
14,0.9904,-0.2075,-9.769600,224.561996,56.081600,5921.048828,2894.560059,699.140015,74302.851562,293.929993,...,24484.212891,T,367,3138.222168,2606.361572,2835.955811,563.971436,397.024719,499.553711,1183.876953
17,0.8236,-0.2738,28.986401,-296.374695,-100.123802,5963.561523,-1268.760010,-522.340027,25478.310547,-184.460007,...,63936.839844,T,493,1088.444824,1386.584595,1326.820923,503.365540,128.678238,192.515366,3060.318604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
366374,1.9930,-0.9682,33.883499,454.162109,-444.944794,6047.499512,7053.950195,-6800.229980,92633.132812,949.369995,...,10357.686523,T,8921400,9575.651367,11972.828125,9594.823242,1461.105103,1673.396362,1428.191162,5488.212402
366378,1.2187,-0.0757,-10.080700,120.471603,-69.038300,7221.361328,1256.689941,-628.059998,71476.546875,189.229996,...,27734.162109,T,8921503,1055.863892,1285.636230,1963.163696,429.001678,345.132141,352.585876,855.027405
366382,1.1575,-0.3940,43.910599,4.075100,-146.877396,7409.937500,133.949997,-2934.969971,147684.812500,-69.339996,...,190998.375000,T,8921603,4872.735840,3059.736816,1022.928284,265.291748,270.466003,294.877197,5446.465820
366391,0.9741,0.2091,-11.659000,294.090698,39.875000,7125.287109,2412.750000,405.109985,60216.011719,395.200012,...,37641.914062,T,8921673,2909.400391,2303.910400,2447.316895,426.464935,355.111664,397.183228,3911.406494


In [43]:
# New transverse-momentum column -> the (x, y) component columns it is built from.
# The dictionary order is the order in which the columns are appended.
transverseInputsData = {
    'p_PT': ('p_PX', 'p_PY'),
    'DTF_FixJPsi_p_PT': ('DTF_FixJPsi_p_PX', 'DTF_FixJPsi_p_PY'),
    'DTF_FixJPsiLambda_p_PT': ('DTF_FixJPsiLambda_p_PX', 'DTF_FixJPsiLambda_p_PY'),
    'pim_PT': ('pim_PX', 'pim_PY'),
    'DTF_FixJPsi_pim_PT': ('DTF_FixJPsi_pim_PX', 'DTF_FixJPsi_pim_PY'),
    'DTF_FixJPsiLambda_pim_PT': ('DTF_FixJPsiLambda_pim_PX', 'DTF_FixJPsiLambda_pim_PY'),
    'Jpsi_PT': ('Jpsi_PX', 'Jpsi_PY'),
}

df_Data = df_Data.assign(**{
    ptColumn: TransverseMomentumModulus(df_Data[xColumn], df_Data[yColumn])
    for ptColumn, (xColumn, yColumn) in transverseInputsData.items()
})

df_Data

Unnamed: 0,Lb_ENDVERTEX_X,Lb_ENDVERTEX_Y,Lb_ENDVERTEX_Z,L_ENDVERTEX_X,L_ENDVERTEX_Y,L_ENDVERTEX_Z,Jpsi_ENDVERTEX_X,Jpsi_ENDVERTEX_Y,Jpsi_ENDVERTEX_Z,L_PX,...,DTF_FixJPsiLambda_mup_PX,DTF_FixJPsiLambda_mup_PY,DTF_FixJPsiLambda_mup_PZ,p_PT,DTF_FixJPsi_p_PT,DTF_FixJPsiLambda_p_PT,pim_PT,DTF_FixJPsi_pim_PT,DTF_FixJPsiLambda_pim_PT,Jpsi_PT
0,0.925052,-0.678207,-9.309851,-341.755020,-530.188399,6303.036076,0.9250,-0.6782,-9.3093,-1234.491813,...,26.626278,-1088.075684,8410.094727,1838.123436,2388.441895,2002.636597,631.990184,603.600525,442.353149,4222.937881
1,0.658849,0.801886,-5.706660,270.115791,-5.721809,5951.895616,0.6578,0.8060,-5.8782,1010.379621,...,-92.054810,759.476013,41517.515625,1288.407362,564.582581,527.749817,265.160669,213.536026,159.984573,2360.397452
2,0.555028,-0.524742,62.185736,-143.153318,-236.588826,7900.188126,0.5545,-0.5252,62.1887,-394.466553,...,-1488.140503,-3170.165527,16915.718750,607.090191,310.837036,411.221008,139.097613,147.180145,146.397705,3064.827444
3,0.463447,-0.489553,-83.704683,163.413343,330.928250,7921.420039,0.4637,-0.4896,-83.7049,360.417860,...,1242.073364,666.421265,5034.672852,544.419101,257.022369,355.786957,235.416458,174.329514,117.761795,238.811089
4,0.792450,0.896142,-0.668591,280.008329,-146.317945,7849.805626,0.7924,0.8957,-0.6724,568.676267,...,-1113.175659,3486.854980,44005.027344,451.662792,639.843689,790.629761,230.630541,155.682144,201.640015,8381.333980
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5001606,0.554408,-0.320514,-0.789850,97.053856,-18.271533,5611.693538,0.5555,-0.3197,-0.8036,1223.091828,...,-936.231445,-2347.869873,24466.626953,1502.236255,313.611359,3763.508545,112.138097,467.593414,1825.850342,5091.197992
5001607,0.554568,-0.320382,-0.791936,107.030048,-26.437617,6302.106746,0.5555,-0.3197,-0.8036,1165.732118,...,-1043.608887,-2431.440674,26848.666016,1798.265754,380.220673,1913.615723,112.138097,1102.986694,1492.622314,5091.197992
5001608,0.773464,-0.018334,13.879051,-0.479735,71.897004,6058.278197,0.7732,-0.0188,13.8783,-313.798984,...,284.854431,-1285.101929,13390.396484,1156.689174,2051.073730,306.955811,288.860767,2000.975342,187.409241,635.638121
5001609,0.873826,-0.167370,24.378388,73.077364,-45.271725,6071.281366,0.8740,-0.1671,24.3759,587.754243,...,862.345825,-345.342651,12463.309570,1047.961973,519.036377,261.838898,432.656324,286.640015,391.688782,1970.210337


In [44]:
# Columns kept as classifier inputs, in the order they appear in the output files.
features = [
    # proton momentum: plain and the two DTF variants
    'p_PT', 'p_PZ',
    'DTF_FixJPsi_p_PT', 'DTF_FixJPsi_p_PZ',
    'DTF_FixJPsiLambda_p_PT', 'DTF_FixJPsiLambda_p_PZ',
    # pion momentum: plain and the two DTF variants
    'pim_PT', 'pim_PZ',
    'DTF_FixJPsi_pim_PT', 'DTF_FixJPsi_pim_PZ',
    'DTF_FixJPsiLambda_pim_PT', 'DTF_FixJPsiLambda_pim_PZ',
    # J/psi momentum
    'Jpsi_PT', 'Jpsi_PZ',
    # Lambda end-vertex position
    'L_ENDVERTEX_X', 'L_ENDVERTEX_Y', 'L_ENDVERTEX_Z',
    # Lambda / Lambda_b quantities, in pairs
    'L_BPVDIRA', 'Lb_BPVDIRA',
    'L_VFASPF_CHI2_VDOF', 'Lb_VFASPF_CHI2_VDOF',
    'L_BPVIPCHI2', 'Lb_BPVIPCHI2',
    'L_BPVVDCHI2', 'Lb_BPVVDCHI2',
    # fit status codes
    'DTF_FixJPsi_status', 'DTF_FixJPsiLambda_status',
]

## Select side bands

In [45]:
# Mass window (exclusive bounds) that defines the right sideband.
# NOTE(review): the SBL_ prefix reads like "sideband left", yet these bounds feed
# the *right* sideband frame; the naming looks swapped with the SBR_ constants.
# Confirm before renaming.
SBL_LeftBoundary = 6220
SBL_RightBoundary = 6520

# Select the rows inside the window, then draw as many of them as there are
# filtered Monte Carlo rows (fixed seed, so the draw is repeatable).
lbMassRight = df_Data['DTF_FixJPsiLambda_Lb_M']
inRightSideband = (lbMassRight > SBL_LeftBoundary) & (lbMassRight < SBL_RightBoundary)
df_Data_Sideband_Right = df_Data.loc[inRightSideband].sample(
    n=len(df_MC_Filtered), random_state=2021
)
df_Data_Sideband_Right

Unnamed: 0,Lb_ENDVERTEX_X,Lb_ENDVERTEX_Y,Lb_ENDVERTEX_Z,L_ENDVERTEX_X,L_ENDVERTEX_Y,L_ENDVERTEX_Z,Jpsi_ENDVERTEX_X,Jpsi_ENDVERTEX_Y,Jpsi_ENDVERTEX_Z,L_PX,...,DTF_FixJPsiLambda_mup_PX,DTF_FixJPsiLambda_mup_PY,DTF_FixJPsiLambda_mup_PZ,p_PT,DTF_FixJPsi_p_PT,DTF_FixJPsiLambda_p_PT,pim_PT,DTF_FixJPsi_pim_PT,DTF_FixJPsiLambda_pim_PT,Jpsi_PT
1126526,1.075703,-0.137006,-15.157787,224.975859,339.131739,6985.702232,1.0757,-0.1370,-15.1578,2013.361002,...,-1090.800171,1421.500488,24920.376953,3356.080518,3389.131836,2644.034424,569.799074,597.631775,421.796997,2042.508582
1034594,1.627742,-0.284601,-2.282267,-23.537843,-156.873968,7340.003692,1.6283,-0.2847,-2.2794,-66.766728,...,3323.115723,-2426.248779,21348.714844,450.227036,485.381927,523.213013,181.701089,157.505920,178.471100,4999.555294
1587079,1.653350,-0.445524,-122.300292,249.373958,-408.853906,7870.968193,1.6492,-0.4445,-122.2424,972.107622,...,-1436.747070,329.901672,98331.093750,1467.385109,927.536804,1105.487915,350.588452,378.592621,389.582153,3360.232189
4408798,0.771813,-0.124264,22.286017,43.368878,85.246774,5652.761535,0.7692,-0.1136,22.1992,263.297729,...,581.523438,-2218.681885,18091.923828,490.584907,675.811279,1093.922852,344.514790,67.179924,251.242126,3317.058794
4384944,0.576533,-0.271873,-32.200913,20.071114,91.142335,6107.419533,0.5767,-0.2722,-32.2002,413.491548,...,1463.650269,562.618835,36746.644531,965.602776,1009.207153,2172.920898,883.083028,607.147400,289.533691,340.775515
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4196678,0.595449,-0.156574,32.225194,-24.298713,562.286169,5965.675265,0.5953,-0.1566,32.2274,-142.495613,...,-3285.478027,806.119141,39430.429688,1441.556026,1340.340820,1368.070679,747.324288,430.763214,422.696228,5342.958601
754927,0.997380,-0.158912,66.293904,-112.495016,-149.928743,7784.940033,0.9975,-0.1588,66.2978,-450.609250,...,332.916870,-621.948120,9901.425781,620.560282,619.806946,1444.551514,99.175498,101.810478,106.451729,4511.116485
2248601,0.432278,-0.089086,-41.524266,164.375280,-1.545381,6575.789886,0.4321,-0.0886,-41.5198,2676.046557,...,-2125.011475,630.225098,125395.742188,2639.715117,1582.678955,1461.442383,32.303694,215.138733,239.545685,3530.100949
3275478,0.877242,-0.212688,-0.364476,131.051278,-99.258160,7672.501216,0.8617,-0.2124,-0.4389,1266.754409,...,1686.366943,-584.904480,8773.979492,1596.359655,6114.136230,1634.738647,112.004206,120.055580,84.943939,3006.097530


In [46]:
# Mass window (exclusive bounds) that defines the left sideband.
# NOTE(review): the SBR_ prefix reads like "sideband right", yet these bounds feed
# the *left* sideband frame; the naming looks swapped with the SBL_ constants.
# Confirm before renaming.
SBR_LeftBoundary = 4870
SBR_RightBoundary = 5020

# Select the rows inside the window, then draw as many of them as there are
# filtered Monte Carlo rows (fixed seed, so the draw is repeatable).
lbMassLeft = df_Data['DTF_FixJPsiLambda_Lb_M']
inLeftSideband = (lbMassLeft > SBR_LeftBoundary) & (lbMassLeft < SBR_RightBoundary)
df_Data_Sideband_Left = df_Data.loc[inLeftSideband].sample(
    n=len(df_MC_Filtered), random_state=2021
)
df_Data_Sideband_Left

Unnamed: 0,Lb_ENDVERTEX_X,Lb_ENDVERTEX_Y,Lb_ENDVERTEX_Z,L_ENDVERTEX_X,L_ENDVERTEX_Y,L_ENDVERTEX_Z,Jpsi_ENDVERTEX_X,Jpsi_ENDVERTEX_Y,Jpsi_ENDVERTEX_Z,L_PX,...,DTF_FixJPsiLambda_mup_PX,DTF_FixJPsiLambda_mup_PY,DTF_FixJPsiLambda_mup_PZ,p_PT,DTF_FixJPsi_p_PT,DTF_FixJPsiLambda_p_PT,pim_PT,DTF_FixJPsi_pim_PT,DTF_FixJPsiLambda_pim_PT,Jpsi_PT
4854321,1.025634,-0.995198,10.555465,-94.739334,27.432672,5874.240586,1.0256,-0.9952,10.5554,-503.703289,...,2013.540894,-2159.074463,45040.546875,1011.112146,449.623016,286.203613,508.376245,193.596329,54.390614,4264.425715
488039,0.875858,-0.241867,-136.556101,85.784955,-616.228194,7241.132979,0.8758,-0.2418,-136.5548,7.630995,...,734.763367,499.157959,8999.548828,932.065908,1531.922607,900.455566,373.881449,329.893311,159.010071,1461.652113
4049149,0.394853,-0.140394,-43.587299,64.187916,-52.655281,6090.351750,0.4028,-0.1407,-43.6282,587.212745,...,-3034.291016,180.406616,15588.925781,910.983370,221.013229,255.532974,422.856214,464.205444,94.119987,3105.010496
1965493,1.195369,0.481556,63.381496,180.543492,528.280287,7876.499376,1.1956,0.4814,63.3815,385.727246,...,1737.446655,-219.117844,66079.921875,810.903686,1234.268555,809.132263,218.461008,233.104568,199.973312,1222.251880
133733,-0.508717,-0.969788,62.844895,-129.704597,-77.531589,7506.636075,-0.5102,-0.9705,62.8556,-815.467629,...,-1962.109985,-1016.836670,14194.675781,652.599508,700.601990,699.935059,275.328631,234.532043,241.315536,3175.784717
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2939249,1.278807,-1.866384,-20.995116,85.377618,320.578857,6883.447600,1.2780,-1.8642,-20.9862,209.706270,...,-1681.072998,4569.445312,18649.787109,464.476933,639.475647,459.065643,492.425258,259.103912,129.050705,5469.555998
3471080,0.933425,-0.269622,14.014097,291.778894,230.097756,6843.272529,0.9327,-0.2687,14.0038,1076.600608,...,2383.926025,-2685.153076,31153.384766,1089.370351,825.594849,830.520325,225.497806,283.017944,284.561188,3378.574845
158757,0.469227,-0.087836,-40.715463,16.552392,-131.210032,7327.500368,0.4711,-0.0886,-40.7310,117.856939,...,-911.072693,324.618744,6958.228027,559.328152,436.173035,453.197083,185.230628,177.812653,165.436432,3045.272520
4785129,2.519626,0.410314,65.612693,44.219642,114.973016,6885.832690,2.5196,0.4103,65.6123,322.949336,...,491.637482,1943.229126,29967.322266,894.606531,751.222168,863.118896,198.901387,144.624969,146.701172,3755.043841


# Select training and testing data
Take all the data (assuming the signal and sideband samples are comparably large). Shuffle it with `pandas.DataFrame.sample`.

## Signal data (type 1)

In [47]:
# Shuffle every filtered Monte Carlo row (fixed seed) and label it as signal, TYPE = 1.
df_signal = df_MC_Filtered.sample(frac=1, random_state=98).assign(TYPE=1)
df_signal

Unnamed: 0_level_0,MCTRUTH_Lb_ENDVERTEX_X,MCTRUTH_Lb_ENDVERTEX_Y,MCTRUTH_Lb_ENDVERTEX_Z,MCTRUTH_L_ENDVERTEX_X,MCTRUTH_L_ENDVERTEX_Y,MCTRUTH_L_ENDVERTEX_Z,MCTRUTH_p_PX,MCTRUTH_p_PY,MCTRUTH_p_PZ,MCTRUTH_pim_PX,...,TrackType_y,MC_key,p_PT,DTF_FixJPsi_p_PT,DTF_FixJPsiLambda_p_PT,pim_PT,DTF_FixJPsi_pim_PT,DTF_FixJPsiLambda_pim_PT,Jpsi_PT,TYPE
Rec_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
352003,0.9900,0.0169,3.300000,116.388496,30.117701,6282.886719,1290.660034,252.080002,66415.468750,197.050003,...,T,8575751,1499.261230,1449.125122,1339.061157,142.576462,222.223770,239.662476,3483.928467,1
265235,1.0184,0.1609,39.636501,962.438599,463.805511,7189.831543,8050.549805,3783.229980,59438.179688,1006.159973,...,T,6484636,6531.696777,7950.545410,7250.666016,1098.382568,1173.632935,1083.654663,8100.985840,1
148181,0.7204,-0.3519,57.459000,-193.378693,-403.258698,6942.271973,-1085.930054,-2357.689941,41237.300781,-240.050003,...,T,3663879,1195.592285,2846.242432,2590.605225,600.855042,531.966919,474.126373,314.042480,1
142519,3.3923,0.6250,-4.199400,938.192383,191.425995,5742.935547,10825.129883,2139.020020,66201.109375,1162.140015,...,T,3529217,8357.870117,13019.551758,10886.081055,954.039429,1251.309082,1106.244019,6472.403320,1
242204,0.8681,1.4937,8.098100,-151.000793,112.281601,4932.158691,-1066.640015,852.979980,33873.550781,-219.490005,...,T,5933505,931.300659,1355.522217,1152.498901,812.275085,194.003128,289.336945,4190.266602,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
254053,0.8669,0.1186,-20.466700,-109.663101,-47.713299,7491.895508,-818.690002,-339.369995,49225.160156,-17.280001,...,T,6216172,736.067566,905.189636,866.862000,41.733448,28.566000,29.015322,2405.740723,1
82685,1.2139,-0.9886,80.126602,256.224915,-405.731506,6392.206543,2266.000000,-3699.520020,58149.429688,662.109985,...,T,2048916,4044.346680,4247.795410,4354.338379,1076.822876,1121.902100,1156.929077,2567.075439,1
325358,1.3912,0.3859,82.847298,543.844421,-61.370899,5650.757324,2838.750000,-257.529999,29827.970703,657.669983,...,T,7930482,3212.848877,2960.609863,2977.199707,300.460205,564.119141,571.568848,4692.720215,1
300097,1.1159,0.2578,-50.207901,101.410599,128.787003,6285.319336,1082.810059,1308.699951,65906.679688,45.439999,...,T,7324779,2020.588989,1700.565674,1814.599121,230.976440,145.316101,162.045761,8384.406250,1


## Sidebands (type 0)

In [49]:
# Stack both sidebands under a fresh 0..n-1 index, shuffle every row (fixed seed)
# and label it as background, TYPE = 0.
df_background = (
    pd.concat([df_Data_Sideband_Left, df_Data_Sideband_Right], ignore_index=True)
    .sample(frac=1, random_state=98)
    .assign(TYPE=0)
)
df_background

Unnamed: 0,Lb_ENDVERTEX_X,Lb_ENDVERTEX_Y,Lb_ENDVERTEX_Z,L_ENDVERTEX_X,L_ENDVERTEX_Y,L_ENDVERTEX_Z,Jpsi_ENDVERTEX_X,Jpsi_ENDVERTEX_Y,Jpsi_ENDVERTEX_Z,L_PX,...,DTF_FixJPsiLambda_mup_PY,DTF_FixJPsiLambda_mup_PZ,p_PT,DTF_FixJPsi_p_PT,DTF_FixJPsiLambda_p_PT,pim_PT,DTF_FixJPsi_pim_PT,DTF_FixJPsiLambda_pim_PT,Jpsi_PT,TYPE
28997,0.866282,-0.221824,34.603739,38.929343,126.565117,5620.179680,0.8663,-0.2218,34.6035,149.188832,...,-384.592316,12389.309570,833.286281,463.985535,237.842957,672.788322,259.747070,143.065231,4794.119579,0
129975,1.427026,0.510386,-23.665964,-168.610850,-17.262810,7674.270031,1.4297,0.5121,-23.6187,-1171.799464,...,2380.284180,59607.320312,763.231580,1043.670166,1108.065186,422.313763,304.590302,284.809479,3275.660673,0
145147,0.921230,0.217414,85.529883,-45.426792,-128.974016,7002.884417,0.9214,0.2181,85.5418,-224.499025,...,4892.035645,76342.906250,482.942805,488.910339,483.614166,274.093251,202.115372,196.090530,6901.043438,0
157201,2.855284,-0.335036,7.129014,689.720266,658.355159,6554.258786,2.8552,-0.3350,7.1284,3741.482963,...,1196.984863,14303.850586,3761.514093,1195.304321,4985.639648,822.659105,248.364349,310.453308,4223.270105,0
49192,1.124640,-0.118528,-93.832546,-87.948401,-113.124460,7132.807437,1.1245,-0.1185,-93.8324,-750.285119,...,801.278809,76830.523438,762.620454,1094.758301,936.768982,433.116147,460.151276,343.224243,2393.054952,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18497,0.774348,-0.223688,8.946505,332.556518,-185.159134,7067.099726,0.7744,-0.2237,8.9462,746.326377,...,1698.473389,19138.736328,495.164352,435.353058,356.331024,597.461256,423.360504,192.778870,2674.005940,0
66139,0.787832,-0.218751,-4.019723,-25.128139,-104.976585,6385.352376,0.7873,-0.2194,-4.0257,32.649676,...,-135.171524,121914.484375,871.944372,722.384888,851.610535,397.351603,332.517120,82.926285,2920.482777,0
36287,0.811321,-0.259132,34.668579,302.389748,-369.444335,7955.238810,0.8112,-0.2592,34.6689,639.118251,...,-1601.757812,70303.179688,751.493409,806.047241,873.887512,290.474769,278.130737,271.613159,951.818401,0
123434,1.071882,-0.257701,-5.641630,-72.857261,109.515589,6450.655276,1.0717,-0.2580,-5.6444,-643.493274,...,-425.313934,18108.796875,887.085710,1081.949463,1301.839478,486.905991,254.658630,159.304413,3185.333459,0


## Merge signal & background
Use 90% of the data for training, 10% for testing.

In [50]:
## Row positions at 90% of each frame's length: rows before the split point are
## used for training, the remaining rows for testing.
trainFraction = 0.9
signalSplitPoint = int(len(df_signal) * trainFraction)
backgroundSplitPoint = int(len(df_background) * trainFraction)

In [51]:
# Training set: the rows before each split point, signal stacked on top of
# background, restricted to the classifier features plus the label ('TYPE')
# and the DTF_FixJPsiLambda_Lb_M column.
trainColumns = features + ['TYPE', 'DTF_FixJPsiLambda_Lb_M']
trainParts = [df_signal.iloc[:signalSplitPoint], df_background.iloc[:backgroundSplitPoint]]

# `axis` is passed by keyword: pandas >= 2.0 no longer accepts it positionally.
# NOTE(review): this removes every column that contains at least one missing
# value. The test frame is cleaned by a separate, independent call, so the two
# frames could end up with different columns. Confirm that this is intended.
df_train = pd.concat(trainParts, ignore_index=True)[trainColumns].dropna(axis='columns')
df_train

Unnamed: 0,p_PT,p_PZ,DTF_FixJPsi_p_PT,DTF_FixJPsi_p_PZ,DTF_FixJPsiLambda_p_PT,DTF_FixJPsiLambda_p_PZ,pim_PT,pim_PZ,DTF_FixJPsi_pim_PT,DTF_FixJPsi_pim_PZ,...,L_VFASPF_CHI2_VDOF,Lb_VFASPF_CHI2_VDOF,L_BPVIPCHI2,Lb_BPVIPCHI2,L_BPVVDCHI2,Lb_BPVVDCHI2,DTF_FixJPsi_status,DTF_FixJPsiLambda_status,TYPE,DTF_FixJPsiLambda_Lb_M
0,1499.261230,64090.687500,1449.125122,71109.312500,1339.061157,67585.578125,142.576462,16002.674805,222.223770,15295.935547,...,12.397956,1.006837,3.118125,4.247471,8.394142e+03,839.898499,0.0,0.0,1,5649.782715
1,6531.696777,43770.664062,7950.545410,53713.472656,7250.666016,48946.476562,1098.382568,7154.821289,1173.632935,7339.285645,...,72.018669,4.932845,9.285525,0.700478,3.165763e+05,28.328979,0.0,0.0,1,5346.287598
2,1195.592285,25513.001953,2846.242432,45061.835938,2590.605225,40850.550781,600.855042,6536.190430,531.966919,6502.588379,...,58.929020,38.498478,109.721619,0.822907,4.876989e+04,969.064880,0.0,0.0,1,5625.960449
3,8357.870117,48524.460938,13019.551758,77204.125000,10886.081055,65169.863281,954.039429,8363.340820,1251.309082,8721.902344,...,4.634126,2.500692,5.822659,0.760508,2.613000e+03,11152.960938,0.0,0.0,1,5480.049316
4,931.300659,24994.496094,1355.522217,32809.832031,1152.498901,30213.080078,812.275085,7415.243652,194.003128,7669.465332,...,2.344827,9.562636,28.231209,3.574740,1.273677e+03,22431.037109,0.0,1.0,1,5446.454102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217943,445.273392,13820.158702,3972.747559,120230.015625,1337.582764,36320.218750,761.756481,3374.140357,546.493347,3185.904053,...,63.446099,1.218580,3.571386,15.079227,6.501153e+04,44.503804,0.0,1.0,0,4911.453613
217944,625.634413,11269.897699,525.380066,10684.035156,392.802063,8542.464844,145.763878,2076.179413,134.418900,2092.732178,...,28.279526,6.260448,18.039083,3.554084,2.436691e+06,13.191321,0.0,0.0,0,4925.180664
217945,1526.666540,98911.628918,1539.424927,90282.515625,1497.569336,86306.203125,606.937059,22006.441937,451.258881,21892.208984,...,0.144299,0.037113,0.103937,22.219251,9.227146e+03,34.697374,0.0,0.0,0,5015.115234
217946,3970.637721,70980.828844,2426.877930,41451.250000,1642.944458,28352.472656,357.615649,6230.024520,298.756683,6482.682129,...,45.532044,10.496439,29.901501,1.806842,1.390948e+05,2.311459,0.0,0.0,0,4909.061035


In [52]:
# Testing set: the rows from each split point onwards, signal stacked on top of
# background, restricted to the classifier features plus the label ('TYPE')
# and the DTF_FixJPsiLambda_Lb_M column.
testColumns = features + ['TYPE', 'DTF_FixJPsiLambda_Lb_M']
testParts = [df_signal.iloc[signalSplitPoint:], df_background.iloc[backgroundSplitPoint:]]

# `axis` is passed by keyword: pandas >= 2.0 no longer accepts it positionally.
# NOTE(review): this removes every column that contains at least one missing
# value. The training frame is cleaned by a separate, independent call, so the
# two frames could end up with different columns. Confirm that this is intended.
df_test = pd.concat(testParts, ignore_index=True)[testColumns].dropna(axis='columns')
df_test

Unnamed: 0,p_PT,p_PZ,DTF_FixJPsi_p_PT,DTF_FixJPsi_p_PZ,DTF_FixJPsiLambda_p_PT,DTF_FixJPsiLambda_p_PZ,pim_PT,pim_PZ,DTF_FixJPsi_pim_PT,DTF_FixJPsi_pim_PZ,...,L_VFASPF_CHI2_VDOF,Lb_VFASPF_CHI2_VDOF,L_BPVIPCHI2,Lb_BPVIPCHI2,L_BPVVDCHI2,Lb_BPVVDCHI2,DTF_FixJPsi_status,DTF_FixJPsiLambda_status,TYPE,DTF_FixJPsiLambda_Lb_M
0,2504.376465,27455.556641,2435.769775,27417.642578,2798.346191,31182.896484,416.471710,5826.424316,492.247925,5925.514648,...,0.116805,0.334404,0.975463,2.368801,8.063968e+03,1031.350098,0.0,0.0,1,5678.738770
1,3678.204834,28957.984375,8009.698242,63035.414062,4010.383301,31700.259766,511.451996,4495.157227,606.036377,5295.169922,...,129.013321,2.213622,3.035450,4.987368,1.139099e+06,132.812775,0.0,0.0,1,5152.405273
2,6151.153320,62528.351562,3390.550049,34433.933594,3191.249023,32347.107422,908.855103,8341.050781,954.033325,8586.155273,...,3.052457,1.626008,3.995702,3.794894,5.994128e+05,1733.996338,0.0,0.0,1,5610.329102
3,2215.048828,16164.611328,3509.922363,25586.275391,3299.109375,24183.917969,1217.576172,8981.918945,1129.961060,8099.306641,...,3.684547,6.168911,7.633062,1.222096,6.435177e+03,1848.107422,0.0,0.0,1,5655.911133
4,1299.116333,50831.921875,1585.503662,49852.496094,1661.818237,48686.039062,1164.087158,16436.048828,707.395447,15532.739258,...,3.191066,0.135195,0.518211,1.912595,1.069037e+03,329.424927,0.0,1.0,1,5623.918457
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24213,495.164352,12093.927433,435.353058,12360.245117,356.331024,8153.224121,597.461256,3475.246475,423.360504,2999.268066,...,264.040639,4.459330,3.390054,10.927444,5.186093e+05,125.813695,0.0,0.0,0,4905.683105
24214,871.944372,41367.721646,722.384888,38921.835938,851.610535,46724.808594,397.351603,2949.787662,332.517120,2877.707031,...,416.473876,10.440838,30.020687,2.207436,6.156371e+04,186.401299,1.0,1.0,0,4923.014160
24215,751.493409,12086.429874,806.047241,13151.909180,873.887512,14362.459961,290.474769,4747.992903,278.130737,4803.949707,...,0.471405,1.454095,4.258694,2.733795,1.741872e+05,193.227455,0.0,0.0,0,4958.273438
24216,887.085710,44114.561839,1081.949463,48514.128906,1301.839478,57984.855469,486.905991,11946.141355,254.658630,11014.551758,...,22.312675,6.983936,2.304110,175.835111,2.052599e+04,1259.390881,0.0,3.0,0,6389.684082


# Save HDF5 files

In [53]:
# Write both splits to the output folder, one file each, and report the time taken.
# mode='w' starts each file from scratch, replacing an existing one.
tickOutput = time.perf_counter()
# `key` and `mode` are passed by keyword: positional use is deprecated since
# pandas 2.1 and everything after the path becomes keyword-only in 3.0.
df_train.to_hdf(path.join(outputDirectory, outputTrain), key='LHCb_Train', mode='w')
df_test.to_hdf(path.join(outputDirectory, outputTest), key='LHCb_Test', mode='w')
tockOutput = time.perf_counter()
print(f"Output saved in {tockOutput - tickOutput:0.4f} seconds.")

Output saved in 0.1445 seconds.
