This notebook creates the `./data/train_dataset_<dataset_type>.h5` and `./data/test_dataset_<dataset_type>.h5` files, which contain the training and testing data for the classifiers.

In [1]:
import pandas as pd
import numpy as np
import time
from os import path
import pathlib

In [2]:
## Dataset flavour to build. Two options:
## 'balance': same number of training events for signal and bkg
## 'bkgskew': 90% of the signal and 90% of the bkg are used for training
##            (originally noted as ~4 mln bkg events vs ~500k signal -- TODO confirm these counts)
dataset_type = 'balance'

## If True, keep only candidates whose 'DTF_FixJPsiLambda_status' is 'Success'
## (the DTF with double mass constraint -- presumably J/psi and Lambda, TODO confirm)
dtf_success = False

In [3]:
# Both output files live in the same directory and carry the dataset flavour in their name.
output_dir = './data/'
output_train = f"{output_dir}train_dataset_{dataset_type}.h5"
output_test = f"{output_dir}test_dataset_{dataset_type}.h5"

# Fail right away if the target directory is missing, rather than after all the processing.
if not path.isdir(output_dir):
    raise NotADirectoryError(f"Directory {output_dir} does not exist.")

In [4]:
def p_modulus(px, py, pz):
    """Return the modulus of a three-vector given its Cartesian components.

    The computation is element-wise, so scalars, NumPy arrays and pandas
    Series are all accepted, as long as the three inputs are mutually compatible.
    """
    squared_modulus = px**2 + py**2 + pz**2
    return np.sqrt(squared_modulus)

def transverse_p_modulus(px, py):
    """Return the modulus of the (px, py) two-vector, i.e. the transverse component.

    The computation is element-wise, so scalars, NumPy arrays and pandas
    Series are all accepted.
    """
    squared_transverse = px**2 + py**2
    return np.sqrt(squared_transverse)

The Monte Carlo HDF5 file should contain two $n$-tuples, one named `LHCbMC_Lb` (for reconstructed events) and one named `LHCbMCTruth_Lb` (for "truth" events).

The Run 2 data HDF5 file only needs one tree, named `LHCbData`.

In [5]:
# Input HDF5 files are looked up in the 'data' folder of the user's home directory.
path_to_hd5_files = f"{pathlib.Path.home()}/data/"
input_mc = f"{path_to_hd5_files}LHCbMC_2016-2017-2018_MagUpDown_Lb2JPsiL_Ttracks_v12.h5"
input_data = f"{path_to_hd5_files}LHCbData_2016-2017-2018_MagUpDown_Dimuon_Ttracks_v2.h5"

In [6]:
# Guard against typos in the configuration: only these two dataset flavours
# are handled by the splitting logic further down.
valid_dataset_types = ('balance', 'bkgskew')
if dataset_type not in valid_dataset_types:
    raise ValueError(
        f"Unknown dataset_type {dataset_type!r}; expected one of {valid_dataset_types}."
    )

# Monte Carlo data
## Reconstructed events

In [7]:
# Reconstructed Monte Carlo candidates, stored under the 'LHCbMC_Lb' key.
# The frame is indexed by 'Rec_key' and carries an 'MC_key' column that is
# used below to join with the truth table (MC_key == -1 presumably marks
# candidates without a truth match -- TODO confirm).
df_reco = pd.read_hdf(input_mc, key='LHCbMC_Lb')
df_reco

Unnamed: 0_level_0,Lb_ENDVERTEX_X,Lb_ENDVERTEX_Y,Lb_ENDVERTEX_Z,L_ENDVERTEX_X,L_ENDVERTEX_Y,L_ENDVERTEX_Z,Jpsi_ENDVERTEX_X,Jpsi_ENDVERTEX_Y,Jpsi_ENDVERTEX_Z,L_PX,...,DTF_FixJPsiLambda_PIDSubs_p_PY,DTF_FixJPsiLambda_PIDSubs_p_PZ,DTF_FixJPsiLambda_PIDSubs_mum_PX,DTF_FixJPsiLambda_PIDSubs_mum_PY,DTF_FixJPsiLambda_PIDSubs_mum_PZ,DTF_FixJPsiLambda_PIDSubs_mup_PX,DTF_FixJPsiLambda_PIDSubs_mup_PY,DTF_FixJPsiLambda_PIDSubs_mup_PZ,TrackType,MC_key
Rec_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.952388,-0.165336,26.651770,402.777832,-229.470520,5536.814453,0.9524,-0.1653,26.652201,1910.136719,...,-186.566345,5411.963379,-804.469727,-535.944397,22858.960938,1438.071777,1619.347290,23634.302734,T,0
1,0.845292,-0.195122,3.622388,139.395874,322.585419,7428.522461,0.8453,-0.1950,3.620400,481.652618,...,222.666214,4137.976074,-605.608093,594.454895,11170.495117,71.893692,-2873.631348,73489.484375,T,29
2,-0.480003,-0.762014,18.670696,-819.340149,-626.124390,5389.564453,-0.4801,-0.7620,18.671101,-5395.257812,...,-323.442810,2628.927246,4.108163,-790.314697,5651.807129,-5579.078613,-431.151581,23151.667969,T,69
3,1.681797,-2.255294,-36.569271,113.645294,-329.222229,4678.392090,1.6818,-2.2553,-36.569199,7835.708984,...,-1702.511353,24506.517578,8712.822266,-18033.572266,231397.500000,336.531647,-3496.248779,45100.238281,T,119
4,0.824895,-0.252737,74.871048,-100.702072,60.565861,6466.218750,0.8242,-0.2542,74.883598,-74.406654,...,135.662018,8106.856445,-3448.149902,-1045.294434,90591.554688,588.637329,239.301941,10912.332031,T,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
366391,0.964920,0.161698,-12.157676,293.129822,39.016293,7097.797363,0.9649,0.1621,-12.151800,3175.616943,...,-18.306223,7767.518555,1504.345581,715.472656,49351.718750,-112.221489,2933.706299,37641.914062,T,8921673
366392,-0.498788,1.215896,21.856564,-176.847626,145.914993,8691.596680,-0.5001,1.2166,21.869101,-2664.720215,...,-50.586430,-2812.631592,-3547.666504,1885.741577,33505.289062,-1337.816284,1466.083374,45027.328125,T,8921702
366393,0.114458,-0.046962,25.276293,-29.465662,-15.364128,949.110657,0.1145,-0.0469,25.275400,-6107.898926,...,-133.646973,15975.192383,-3526.683594,-343.335175,44636.988281,-3534.635254,-3121.734619,91762.195312,T,8921772
366394,1.096764,-0.076485,-13.951857,42.977512,-134.660324,4807.495117,1.0967,-0.0764,-13.952700,-279.406677,...,-92.683861,3969.945801,1982.495117,-2486.259033,26074.431641,2165.314453,-46.072693,39433.843750,T,8921784


## Truth events

In [8]:
# Generator-level ("truth") events, stored under the 'LHCbMCTruth_Lb' key.
# The frame is indexed by 'MC_key' and carries a 'Rec_key' column
# (Rec_key == -1 presumably marks events with no reconstructed match -- TODO confirm).
df_truth = pd.read_hdf(input_mc, key='LHCbMCTruth_Lb')
df_truth

Unnamed: 0_level_0,MCTRUTH_Lb_ENDVERTEX_X,MCTRUTH_Lb_ENDVERTEX_Y,MCTRUTH_Lb_ENDVERTEX_Z,MCTRUTH_L_ENDVERTEX_X,MCTRUTH_L_ENDVERTEX_Y,MCTRUTH_L_ENDVERTEX_Z,MCTRUTH_p_PX,MCTRUTH_p_PY,MCTRUTH_p_PZ,MCTRUTH_pim_PX,...,mup_PP_X,mup_PP_Y,mup_PP_Z,mup_PP_PX,mup_PP_PY,mup_PP_PZ,mup_PP_Weight,Rec_key,N_rec_matches,TrackType
MC_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.9512,-0.1669,26.401600,343.742889,-192.429398,4751.548828,2031.640015,-1218.119995,29336.890625,445.510010,...,0.6208,-0.5355,21.231701,1448.819824,1628.266357,23763.910156,0.975000,0,1,T
1,0.4619,-0.3471,-52.081200,-19.765400,-33.384300,45.136002,-1190.050049,-1817.560059,5618.410156,-125.169998,...,0.2795,-0.4641,-50.938000,-3430.944092,-2054.681885,20444.863281,1.000000,-1,0,
2,0.8623,-0.0693,39.078201,145.114395,-70.015701,2563.685059,1349.000000,-574.190002,22276.189453,239.940002,...,0.1603,-0.3852,34.265598,1869.628418,869.200012,13080.188477,1.000000,-1,0,
3,0.8441,-0.1694,17.714300,-9.207400,27.546700,3520.887207,-295.320007,790.229980,93841.296875,3.170000,...,0.4994,-0.5171,14.525300,491.177612,513.784912,5034.924805,1.000000,-1,0,
4,0.6610,-0.4327,7.083600,-23.592199,-54.290798,201.384293,-1843.489990,-4303.959961,15525.910156,-361.470001,...,-0.0085,-0.3661,9.187500,-2067.989014,93.588974,6374.103027,1.000000,-1,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8921810,0.7382,-0.1062,0.347100,-2.454600,0.459400,68.980499,-1052.079956,169.649994,21721.480469,-270.470001,...,0.5466,-0.3809,3.584800,-1646.731689,-2497.763672,30773.214844,0.942857,-1,0,
8921811,0.6401,-0.0814,27.420200,4.920900,-10.475100,58.229698,680.200012,-1542.880005,4852.310059,153.279999,...,,,,,,,,-1,0,
8921812,2.0908,-1.0901,46.602200,36.268501,-29.632999,1175.768677,4569.680176,-3904.929932,153438.671875,582.070007,...,0.3603,0.4048,30.488100,1073.656860,-936.144836,10031.811523,0.975000,-1,0,
8921813,1.3535,1.5125,0.694100,147.764999,156.474701,1460.922485,2323.729980,2452.239990,22514.330078,227.960007,...,0.1721,1.6476,-7.021200,818.388855,-101.526367,5160.858398,0.972973,-1,0,


## Merge the dataframes

In [9]:
# Keep only truth events that point to a reconstructed candidate ...
truth_matched = df_truth.loc[df_truth['Rec_key'] >= 0]
# ... and attach the reconstructed quantities: the truth index (MC_key) is
# matched against the 'MC_key' column of the reconstructed table.
df_mc = truth_matched.merge(df_reco, left_index=True, right_on='MC_key')

# Both keys must be valid (non-negative) in the merged table.
has_valid_keys = (df_mc['MC_key'] >= 0) & (df_mc['Rec_key'] >= 0)
df_mc = df_mc.loc[has_valid_keys]
df_mc

Unnamed: 0_level_0,MCTRUTH_Lb_ENDVERTEX_X,MCTRUTH_Lb_ENDVERTEX_Y,MCTRUTH_Lb_ENDVERTEX_Z,MCTRUTH_L_ENDVERTEX_X,MCTRUTH_L_ENDVERTEX_Y,MCTRUTH_L_ENDVERTEX_Z,MCTRUTH_p_PX,MCTRUTH_p_PY,MCTRUTH_p_PZ,MCTRUTH_pim_PX,...,DTF_FixJPsiLambda_PIDSubs_p_PY,DTF_FixJPsiLambda_PIDSubs_p_PZ,DTF_FixJPsiLambda_PIDSubs_mum_PX,DTF_FixJPsiLambda_PIDSubs_mum_PY,DTF_FixJPsiLambda_PIDSubs_mum_PZ,DTF_FixJPsiLambda_PIDSubs_mup_PX,DTF_FixJPsiLambda_PIDSubs_mup_PY,DTF_FixJPsiLambda_PIDSubs_mup_PZ,TrackType_y,MC_key
Rec_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.9512,-0.1669,26.401600,343.742889,-192.429398,4751.548828,2031.640015,-1218.119995,29336.890625,445.510010,...,-186.566345,5411.963379,-804.469727,-535.944397,22858.960938,1438.071777,1619.347290,23634.302734,T,0
1,0.8237,-0.2123,3.514200,140.238602,325.837311,7463.102539,367.250000,1035.959961,24602.169922,174.279999,...,222.666214,4137.976074,-605.608093,594.454895,11170.495117,71.893692,-2873.631348,73489.484375,T,29
2,-0.4798,-0.7758,18.706800,-785.810974,-591.072998,5157.507812,-5172.310059,-3904.530029,34147.730469,-470.119995,...,-323.442810,2628.927246,4.108163,-790.314697,5651.807129,-5579.078613,-431.151581,23151.667969,T,69
3,1.6829,-2.2522,-36.520901,94.709503,-270.525391,3823.881348,5635.350098,-15984.469727,230495.453125,604.909973,...,-1702.511353,24506.517578,8712.822266,-18033.572266,231397.500000,336.531647,-3496.248779,45100.238281,T,119
5,2.2347,-3.0136,-3.515400,-41.785000,-495.398499,2890.112793,-144.789993,-1430.229980,8650.469727,-27.309999,...,-514.345398,2560.247559,2766.051025,-6544.098633,37503.421875,4405.509766,-5917.182129,27155.025391,T,202
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
366391,0.9741,0.2091,-11.659000,294.090698,39.875000,7125.287109,2412.750000,405.109985,60216.011719,395.200012,...,-18.306223,7767.518555,1504.345581,715.472656,49351.718750,-112.221489,2933.706299,37641.914062,T,8921673
366392,-0.5103,1.2180,21.928400,-148.940399,123.715599,7319.557129,-1238.500000,978.909973,57653.351562,-45.599998,...,-50.586430,-2812.631592,-3547.666504,1885.741577,33505.289062,-1337.816284,1466.083374,45027.328125,T,8921702
366393,0.1275,-0.0589,25.113001,-123.851601,-28.034100,2985.664307,-6471.830078,-1487.530029,153226.906250,-561.440002,...,-133.646973,15975.192383,-3526.683594,-343.335175,44636.988281,-3534.635254,-3121.734619,91762.195312,T,8921772
366394,1.0493,-0.0526,-14.365200,24.636101,-103.056702,3838.916504,267.079987,-1383.319946,50956.210938,70.400002,...,-92.683861,3969.945801,1982.495117,-2486.259033,26074.431641,2165.314453,-46.072693,39433.843750,T,8921784


## Apply pre-selection cuts matching data

In [10]:
# Reference J/psi mass used to centre the 'Jpsi_M' window in the pre-selection
# (presumably the PDG value in MeV/c^2 -- TODO confirm units against the ntuple).
jpsi_pdg_mass = 3096.9

In [11]:
## TODO: the "combined" invariant-mass cuts (for Lambda->p pim and Lambdab->JPsi Lambda) are still missing.
## These are the AM cuts in the DaVinci config file. Did Salvatore implement them somewhere?

# --- Final-state tracks: momentum windows (each modulus is computed once and reused)
_pion_p = p_modulus(df_mc['pim_PX'], df_mc['pim_PY'], df_mc['pim_PZ'])
pion_p_cuts = (_pion_p > 2000) & (_pion_p < 5e5)

_proton_p = p_modulus(df_mc['p_PX'], df_mc['p_PY'], df_mc['p_PZ'])
proton_p_cuts = (_proton_p > 10000) & (_proton_p < 5e5)

proton_pt_cuts = transverse_p_modulus(df_mc['p_PX'], df_mc['p_PY']) > 400

# --- Lambda candidate
lambda_m_cuts = df_mc['L_M'].gt(600) & df_mc['L_M'].lt(1500)
lambda_mm_cuts = df_mc['L_MM'].lt(1500)
lambda_z_cuts = df_mc['L_ENDVERTEX_Z'].gt(5500) & df_mc['L_ENDVERTEX_Z'].lt(8500)
lambda_dira_cuts = df_mc['L_BPVDIRA'].gt(0.9999)
lambda_ipchi2_cuts = df_mc['L_BPVIPCHI2'].lt(200)
lambda_vdchi2_cuts = df_mc['L_BPVVDCHI2'].lt(2e7)
lambda_chi2_cuts = df_mc['L_VFASPF_CHI2_VDOF'].lt(750)
lambda_pt_cuts = transverse_p_modulus(df_mc['L_PX'], df_mc['L_PY']) > 450

# --- J/psi candidate: mass window centred on the reference mass
jpsi_m_cuts = (df_mc['Jpsi_M'] - jpsi_pdg_mass).abs().lt(90)

# --- Lambda_b candidate
lambdab_m_cuts = df_mc['Lb_M'].lt(8500)
lambdab_dira_cuts = df_mc['Lb_BPVDIRA'].abs().gt(0.99)
lambdab_ipchi2_cuts = df_mc['Lb_BPVIPCHI2'].lt(1750)
lambdab_chi2_cuts = df_mc['Lb_VFASPF_CHI2_VDOF'].lt(150)

# --- Optional requirement, only applied when `dtf_success` is set
dtf_success_cuts = df_mc['DTF_FixJPsiLambda_status'].eq('Success')

In [12]:
# All the individual masks that make up the pre-selection, in the order they are AND-ed.
_selection_masks = [
    pion_p_cuts,
    proton_p_cuts,
    proton_pt_cuts,
    lambda_m_cuts,
    lambda_mm_cuts,
    lambda_z_cuts,
    lambda_dira_cuts,
    lambda_ipchi2_cuts,
    lambda_vdchi2_cuts,
    lambda_chi2_cuts,
    lambda_pt_cuts,
    jpsi_m_cuts,
    lambdab_m_cuts,
    lambdab_dira_cuts,
    lambdab_ipchi2_cuts,
    lambdab_chi2_cuts,
]

# The DTF requirement is optional and, when requested, is applied last.
if dtf_success:
    _selection_masks.append(dtf_success_cuts)

# Fold the masks together with a logical AND, left to right.
pre_selection_cuts = _selection_masks[0]
for _mask in _selection_masks[1:]:
    pre_selection_cuts = pre_selection_cuts & _mask

df_mc[pre_selection_cuts]

Unnamed: 0_level_0,MCTRUTH_Lb_ENDVERTEX_X,MCTRUTH_Lb_ENDVERTEX_Y,MCTRUTH_Lb_ENDVERTEX_Z,MCTRUTH_L_ENDVERTEX_X,MCTRUTH_L_ENDVERTEX_Y,MCTRUTH_L_ENDVERTEX_Z,MCTRUTH_p_PX,MCTRUTH_p_PY,MCTRUTH_p_PZ,MCTRUTH_pim_PX,...,DTF_FixJPsiLambda_PIDSubs_p_PY,DTF_FixJPsiLambda_PIDSubs_p_PZ,DTF_FixJPsiLambda_PIDSubs_mum_PX,DTF_FixJPsiLambda_PIDSubs_mum_PY,DTF_FixJPsiLambda_PIDSubs_mum_PZ,DTF_FixJPsiLambda_PIDSubs_mup_PX,DTF_FixJPsiLambda_PIDSubs_mup_PY,DTF_FixJPsiLambda_PIDSubs_mup_PZ,TrackType_y,MC_key
Rec_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.9512,-0.1669,26.401600,343.742889,-192.429398,4751.548828,2031.640015,-1218.119995,29336.890625,445.510010,...,-186.566345,5411.963379,-804.469727,-535.944397,22858.960938,1438.071777,1619.347290,23634.302734,T,0
1,0.8237,-0.2123,3.514200,140.238602,325.837311,7463.102539,367.250000,1035.959961,24602.169922,174.279999,...,222.666214,4137.976074,-605.608093,594.454895,11170.495117,71.893692,-2873.631348,73489.484375,T,29
8,0.5442,-0.4049,-30.064199,-237.463104,-161.438004,5029.397461,-1999.180054,-1447.800049,43812.398438,-599.280029,...,-151.918823,7139.517578,100.262436,999.515259,50033.699219,-120.737198,-995.344788,6822.337402,T,277
14,0.9904,-0.2075,-9.769600,224.561996,56.081600,5921.048828,2894.560059,699.140015,74302.851562,293.929993,...,104.645416,10220.954102,-884.967468,195.843964,34903.839844,1645.699585,-1113.117432,24484.212891,T,367
17,0.8236,-0.2738,28.986401,-296.374695,-100.123802,5963.561523,-1268.760010,-522.340027,25478.310547,-184.460007,...,20.153040,3543.737305,-829.315674,-926.117676,37815.328125,2614.833252,-1553.520630,63936.839844,T,493
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
366374,1.9930,-0.9682,33.883499,454.162109,-444.944794,6047.499512,7053.950195,-6800.229980,92633.132812,949.369995,...,-1059.302368,13806.107422,2310.392578,-2437.838867,42988.765625,2057.492432,-877.707703,10357.686523,T,8921400
366378,1.2187,-0.0757,-10.080700,120.471603,-69.038300,7221.361328,1256.689941,-628.059998,71476.546875,189.229996,...,-259.237457,19192.976562,1663.037476,477.534760,14450.500000,-812.013245,-604.744568,27734.162109,T,8921503
366382,1.1575,-0.3940,43.910599,4.075100,-146.877396,7409.937500,133.949997,-2934.969971,147684.812500,-69.339996,...,-295.752747,15403.834961,196.509079,-679.625732,6668.261719,3547.472900,-3286.872803,190998.375000,T,8921603
366391,0.9741,0.2091,-11.659000,294.090698,39.875000,7125.287109,2412.750000,405.109985,60216.011719,395.200012,...,-18.306223,7767.518555,1504.345581,715.472656,49351.718750,-112.221489,2933.706299,37641.914062,T,8921673


In [13]:
# Signal sample: pre-selected MC candidates, shuffled with a fixed seed so that
# the train/test split done later by position is reproducible.
df_signal = (
    df_mc
    .loc[pre_selection_cuts]
    .sample(frac=1, random_state=2022)
)

# The full merged MC table is no longer needed: release it.
del df_mc

df_signal

Unnamed: 0_level_0,MCTRUTH_Lb_ENDVERTEX_X,MCTRUTH_Lb_ENDVERTEX_Y,MCTRUTH_Lb_ENDVERTEX_Z,MCTRUTH_L_ENDVERTEX_X,MCTRUTH_L_ENDVERTEX_Y,MCTRUTH_L_ENDVERTEX_Z,MCTRUTH_p_PX,MCTRUTH_p_PY,MCTRUTH_p_PZ,MCTRUTH_pim_PX,...,DTF_FixJPsiLambda_PIDSubs_p_PY,DTF_FixJPsiLambda_PIDSubs_p_PZ,DTF_FixJPsiLambda_PIDSubs_mum_PX,DTF_FixJPsiLambda_PIDSubs_mum_PY,DTF_FixJPsiLambda_PIDSubs_mum_PZ,DTF_FixJPsiLambda_PIDSubs_mup_PX,DTF_FixJPsiLambda_PIDSubs_mup_PY,DTF_FixJPsiLambda_PIDSubs_mup_PZ,TrackType_y,MC_key
Rec_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
112389,0.8821,-0.2157,-6.743900,297.045502,78.944801,5679.820801,2558.719971,587.020020,49348.621094,484.070007,...,224.009979,8847.545898,1099.589233,-517.493103,35295.410156,1671.059082,-1277.145508,11006.719727,T,2780866
18231,0.9552,-0.2070,41.641899,132.634506,66.710297,6339.666016,733.700012,432.779999,32919.890625,51.650002,...,-19.903521,5191.637695,2464.527832,-510.988953,46047.511719,2327.100830,-3238.740723,38406.609375,T,453921
281704,0.3725,0.2267,-60.226700,-441.861694,332.432587,7507.066895,-1959.140015,1455.089966,34633.648438,-542.260010,...,519.815796,10085.016602,-11408.463867,1619.126221,154854.328125,-1493.943848,-852.710754,18152.908203,T,6880137
208614,0.8608,-0.1570,13.512500,-433.567810,-53.026100,7750.630371,-1477.369995,-161.389999,24538.470703,-128.240005,...,-32.593716,3940.593994,2322.117432,-827.881531,26163.285156,-333.785706,543.608398,29025.000000,T,5117301
266697,0.4327,-0.0822,-13.252900,-486.890503,59.396400,6952.131836,-2271.629883,289.989990,33568.578125,-709.809998,...,18.994507,13547.239258,-2543.992188,-2103.184082,43431.542969,914.409546,-654.071777,18815.972656,T,6520075
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259490,0.7749,0.0887,-36.664200,-400.696106,-74.323997,7553.503906,-6849.549805,-1363.109985,129408.718750,-958.640015,...,-90.226952,17637.841797,-1674.922729,552.791931,59144.449219,-3493.708740,344.834015,36964.968750,T,6347623
326638,0.9178,-1.8292,-38.851898,1.330300,-58.604198,735.571472,-2.010000,-2524.560059,34554.949219,-30.059999,...,-199.092392,2646.080566,1062.902466,-1994.391846,117534.156250,-855.403870,-854.333740,16566.935547,T,7959489
211142,0.8022,0.8332,-40.730801,-112.103302,141.206299,6570.481934,-633.979980,747.679993,32843.269531,-64.300003,...,96.541519,9408.616211,-974.564636,1290.845215,8331.611328,1279.404541,3380.212402,42173.265625,T,5179522
73778,5.5543,0.1213,70.668404,303.459900,22.707899,5364.738770,5143.390137,318.119995,91893.617188,591.059998,...,116.123955,9742.116211,2276.984375,-941.543762,17139.707031,4378.870605,1429.342041,58055.687500,T,1827933


# Real data

In [14]:
# Real-data candidates, stored under the 'LHCbData' key; the background sample
# is extracted from the Lb-mass sidebands of this table below.
df_data = pd.read_hdf(input_data, key='LHCbData')

In [15]:
# Sideband-definition constants: reference Lb mass, distance of each sideband
# from it and the sideband widths (same units as 'DTF_FixJPsiLambda_Lb_M',
# presumably MeV/c^2 -- TODO confirm).
# NOTE(review): `width_low` / `width_high` are not referenced in the visible cells;
# the sideband cell defines its own `width_left` / `width_right` with the same values.
Lb_mass = 5620.2
distance = 600
width_low = 150
width_high = 300

In [16]:
# Base mask: either require a successful DTF fit or accept every candidate.
if dtf_success:
    data_cuts = df_data['DTF_FixJPsiLambda_status'] == 'Success'
else:
    data_cuts = pd.Series(True, index=df_data.index)

# Sideband geometry, in the units of 'DTF_FixJPsiLambda_Lb_M':
# two windows placed `distance` away from the reference Lb mass, on either side.
Lb_mass_pdg = 5620.2
distance = 600
width_left = 150
width_right = 300

# Use the constants defined in this cell only, so that the mask does not
# silently depend on a same-valued variable defined in another cell.
lb_mass_dtf = df_data['DTF_FixJPsiLambda_Lb_M']
sideband_left = (
    (lb_mass_dtf > Lb_mass_pdg - distance - width_left)
    & (lb_mass_dtf < Lb_mass_pdg - distance)
)
sideband_right = (
    (lb_mass_dtf > Lb_mass_pdg + distance)
    & (lb_mass_dtf < Lb_mass_pdg + distance + width_right)
)
sideband_cuts = data_cuts & (sideband_left | sideband_right)

df_data[sideband_cuts]

Unnamed: 0,Lb_ENDVERTEX_X,Lb_ENDVERTEX_Y,Lb_ENDVERTEX_Z,L_ENDVERTEX_X,L_ENDVERTEX_Y,L_ENDVERTEX_Z,Jpsi_ENDVERTEX_X,Jpsi_ENDVERTEX_Y,Jpsi_ENDVERTEX_Z,L_PX,...,DTF_FixJPsiLambda_PIDSubs_p_PX,DTF_FixJPsiLambda_PIDSubs_p_PY,DTF_FixJPsiLambda_PIDSubs_p_PZ,DTF_FixJPsiLambda_PIDSubs_mum_PX,DTF_FixJPsiLambda_PIDSubs_mum_PY,DTF_FixJPsiLambda_PIDSubs_mum_PZ,DTF_FixJPsiLambda_PIDSubs_mup_PX,DTF_FixJPsiLambda_PIDSubs_mup_PY,DTF_FixJPsiLambda_PIDSubs_mup_PZ,TrackType
0,0.632911,0.178342,5.049691,-255.844345,-56.212456,5932.708984,0.6329,0.1792,5.047500,-2260.323975,...,-388.787109,-100.817314,6799.227051,-793.328735,54.095398,4428.873047,552.359741,-1187.884155,58499.570312,T
3,0.974854,-0.085595,20.542496,-32.013988,97.482521,6381.826660,0.9740,-0.0862,20.537600,-520.050232,...,106.603920,171.913254,6129.576660,1989.267334,506.550293,31564.787109,3074.307373,855.257324,15494.530273,T
46,0.837129,-0.183775,-70.188431,201.357208,2.814038,7339.723145,0.8375,-0.1849,-70.248398,1290.401733,...,51.647758,-11.291741,7394.220703,1648.590332,188.089554,99033.968750,-1446.313232,122.616882,82335.640625,T
58,0.815876,-0.315838,-35.037354,-12.199903,105.030586,7130.378418,0.8144,-0.3108,-35.123199,-396.907684,...,-17.155571,62.270897,487.921875,-471.796265,277.703583,9612.227539,1178.372437,-4969.011230,87508.968750,T
76,1.124333,-0.006395,-29.910776,-260.306274,-15.027123,6847.313477,1.1248,-0.0059,-29.914801,-1211.243530,...,-630.688354,-91.685165,8782.107422,597.138550,220.792053,14923.067383,-1910.199097,-1680.531250,15761.559570,T
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43650348,0.904755,0.144735,16.554995,39.793755,110.245758,6012.803223,0.9044,0.1401,16.525801,752.307617,...,109.084793,353.827820,14525.286133,-26.405220,559.101013,35658.375000,168.072311,2089.706055,13261.994141,T
43650354,0.781269,0.018080,-5.907313,-88.863541,-138.597076,6176.588867,0.7812,0.0174,-5.903300,-790.058350,...,-220.048416,-264.653534,8832.120117,-39.217022,-1455.853638,9868.663086,-602.774780,693.663513,34186.746094,T
43650379,0.762093,0.419868,-2.965555,60.165688,96.357040,6660.636719,0.7621,0.4200,-2.965100,671.929871,...,42.575504,333.228882,15513.498047,-931.478455,863.817444,9631.739258,-130.733292,1424.853027,71398.562500,T
43650386,-0.071914,-0.328355,1.382966,-175.408615,-25.321419,6430.763672,-0.0718,-0.3283,1.381400,-960.433411,...,-124.437691,-64.283607,10632.637695,569.004272,-507.519562,8445.436523,-3965.748779,-1870.158081,61456.214844,T


In [17]:
# Background sample: sideband candidates from real data, shuffled with a fixed
# seed so that the train/test split done later by position is reproducible.
df_bkg = (
    df_data
    .loc[sideband_cuts]
    .sample(frac=1, random_state=2022)
)

# The full data table is large and no longer needed: release it.
del df_data

df_bkg

Unnamed: 0,Lb_ENDVERTEX_X,Lb_ENDVERTEX_Y,Lb_ENDVERTEX_Z,L_ENDVERTEX_X,L_ENDVERTEX_Y,L_ENDVERTEX_Z,Jpsi_ENDVERTEX_X,Jpsi_ENDVERTEX_Y,Jpsi_ENDVERTEX_Z,L_PX,...,DTF_FixJPsiLambda_PIDSubs_p_PX,DTF_FixJPsiLambda_PIDSubs_p_PY,DTF_FixJPsiLambda_PIDSubs_p_PZ,DTF_FixJPsiLambda_PIDSubs_mum_PX,DTF_FixJPsiLambda_PIDSubs_mum_PY,DTF_FixJPsiLambda_PIDSubs_mum_PZ,DTF_FixJPsiLambda_PIDSubs_mup_PX,DTF_FixJPsiLambda_PIDSubs_mup_PY,DTF_FixJPsiLambda_PIDSubs_mup_PZ,TrackType
31624792,0.914221,0.126998,-61.998459,192.521027,382.033691,7895.950684,0.9144,0.1271,-61.997601,348.714844,...,173.481735,728.819458,12571.287109,4380.675781,858.719238,41331.640625,1333.038818,1256.502197,6836.887695,T
20569615,0.666606,-0.069654,44.093254,-50.867920,6.514143,5954.807129,0.6656,-0.0692,44.097000,-755.508179,...,-104.061653,1.172514,5631.541016,-7074.285645,2511.462402,25146.677734,-1045.996826,-332.863800,8676.358398,T
5495933,-0.500642,0.173467,-1.718556,276.633972,-146.683411,8038.015625,-0.4970,0.1733,-1.733100,535.541626,...,271.833038,-358.267670,11733.443359,-1764.395020,253.079468,6924.846680,-655.287170,264.566803,26214.822266,T
34660792,0.453902,-0.106127,-21.121475,224.941757,-1006.443848,6799.378418,0.4539,-0.1061,-21.121500,407.494598,...,171.531052,-245.066559,1514.048340,447.359253,-1154.033081,5923.540527,-584.567139,175.146088,33261.527344,T
17650561,0.866350,-0.021333,31.028521,-11.588667,120.963303,7283.841797,0.8688,-0.0210,31.043600,-53.912380,...,-215.224670,368.756714,22802.812500,-213.089478,-460.232880,12219.019531,3531.126953,616.286865,22303.433594,T
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4691035,-0.430749,0.834396,-1.383964,8.312443,-92.761360,6538.767090,-0.4300,0.8338,-1.390100,428.208893,...,-153.813293,-272.943176,20270.667969,-1022.355225,892.190186,8700.723633,808.107727,-491.296936,32091.068359,T
23527424,0.694732,0.203607,68.214272,-213.639313,-82.388161,7073.303711,0.6949,0.2040,68.217598,-882.209167,...,-88.791466,-60.168125,5621.769531,762.355164,1469.601318,11291.772461,-2110.070312,3267.871582,52174.941406,T
1576401,0.681497,-0.482057,21.586573,-130.047714,2.067821,7356.126953,0.6823,-0.4835,21.620199,-485.701691,...,-51.932144,-3.735431,10155.750000,975.775696,-1035.578369,26516.400391,-1616.699219,-3078.315186,135885.281250,T
30842009,0.939074,0.228321,-67.928452,-16.723248,-112.167465,7398.245117,0.9395,0.2293,-67.896004,-105.834312,...,117.094589,-406.433228,26631.638672,329.593475,-811.027710,40508.128906,957.468994,2804.579102,91441.570312,T


# Add derived variables

In [18]:
def add_derived_variables(df: pd.DataFrame) -> None:
    """Add transverse-momentum columns and numeric DTF status codes to `df`, in place.

    New columns: '<prefix>p_PT' and '<prefix>pim_PT' for the prefixes '',
    'DTF_FixJPsi_' and 'DTF_FixJPsiLambda_', plus 'Jpsi_PT'. Each is computed
    from the matching '_PX' / '_PY' columns, which must exist in `df`.

    The string values of 'DTF_FixJPsi_status' and 'DTF_FixJPsiLambda_status'
    are replaced by floats: 'Success' -> 0.0, 'Failed' -> 1.0, 'NonConverged' -> 3.0.

    Returns nothing: the frame passed by the caller is modified.
    """
    # Same column order as writing the assignments out one by one:
    # all proton variants first, then all pion variants.
    for particle in ('p', 'pim'):
        for prefix in ('', 'DTF_FixJPsi_', 'DTF_FixJPsiLambda_'):
            stem = f'{prefix}{particle}'
            df[f'{stem}_PT'] = transverse_p_modulus(df[f'{stem}_PX'], df[f'{stem}_PY'])

    df['Jpsi_PT'] = transverse_p_modulus(df['Jpsi_PX'], df['Jpsi_PY'])

    # Numeric encoding of the fit status strings
    status_codes = {
        'Success': 0.0,
        'Failed': 1.0,
        'NonConverged': 3.0
    }
    for status_column in ('DTF_FixJPsi_status', 'DTF_FixJPsiLambda_status'):
        df.replace({status_column: status_codes}, inplace=True)

In [19]:
# Both samples are modified in place and must receive the same derived columns,
# since they are concatenated and sliced with a common column list later on.
add_derived_variables(df_signal)
add_derived_variables(df_bkg)

In [20]:
# Class label stored in the 'TYPE' column: 1 for signal (MC), 0 for background (data sidebands).
df_signal['TYPE'] = 1
df_bkg['TYPE'] = 0

In [21]:
# Columns kept in the output files (in addition to 'TYPE' and 'DTF_FixJPsiLambda_Lb_M',
# which are appended when the train/test frames are built).
features = [
    # Proton momentum: as reconstructed and from the two DTF variants
    'p_PT',
    'p_PZ',
    'DTF_FixJPsi_p_PT',
    'DTF_FixJPsi_p_PZ',
    'DTF_FixJPsiLambda_p_PT',
    'DTF_FixJPsiLambda_p_PZ',
    # Pion momentum: as reconstructed and from the two DTF variants
    'pim_PT',
    'pim_PZ',
    'DTF_FixJPsi_pim_PT',
    'DTF_FixJPsi_pim_PZ',
    'DTF_FixJPsiLambda_pim_PT',
    'DTF_FixJPsiLambda_pim_PZ',
    # J/psi momentum
    'Jpsi_PT',
    'Jpsi_PZ',
    # Lambda end-vertex position
    'L_ENDVERTEX_X',
    'L_ENDVERTEX_Y',
    'L_ENDVERTEX_Z',
    # Lambda / Lambda_b quantities also used in the pre-selection (plus Lb_BPVVDCHI2)
    'L_BPVDIRA',
    'Lb_BPVDIRA',
    'L_VFASPF_CHI2_VDOF',
    'Lb_VFASPF_CHI2_VDOF',
    'L_BPVIPCHI2',
    'Lb_BPVIPCHI2',
    'L_BPVVDCHI2',
    'Lb_BPVVDCHI2',
    # DTF fit status, numerically encoded by add_derived_variables
    'DTF_FixJPsi_status',
    'DTF_FixJPsiLambda_status'
]

## Merge signal & background
For the `balance` dataset, use 90% of the signal events and an equal number of sideband events for training, and all the remaining events for testing.
For the `bkgskew` dataset, use 90% of both samples for training and the remaining 10% for testing.
Simple enough, right?

In [22]:
## Splitting points at 90% of the DF length
sig_split = int(len(df_signal) * 0.9)
if dataset_type == 'balance':
    bkg_split = sig_split
else:
    bkg_split = int(len(df_bkg) * 0.9)
    
print("Splitting signal at index", sig_split)
print("Splitting signal at index", bkg_split)

Splitting signal at index 72649
Splitting signal at index 72649


In [24]:
# Training sample: the first `sig_split` signal events followed by the first
# `bkg_split` background events, restricted to the features, the class label
# and the DTF Lb mass.
train_columns = features + ['TYPE', 'DTF_FixJPsiLambda_Lb_M']
df_train = pd.concat(
    [df_signal.iloc[:sig_split], df_bkg.iloc[:bkg_split]], ignore_index=True
)[train_columns]

# Drop every column containing at least one missing value. `axis` must be passed
# by keyword: it is keyword-only in pandas >= 2.0.
# NOTE(review): columns are dropped independently for the train and test frames,
# so the two files can end up with different column sets -- verify downstream.
df_train = df_train.dropna(axis='columns')
df_train

Unnamed: 0,p_PT,p_PZ,DTF_FixJPsi_p_PT,DTF_FixJPsi_p_PZ,DTF_FixJPsiLambda_p_PT,DTF_FixJPsiLambda_p_PZ,pim_PT,pim_PZ,DTF_FixJPsi_pim_PT,DTF_FixJPsi_pim_PZ,...,L_VFASPF_CHI2_VDOF,Lb_VFASPF_CHI2_VDOF,L_BPVIPCHI2,Lb_BPVIPCHI2,L_BPVVDCHI2,Lb_BPVVDCHI2,DTF_FixJPsi_status,DTF_FixJPsiLambda_status,TYPE,DTF_FixJPsiLambda_Lb_M
0,2391.007324,53119.367188,3109.104248,55021.527344,2674.173096,50463.800781,989.212524,9728.215820,395.459991,9636.396484,...,1.999228,2.617834,7.783529,0.275361,9.396670e+02,43.821186,0.0,0.0,1,5642.995117
1,665.045532,33222.363281,858.999207,33044.460938,918.579163,35519.261719,264.080444,4548.199707,58.953770,4661.238281,...,3.834748,0.764857,2.052681,3.653825,1.111560e+04,43.061302,0.0,1.0,1,5725.535645
2,4809.560059,69587.210938,2526.715576,35863.093750,2477.713379,35118.335938,715.287170,8111.864746,718.622498,8492.106445,...,1.598892,4.070712,7.986773,15.390973,1.534330e+05,236.864914,0.0,0.0,1,5649.050781
3,787.348022,13282.785156,1318.262573,21621.804688,1366.567871,22419.171875,181.239487,4025.402100,139.180206,4125.206055,...,0.835794,0.104946,0.137557,1.657936,1.895357e+04,3631.897705,0.0,0.0,1,5532.559570
4,2990.258789,42095.085938,2862.283447,39731.078125,2839.418701,39281.101562,662.368652,9727.785156,620.543213,9689.310547,...,1.805324,0.836892,2.284361,1.377249,3.258634e+06,1370.535400,0.0,0.0,1,5886.123047
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145293,805.121277,23507.595703,814.090576,23055.335938,1359.963013,37917.859375,552.082092,24046.093750,506.498505,22791.193359,...,95.045044,1.236983,3.500154,66.592827,8.296238e+06,1136.414185,0.0,1.0,0,4926.012695
145294,653.301514,19669.052734,403.274017,18379.828125,336.683594,16323.865234,360.564453,8928.512695,84.689331,8790.658203,...,1.846003,25.181692,72.114159,22.795586,1.628411e+04,427.659424,0.0,0.0,0,4932.150391
145295,645.209595,13305.105469,432.491089,10379.948242,243.679337,6748.758301,239.370438,3528.149170,305.574707,4204.743652,...,38.324047,24.636105,67.084892,28.038248,3.932625e+06,120.323692,0.0,1.0,0,6270.460449
145296,815.101501,18208.404297,731.964661,16345.332031,2177.278320,49956.578125,120.630455,2712.889404,119.802505,2716.831787,...,0.025877,1.267947,0.395967,13.967916,1.833557e+05,288.725372,0.0,0.0,0,6501.849121


In [25]:
# Testing sample: everything after the split points, i.e. the signal events from
# `sig_split` onwards and the background events from `bkg_split` onwards,
# restricted to the features, the class label and the DTF Lb mass.
test_columns = features + ['TYPE', 'DTF_FixJPsiLambda_Lb_M']
df_test = pd.concat(
    [df_signal.iloc[sig_split:], df_bkg.iloc[bkg_split:]], ignore_index=True
)[test_columns]

# Drop every column containing at least one missing value. `axis` must be passed
# by keyword: it is keyword-only in pandas >= 2.0.
# NOTE(review): columns are dropped independently for the train and test frames,
# so the two files can end up with different column sets -- verify downstream.
df_test = df_test.dropna(axis='columns')
df_test

Unnamed: 0,p_PT,p_PZ,DTF_FixJPsi_p_PT,DTF_FixJPsi_p_PZ,DTF_FixJPsiLambda_p_PT,DTF_FixJPsiLambda_p_PZ,pim_PT,pim_PZ,DTF_FixJPsi_pim_PT,DTF_FixJPsi_pim_PZ,...,L_VFASPF_CHI2_VDOF,Lb_VFASPF_CHI2_VDOF,L_BPVIPCHI2,Lb_BPVIPCHI2,L_BPVVDCHI2,Lb_BPVVDCHI2,DTF_FixJPsi_status,DTF_FixJPsiLambda_status,TYPE,DTF_FixJPsiLambda_Lb_M
0,3802.969727,32496.773438,5022.233398,42028.570312,4365.903809,36884.359375,603.782715,3548.854248,384.483734,3575.219238,...,11.445946,1.518785,3.788127,0.116450,2583.435547,487.001343,0.0,1.0,1,5609.720215
1,3309.829102,29165.937500,2514.102539,21862.863281,2672.662109,23123.589844,572.437073,2176.231934,262.199677,2262.454346,...,0.093210,2.236516,1.516899,0.545056,2518.564209,727.137756,0.0,1.0,1,5720.921387
2,3945.632080,42366.277344,3935.839111,42401.468750,3960.475586,42686.105469,274.731293,3882.562012,327.349091,3843.060059,...,0.007283,1.155756,1.756956,6.017722,9607.723633,217.517792,0.0,0.0,1,5518.159180
3,2209.906006,40773.085938,2922.173096,48958.382812,2981.473145,49736.199219,918.814392,9660.885742,617.249268,9534.126953,...,0.687938,1.323909,1.660285,0.015930,5940.551758,418.925354,0.0,0.0,1,5690.572266
4,4713.348633,89298.039062,2544.720459,48355.136719,3891.671143,73182.562500,363.490723,5198.387207,367.125793,5196.061035,...,0.000517,1.014582,2.714195,1.045764,891391.312500,12.841259,0.0,0.0,1,6682.464844
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4015290,1578.699219,91183.453125,730.855469,50967.277344,527.207825,35936.726562,382.100586,18075.050781,273.884705,18706.072266,...,2.785461,6.188182,9.583981,1069.463135,15049.113281,103207.796875,0.0,0.0,0,4961.707520
4015291,920.719177,26135.546875,1465.078491,42237.433594,1331.812378,38478.769531,64.040855,6124.985352,105.621498,5821.173340,...,0.777651,22.842735,50.954292,46.082180,404028.406250,1169.987427,0.0,0.0,0,6296.701660
4015292,448.241089,22580.591797,534.955505,27879.978516,776.303284,38285.660156,43.548950,6153.875977,71.570152,6166.460938,...,36.631618,6.644543,17.403641,51.600163,229647.156250,580.023499,0.0,0.0,0,4890.033691
4015293,484.433594,33877.257812,570.967163,37443.843750,468.737030,31336.445312,262.678040,15982.551758,248.601746,15788.620117,...,4.101008,2.000728,2.358656,11.211392,174135.875000,102.474869,0.0,1.0,0,4896.677734


# Save HDF5 files

In [26]:
# Write each sample to its own HDF5 file, overwriting any existing file (mode='w').
# `key` and `mode` are passed by keyword: positional use is deprecated in recent pandas.
df_train.to_hdf(output_train, key='LHCb_Train', mode='w')
df_test.to_hdf(output_test, key='LHCb_Test', mode='w')

print("Output saved.")

Output saved.
