In [1]:
#%config InlineBackend.figure_format = "retina"

In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import pickle
import gc

#import sys


In [None]:
print('Preparing data...')

# Input pickle with the event records. An alternative input that was used
# before is 'data/event_data_sv_pad16.pkl'.
file_path = 'data/event_data_sv_pad(5_16)_merged.pkl'

# SECURITY: pickle.load can execute arbitrary code stored in the file.
# Only point file_path at files produced by a trusted pipeline.
with open(file_path, 'rb') as file:
    event_data = pickle.load(file)

df = pd.DataFrame(event_data)

# Release the reference to the raw loaded object as soon as the DataFrame
# exists, then ask the collector to reclaim it.
del event_data
gc.collect()

df.head()

In [3]:
# 80/20 train/validation split with a fixed seed so the split is reproducible.
# NOTE(review): the split is stratified on 'jet_btag' while the subsets below
# are selected on 'hadron_flav' -- confirm this is intended.
df_train, df_val = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['jet_btag'],
)

# Per-flavour views of each split, selected on 'hadron_flav':
# 0 -> *_l, 4 -> *_c, 5 -> *_b (plotted as 'udsg', 'c' and 'b' below).
train_flavour = df_train['hadron_flav']
df_train_l = df_train.loc[train_flavour.eq(0)]
df_train_c = df_train.loc[train_flavour.eq(4)]
df_train_b = df_train.loc[train_flavour.eq(5)]

val_flavour = df_val['hadron_flav']
df_val_l = df_val.loc[val_flavour.eq(0)]
df_val_c = df_val.loc[val_flavour.eq(4)]
df_val_b = df_val.loc[val_flavour.eq(5)]



In [None]:
############################ Jet Track and SV len Features ############################
# Train-vs-validation histograms of the per-jet track and SV multiplicities,
# split by flavour. The figure is written as a PDF and pickled for later reuse.

features = [
    'jet_track_count', 'jet_sv_count'
]

feature_names = [
    'Number of Particles in a Jet', 'Number of SV in a Jet'
]

# Number of points handed to np.linspace per feature: bins[i] edges,
# i.e. bins[i] - 1 histogram bins.
bins = [55, 12]

# Not used in this cell; kept in case other cells read `cmap`.
cmap = matplotlib.colormaps.get_cmap('Spectral')
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))

# (legend label, frame, colour, extra hist kwargs) in drawing order:
# dashed outlines for the training split, then faint fills for validation.
train_style = {'linestyle': '--', 'histtype': 'step'}
val_style = {'alpha': 0.1}
samples = [
    ('train udsg', df_train_l, 'red', train_style),
    ('train c', df_train_c, 'green', train_style),
    ('train b', df_train_b, 'blue', train_style),
    ('val udsg', df_val_l, 'red', val_style),
    ('val c', df_val_c, 'green', val_style),
    ('val b', df_val_b, 'blue', val_style),
]

for i, feature in enumerate(features):
    # The column holds one value per jet, so read it directly instead of
    # walking the frame row by row with iterrows().
    values = [frame[feature].tolist() for _, frame, _, _ in samples]

    # Shared bin edges span the train-udsg range (values[0]); entries of the
    # other samples outside that range fall outside every bin.
    binning = np.linspace(min(values[0]), max(values[0]), bins[i])

    ax = axes[i]
    for (label, _, colour, style), sample_values in zip(samples, values):
        ax.hist(sample_values, density=True, bins=binning, label=label, color=colour, **style)

    ax.set_xlabel(feature_names[i], fontsize=20)
    ax.set_ylabel('p.d.f.', fontsize=20)
    ax.set_yscale('log')
    ax.legend()

fig.tight_layout(rect=[0, 0, 1, 0.99])
# Save before plt.show(), while `fig` is still the current pyplot figure.
plt.savefig("plots/histo/jet_sv_track_figure.pdf")
plt.show()

with open('plots/histo/jet_sv_track_figure.pkl', 'wb') as f:
    pickle.dump(fig, f)


In [None]:
############################ Jet Features ############################
# Train-vs-validation histograms of the per-jet features, split by flavour,
# on a 2x3 grid. The figure is written as a PDF and pickled for later reuse.

features = [
    'jet_pt', 'jet_mass', 'jet_eta', 'jet_phi',
    'jet_track_count', 'jet_sv_count'
]

# Raw strings: '\e' and '\p' are not valid escape sequences in normal string
# literals and trigger a warning on recent Python versions.
feature_names = [
    r'Jet $p_T$', r'Jet Mass', r'Jet $\eta$', r'Jet $\phi$',
    r'Number of Particles in a Jet', r'Number of SV in a Jet'
]

# Displayed x-range per feature (applied with set_xlim below).
xmin = [1, 0, -2.5, -3, 1, 1]
xmax = [1500, 200, 2.5, 3, 50, 12]
# Number of points handed to np.linspace per feature: bins[i] edges,
# i.e. bins[i] - 1 histogram bins.
bins = [30, 30, 30, 30, 20, 6]

# Not used in this cell; kept in case other cells read `cmap`.
cmap = matplotlib.colormaps.get_cmap('Spectral')
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(20, 10))

# (legend label, frame, colour, extra hist kwargs) in drawing order:
# dashed outlines for the training split, then faint fills for validation.
train_style = {'linestyle': '--', 'histtype': 'step'}
val_style = {'alpha': 0.1}
samples = [
    ('train udsg', df_train_l, 'red', train_style),
    ('train c', df_train_c, 'green', train_style),
    ('train b', df_train_b, 'blue', train_style),
    ('val udsg', df_val_l, 'red', val_style),
    ('val c', df_val_c, 'green', val_style),
    ('val b', df_val_b, 'blue', val_style),
]

for i, feature in enumerate(features):
    # The column holds one value per jet, so read it directly instead of
    # walking the frame row by row with iterrows().
    values = [frame[feature].tolist() for _, frame, _, _ in samples]

    # Shared bin edges span the train-udsg range (values[0]); entries of the
    # other samples outside that range fall outside every bin.
    binning = np.linspace(min(values[0]), max(values[0]), bins[i])

    ax = axes[i // 3, i % 3]
    for (label, _, colour, style), sample_values in zip(samples, values):
        ax.hist(sample_values, density=True, bins=binning, label=label, color=colour, **style)

    ax.set_xlim([xmin[i], xmax[i]])
    ax.set_xlabel(feature_names[i], fontsize=20)
    ax.set_ylabel('p.d.f.', fontsize=20)
    ax.set_yscale('log')
    ax.legend()

fig.tight_layout(rect=[0, 0, 1, 0.99])

# Save before plt.show(), while `fig` is still the current pyplot figure.
plt.savefig("plots/histo/jet_features_figure.pdf")
plt.show()

with open('plots/histo/jet_features_figure.pkl', 'wb') as f:
    pickle.dump(fig, f)

In [None]:
############################ Track Features ############################
# Train-vs-validation histograms of the per-track features, split by flavour,
# on a 5x2 grid. For every jet only the first 'jet_track_count' entries of the
# per-jet array are used. The figure is written as a PDF and pickled.

features = [
    'track_E', 'track_pt', 'track_deta', 'track_dphi',
    'track_pid', 'track_charge', 'track_d0', 'track_dz',
    'track_d0_sig', 'track_dz_sig'
]

# Raw strings: '\D' and '\s' are not valid escape sequences in normal string
# literals and trigger a warning on recent Python versions.
feature_names = [
    r'Track E', r'Track $p_T$', r'Track $\Delta\eta$', r'Track $\Delta\phi$',
    r'Track PID', r'Track Charge', r'Track $d_0$', r'Track $d_z$',
    r'Track $\sigma_{d_0}$', r'Track $\sigma_{d_z}$'
]

# Displayed x-range per feature (applied with set_xlim below).
xmin = [0, 0, -0.6, -0.6, -205, -1, -40, -80, 0, 0]
xmax = [800, 400, 0.6, 0.6, 205, 1, 40, 80, 800, 800]
# Number of points handed to np.linspace per feature: bins[i] edges,
# i.e. bins[i] - 1 histogram bins.
bins = [30, 30, 30, 30, 30, 30, 30, 30, 30, 30]
# Not used in this cell; kept in case other cells read `cmap`.
cmap = matplotlib.colormaps.get_cmap('Spectral')
fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(15, 25))

# (legend label, frame, colour, extra hist kwargs) in drawing order:
# dashed outlines for the training split, then faint fills for validation.
train_style = {'linestyle': '--', 'histtype': 'step'}
val_style = {'alpha': 0.1}
samples = [
    ('train udsg', df_train_l, 'red', train_style),
    ('train c', df_train_c, 'green', train_style),
    ('train b', df_train_b, 'blue', train_style),
    ('val udsg', df_val_l, 'red', val_style),
    ('val c', df_val_c, 'green', val_style),
    ('val b', df_val_b, 'blue', val_style),
]

for i, feature in enumerate(features):
    # Flatten the first 'jet_track_count' entries of every jet into one list
    # per sample. Zipping the two columns avoids building a Series per row,
    # which is what made the iterrows() version slow.
    values = [
        [
            value
            for track_values, n_tracks in zip(frame[feature], frame['jet_track_count'])
            for value in track_values[:int(n_tracks)].flatten()
        ]
        for _, frame, _, _ in samples
    ]

    # Shared bin edges span the train-udsg range (values[0]); entries of the
    # other samples outside that range fall outside every bin.
    binning = np.linspace(min(values[0]), max(values[0]), bins[i])

    ax = axes[i // 2, i % 2]
    for (label, _, colour, style), sample_values in zip(samples, values):
        ax.hist(sample_values, density=True, bins=binning, label=label, color=colour, **style)

    ax.set_xlabel(feature_names[i], fontsize=20)
    ax.set_xlim([xmin[i], xmax[i]])
    ax.set_ylabel('p.d.f.', fontsize=20)
    ax.set_yscale('log')
    ax.legend()

fig.tight_layout()

# Save before plt.show(), while `fig` is still the current pyplot figure.
plt.savefig("plots/histo/Track_features_figure.pdf")
plt.show()

with open('plots/histo/Track_features_figure.pkl', 'wb') as f:
    pickle.dump(fig, f)


In [None]:
############################ Secondary Vertex Features ############################
# Train-vs-validation histograms of the per-SV features, split by flavour,
# on a 3x3 grid. For every jet only the first 'jet_sv_count' entries of the
# per-jet array are used. The figure is written as a PDF and pickled.
features = [
    'sv_pt', 'sv_ntracks', 'sv_mass',
    'sv_chi2', 'sv_ndof', 'sv_dxy', 'sv_dlen',
    'sv_dxy_sig', 'sv_dlen_sig'
]

# Raw strings: '\c' and '\s' are not valid escape sequences in normal string
# literals and trigger a warning on recent Python versions.
# The last label belongs to 'sv_dlen_sig'; it previously repeated the d_xy label.
feature_names = [
    r'SV $p_T$', r'SV #tracks', r'SV Mass',
    r'SV $\chi^2$', r'SV ndof', r'SV $d_{xy}$', r'SV $d_{len}$',
    r'SV $\sigma_{d_{xy}}$', r'SV $\sigma_{d_{len}}$'
]

# NOTE(review): xmin/xmax are not applied in this cell (no set_xlim call) and
# hold 10 entries for 9 features -- confirm which entry is spurious before
# wiring them in.
xmin = [0, -2, 0, 0, 0, 0, 0, 0, 0, 0]
xmax = [200, 2, 20, 10, 25, 25, 25, 25, 500, 200]
# Number of points handed to np.linspace per feature: bins[i] edges,
# i.e. bins[i] - 1 histogram bins. Only the first nine entries are read.
bins = [50, 10, 10, 30, 50, 40, 30, 30, 40, 40]

# Not used in this cell; kept in case other cells read `cmap`.
cmap = matplotlib.colormaps.get_cmap('Spectral')
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(24, 16))

# (legend label, frame, colour, extra hist kwargs) in drawing order:
# dashed outlines for the training split, then faint fills for validation.
train_style = {'linestyle': '--', 'histtype': 'step'}
val_style = {'alpha': 0.1}
samples = [
    ('train udsg', df_train_l, 'red', train_style),
    ('train c', df_train_c, 'green', train_style),
    ('train b', df_train_b, 'blue', train_style),
    ('val udsg', df_val_l, 'red', val_style),
    ('val c', df_val_c, 'green', val_style),
    ('val b', df_val_b, 'blue', val_style),
]

for i, feature in enumerate(features):
    # Flatten the first 'jet_sv_count' entries of every jet into one list per
    # sample. Zipping the two columns avoids building a Series per row, which
    # is what made the iterrows() version slow.
    values = [
        [
            value
            for sv_values, n_sv in zip(frame[feature], frame['jet_sv_count'])
            for value in sv_values[:int(n_sv)].flatten()
        ]
        for _, frame, _, _ in samples
    ]

    ax = axes[i // 3, i % 3]
    # Shared bin edges span the train-udsg range (values[0]); entries of the
    # other samples outside that range fall outside every bin.
    binning = np.linspace(min(values[0]), max(values[0]), bins[i])
    for (label, _, colour, style), sample_values in zip(samples, values):
        ax.hist(sample_values, density=True, bins=binning, label=label, color=colour, **style)

    ax.set_xlabel(feature_names[i], fontsize=20)
    ax.set_ylabel('p.d.f.', fontsize=20)
    ax.set_yscale('log')
    ax.legend()

fig.tight_layout()

# Save before plt.show(), while `fig` is still the current pyplot figure.
plt.savefig("plots/histo/SV_features_figure.pdf")
plt.show()

with open('plots/histo/SV_features_figure.pkl', 'wb') as f:
    pickle.dump(fig, f)


In [None]:
############################ Indexed Track Features ############################
# Same per-track histograms as the inclusive track plot, but one figure per
# track position j (0, 1, 2): only entry j of each per-jet array is used, and
# only for jets whose 'jet_track_count' exceeds j. Each figure is written as a
# PDF and pickled.

features = [
    'track_E', 'track_pt', 'track_deta', 'track_dphi',
    'track_pid', 'track_charge', 'track_d0', 'track_dz',
    'track_d0_sig', 'track_dz_sig'
]

# Raw strings: '\D' and '\s' are not valid escape sequences in normal string
# literals and trigger a warning on recent Python versions.
feature_names = [
    r'Track E', r'Track $p_T$', r'Track $\Delta\eta$', r'Track $\Delta\phi$',
    r'Track PID', r'Track Charge', r'Track $d_0$', r'Track $d_z$',
    r'Track $\sigma_{d_0}$', r'Track $\sigma_{d_z}$'
]

# Displayed x-range per feature (applied with set_xlim below).
xmin = [0, 0, -0.6, -0.6, -205, -1, -40, -80, 0, 0]
xmax = [800, 400, 0.6, 0.6, 205, 1, 40, 80, 800, 800]
# Number of points handed to np.linspace per feature: bins[i] edges,
# i.e. bins[i] - 1 histogram bins.
bins = [30, 30, 30, 30, 30, 30, 30, 30, 30, 30]

# (legend label, frame, colour, extra hist kwargs) in drawing order:
# dashed outlines for the training split, then faint fills for validation.
train_style = {'linestyle': '--', 'histtype': 'step'}
val_style = {'alpha': 0.1}
samples = [
    ('train udsg', df_train_l, 'red', train_style),
    ('train c', df_train_c, 'green', train_style),
    ('train b', df_train_b, 'blue', train_style),
    ('val udsg', df_val_l, 'red', val_style),
    ('val c', df_val_c, 'green', val_style),
    ('val b', df_val_b, 'blue', val_style),
]

for j in range(3):

    # Not used in this cell; kept in case other cells read `cmap`.
    cmap = matplotlib.colormaps.get_cmap('Spectral')
    fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(15, 25))

    for i, feature in enumerate(features):
        # Collect entry j of every jet that has more than j tracks. Zipping
        # the two columns avoids building a Series per row, which is what
        # made the iterrows() version slow.
        values = [
            [
                value
                for track_values, n_tracks in zip(frame[feature], frame['jet_track_count'])
                if j < int(n_tracks)
                for value in track_values[j].flatten()
            ]
            for _, frame, _, _ in samples
        ]

        # Shared bin edges span the train-udsg range (values[0]); entries of
        # the other samples outside that range fall outside every bin.
        binning = np.linspace(min(values[0]), max(values[0]), bins[i])

        ax = axes[i // 2, i % 2]
        for (label, _, colour, style), sample_values in zip(samples, values):
            ax.hist(sample_values, density=True, bins=binning, label=label, color=colour, **style)

        ax.set_xlabel(feature_names[i], fontsize=20)
        ax.set_xlim([xmin[i], xmax[i]])
        ax.set_ylabel('p.d.f.', fontsize=20)
        ax.set_yscale('log')
        ax.legend()

    fig.tight_layout()

    # Save before plt.show(), while `fig` is still the current pyplot figure.
    plt.savefig(f"plots/histo/Track_features_figure_{j}.pdf")
    plt.show()

    with open(f'plots/histo/Track_features_figure_{j}.pkl', 'wb') as f:
        pickle.dump(fig, f)

    # Figures created in a loop stay registered with pyplot until closed;
    # release this one now that it has been shown, saved and pickled.
    plt.close(fig)


In [None]:
############################ Indexed Secondary Vertex Features ############################
# Same per-SV histograms as the inclusive SV plot, but one figure per SV
# position j (0, 1, 2): only entry j of each per-jet array is used, and only
# for jets whose 'jet_sv_count' exceeds j. Each figure is written as a PDF
# and pickled.
features = [
    'sv_pt', 'sv_ntracks', 'sv_mass',
    'sv_chi2', 'sv_ndof', 'sv_dxy', 'sv_dlen',
    'sv_dxy_sig', 'sv_dlen_sig'
]

# Raw strings: '\c' and '\s' are not valid escape sequences in normal string
# literals and trigger a warning on recent Python versions.
# The last label belongs to 'sv_dlen_sig'; it previously repeated the d_xy label.
feature_names = [
    r'SV $p_T$', r'SV #tracks', r'SV Mass',
    r'SV $\chi^2$', r'SV ndof', r'SV $d_{xy}$', r'SV $d_{len}$',
    r'SV $\sigma_{d_{xy}}$', r'SV $\sigma_{d_{len}}$'
]

# NOTE(review): xmin/xmax are not applied in this cell (no set_xlim call) and
# hold 10 entries for 9 features -- confirm which entry is spurious before
# wiring them in.
xmin = [0, -2, 0, 0, 0, 0, 0, 0, 0, 0]
xmax = [200, 2, 20, 10, 25, 25, 25, 25, 500, 200]
# Number of points handed to np.linspace per feature: bins[i] edges,
# i.e. bins[i] - 1 histogram bins. Only the first nine entries are read.
bins = [50, 10, 10, 30, 50, 40, 30, 30, 40, 40]

# (legend label, frame, colour, extra hist kwargs) in drawing order:
# dashed outlines for the training split, then faint fills for validation.
train_style = {'linestyle': '--', 'histtype': 'step'}
val_style = {'alpha': 0.1}
samples = [
    ('train udsg', df_train_l, 'red', train_style),
    ('train c', df_train_c, 'green', train_style),
    ('train b', df_train_b, 'blue', train_style),
    ('val udsg', df_val_l, 'red', val_style),
    ('val c', df_val_c, 'green', val_style),
    ('val b', df_val_b, 'blue', val_style),
]

for j in range(3):
    # Not used in this cell; kept in case other cells read `cmap`.
    cmap = matplotlib.colormaps.get_cmap('Spectral')
    fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(24, 16))

    for i, feature in enumerate(features):
        # Collect entry j of every jet that has more than j secondary
        # vertices. Zipping the two columns avoids building a Series per row,
        # which is what made the iterrows() version slow.
        values = [
            [
                value
                for sv_values, n_sv in zip(frame[feature], frame['jet_sv_count'])
                if j < int(n_sv)
                for value in sv_values[j].flatten()
            ]
            for _, frame, _, _ in samples
        ]

        ax = axes[i // 3, i % 3]
        # Shared bin edges span the train-udsg range (values[0]); entries of
        # the other samples outside that range fall outside every bin.
        binning = np.linspace(min(values[0]), max(values[0]), bins[i])
        for (label, _, colour, style), sample_values in zip(samples, values):
            ax.hist(sample_values, density=True, bins=binning, label=label, color=colour, **style)

        ax.set_xlabel(feature_names[i], fontsize=20)
        ax.set_ylabel('p.d.f.', fontsize=20)
        ax.set_yscale('log')
        ax.legend()

    fig.tight_layout()

    # Save before plt.show(), while `fig` is still the current pyplot figure.
    plt.savefig(f"plots/histo/SV_features_figure_{j}.pdf")
    plt.show()

    with open(f'plots/histo/SV_features_figure_{j}.pkl', 'wb') as f:
        pickle.dump(fig, f)

    # Figures created in a loop stay registered with pyplot until closed;
    # release this one now that it has been shown, saved and pickled.
    plt.close(fig)
