In [2]:
import json
import pandas as pd
import h5py
import numpy as np
import os
import matplotlib.pyplot as plt
import math
import gzip
from tqdm import tqdm

In [3]:
def load_df (path):
    '''
    Load one gzip-compressed jet ntuple text file into a numeric DataFrame.

    Every non-blank line is split on "," into at most 18 fields (labels
    0-17). Curly braces are stripped from fields 3, 10, 11 and 16, field 17
    (the unsplit remainder of the line) is discarded, and the remaining
    fields are converted to float, with field 2 additionally cast to int.

    Parameters
    ----------
    path : str or path-like
        Location of the gzip-compressed text file.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line, columns labelled 0..16.

    Raises
    ------
    pandas.errors.EmptyDataError
        If the file holds no non-blank lines.
    ValueError
        If one of the retained fields cannot be parsed as a number.
    '''
    # Read the lines ourselves instead of pd.read_csv(sep='\n'): recent pandas
    # releases refuse a newline as the column separator.
    with gzip.open(path, 'rt') as pf:
        lines = [line.rstrip('\r\n') for line in pf if line.strip()]
    if not lines:
        raise pd.errors.EmptyDataError("No rows found in %s" % path)

    df = pd.Series(lines, dtype=object).str.split(",", n=17, expand=True)

    for col in (3, 10, 11, 16):
        # regex=False: the braces are literal characters, and the default of
        # `regex` is not the same across pandas versions.
        df[col] = df[col].str.replace('{', '', regex=False).str.replace('}', '', regex=False)

    # Field 17 only exists when a line has more than 17 comma-separated values.
    df = df.drop(columns=[17], errors='ignore').astype(float)
    df[2] = df[2].astype(int)
    return df

In [4]:
def load_df_tv (path):
    '''
    Load one gzip-compressed jet ntuple text file, keeping the trailing field.

    Every non-blank line is split on "," into at most 18 fields (labels
    0-17). Curly braces are stripped from fields 3, 10, 11 and 16; fields
    0-16 are converted to float (field 2 additionally to int), while field
    17 - the unsplit remainder of the line - is kept verbatim as a string.

    Parameters
    ----------
    path : str or path-like
        Location of the gzip-compressed text file.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line, numeric columns 0..16 followed by the
        raw string column 17.

    Raises
    ------
    pandas.errors.EmptyDataError
        If the file holds no non-blank lines.
    KeyError
        If no line has an 18th field, so column 17 does not exist.
    ValueError
        If one of the numeric fields cannot be parsed as a number.
    '''
    # Read the lines ourselves instead of pd.read_csv(sep='\n'): recent pandas
    # releases refuse a newline as the column separator.
    with gzip.open(path, 'rt') as pf:
        lines = [line.rstrip('\r\n') for line in pf if line.strip()]
    if not lines:
        raise pd.errors.EmptyDataError("No rows found in %s" % path)

    df = pd.Series(lines, dtype=object).str.split(",", n=17, expand=True)

    for col in (3, 10, 11, 16):
        # regex=False: the braces are literal characters, and the default of
        # `regex` is not the same across pandas versions.
        df[col] = df[col].str.replace('{', '', regex=False).str.replace('}', '', regex=False)

    # Set the string column aside so the numeric cast only sees fields 0-16.
    tv_series = df[17]
    df = df.drop(columns=[17]).astype(float)
    df[2] = df[2].astype(int)
    return pd.concat([df, tv_series], axis=1)

In [176]:
def data_cleaning (df, tv_series=False):
    '''
    Name the raw columns, one-hot encode the flavor and drop invalid rows.

    The first 17 columns of `df` are renamed, in order, to jet_pt, jet_eta,
    flavor, eight track-level names and six vertex-level names. The flavor
    column is replaced by the indicator columns Light_Jet (flavor 0),
    Charm_Jet (flavor 4) and Bottom_Jet (flavor 5); all three are always
    present, even when a flavor does not occur in `df`. Rows in which any
    feature equals +inf, -inf or -1, or in which any value is missing, are
    removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame as produced by `load_df` / `load_df_tv`. It is not
        modified.
    tv_series : bool, default False
        When True, the column labelled 17 is set aside, the last column is
        excluded from renaming, and column 17 is appended unchanged after
        the label columns.

    Returns
    -------
    pandas.DataFrame
        Feature columns, then the flavor indicator columns (uint8), then
        column 17 when `tv_series` is True.
    '''
    column3 = ['track_2_d0_significance', 'track_3_d0_significance','track_2_z0_significance', 'track_3_z0_significance','n_tracks_over_d0_threshold', 'jet_prob', 'jet_width_eta', 'jet_width_phi']
    column4 = ['vertex_significance', 'n_secondary_vertices', 'n_secondary_vertex_tracks', 'delta_r_vertex', 'vertex_mass', 'vertex_energy_fraction']
    feature_names = ['jet_pt','jet_eta','flavor']+column3+column4

    if tv_series:
        tv = df[17]
        features = df.iloc[:,:-1]
    else:
        features = df

    # rename() without inplace returns a new frame, so the caller's `df`
    # keeps its original column labels.
    features = features.rename(columns=dict(zip(features.columns.values, feature_names)))

    flavor = features['flavor']
    # Whole-number floats (0.0, 4.0, 5.0) would otherwise yield columns such
    # as 'flavor_0.0' that the rename below cannot match.
    if pd.api.types.is_float_dtype(flavor) and flavor.notna().all() and flavor.mod(1).eq(0).all():
        flavor = flavor.astype(int)

    label_df = pd.get_dummies(flavor, prefix='flavor', dtype=np.uint8)
    # Guarantee the three expected classes exist even if this frame contains
    # no jet of some flavor; unexpected flavors are kept after them.
    expected = ['flavor_0', 'flavor_4', 'flavor_5']
    extra = [name for name in label_df.columns if name not in expected]
    label_df = label_df.reindex(columns=expected + extra, fill_value=0).astype(np.uint8)
    label_df = label_df.rename(columns={'flavor_0':"Light_Jet",'flavor_4':'Charm_Jet','flavor_5':'Bottom_Jet'})

    # Only the feature columns can hold the sentinels (+/-inf, -1); the label
    # columns are 0/1 and column 17 is passed through unchanged.
    numeric = features.drop(['flavor'],axis=1)
    valid = (numeric != -np.inf) & (numeric != np.inf) & (numeric != -1)
    numeric = numeric.where(valid, np.nan)

    parts = [numeric, label_df]
    if tv_series:
        parts.append(tv)

    return pd.concat(parts, axis=1).dropna()
    


In [125]:
def bg_charm_deleted (df, n_drop=310):
    '''
    Remove up to `n_drop` charm jets from a cleaned background frame.

    The frame is sorted ascending by the 'Charm_Jet' indicator so that the
    charm rows end up at the bottom, and the last rows are cut off. The
    number of rows removed never exceeds the number of charm rows, so
    Light/Bottom rows are not discarded when a file holds fewer charm jets
    than `n_drop`.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned frame containing a 0/1 'Charm_Jet' column.
    n_drop : int, default 310
        Maximum number of charm rows to remove. Negative values are
        treated as 0.

    Returns
    -------
    pandas.DataFrame
        The sorted frame without the removed charm rows; the original index
        labels are kept.
    '''
    n_charm = int((df['Charm_Jet'] == 1).sum())
    n_remove = max(0, min(int(n_drop), n_charm))
    # mergesort is stable: rows with the same indicator keep their input order.
    sorted_df = df.sort_values(by='Charm_Jet', ascending=True, kind='mergesort')
    # Slice by an explicit end position; `iloc[:-0]` would return nothing.
    return sorted_df.iloc[:len(sorted_df) - n_remove, :]


In [126]:
def hist_plot(data,bins,log=False):
    '''
    Draw per-feature step histograms, one curve per label column.

    The last three columns of `data` are used as 0/1 label columns and all
    columns before them as features. Each feature gets its own panel in a
    grid that is four panels wide; within a panel, one normalised histogram
    is drawn for the rows where each label equals 1.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns followed by exactly three label columns.
    bins : int or sequence
        Passed to `Axes.hist` as the bin specification.
    log : bool, default False
        Passed to `Axes.hist`.

    Returns
    -------
    None
        The figure is displayed with `plt.show()`.
    '''
    plt.rcParams["figure.figsize"] = (15,10)
    features = data.iloc[:,:-3].columns.values
    labels = data.iloc[:,-3:].columns.values
    col = 4
    row = max(1, math.ceil(len(features)/col))
    # squeeze=False keeps the axes array 2-D even when there is a single row,
    # which happens for four or fewer features.
    fig, axes = plt.subplots(row, col, squeeze=False)
    panels = axes.ravel()   # row-major: panel i sits at [i // col, i % col]
    for panel, feat in zip(panels, features):
        for label in labels:
            panel.hist(data[data[label]==1][feat], bins, density=True, histtype='step', label=label, log=log)
        panel.legend()
        panel.set_xlabel(feat)
        panel.set_ylabel('Fraction of Events')
    # Hide the grid cells that have no feature assigned to them.
    for unused in panels[len(features):]:
        unused.set_visible(False)
    plt.tight_layout()
    plt.show()

In [None]:
# Exploratory cell: estimate the mean number of Light/Charm/Bottom jets per file.
# NOTE(review): this cell calls return_distr, which is defined in a later cell,
# so it fails on a fresh kernel run from top to bottom.
nums = np.arange(1,10000,1)
# Draw 50 distinct file indices from 1..9999 for each sample (no seed is set,
# so the selection differs from run to run).
sig_rand =np.random.choice(nums,size=50, replace=False)
bg_rand = np.random.choice(nums,size=50, replace=False)

sig_path = ["data/jet_flavor/signal/delphes._%04d.ntuple.txt.gz"%(i) for i in sig_rand]
bg_path = ["data/jet_flavor/bg/delphes._%04d.ntuple.txt.gz"%(i) for i in bg_rand]

# Only the background paths are processed below; sig_path is built but not used
# in this cell.
mean_set = []
for path in tqdm(bg_path):
    # return_distr(..., True) returns the [Light, Charm, Bottom] counts of one file.
    mean_set.append(return_distr(data_cleaning(load_df(path)),True))

# Average the per-file counts over all sampled files (one value per flavor).
np.mean(mean_set,axis=0)

In [180]:
def concat (num, customize=False, tv_series=False, seed=None):
    '''
    Assemble a dataset of roughly `num` jets from randomly chosen files.

    The number of signal and background files to read is derived from the
    approximate per-file jet counts (Light, Charm, Bottom):

        signal:      36.06,   4.46, 3121.38
        background: 323.3 , 391.42,    8.3

    By default the file counts aim at num/3 Bottom jets from signal files
    and num/3 Charm jets from background files. With `customize=True` they
    aim at about 45% Bottom and 44% Light jets instead, and every background
    frame is passed through `bg_charm_deleted` to thin out its charm jets.

    Parameters
    ----------
    num : int or float
        Target total number of jets.
    customize : bool, default False
        Use the 44:11:45 style mix described above.
    tv_series : bool, default False
        Load files with `load_df_tv` and keep column 17 through cleaning.
    seed : int or None, default None
        When given, file indices are drawn from a dedicated
        `np.random.RandomState(seed)` so the selection is reproducible.
        When None, the global NumPy random state is used.

    Returns
    -------
    pandas.DataFrame
        All cleaned frames stacked with a fresh 0..n-1 index (signal files
        first, then background files); an empty frame if no file is selected.

    Notes
    -----
    File indices are drawn from 1..9999 without checking that the
    corresponding file exists.
    '''
    sig_dis = (36.06, 4.46, 3121.38)
    bg_dis = (323.3 , 391.42,   8.3)
    sig_num = int(num/3 // sig_dis[2])
    bg_num = int(num/3 // bg_dis[1])

    if customize:
        weight = .825      # rough calculation, using weight to approach.
        sig_num = int(num *45/100 // sig_dis[2])
        # Light jets already contributed by the signal files are subtracted
        # before sizing the background sample.
        bg_num = int((num*44/100 - sig_num*sig_dis[0]) // bg_dis[0] * weight)

    rng = np.random if seed is None else np.random.RandomState(seed)
    nums = np.arange(1,10000,1)
    sig_rand = rng.choice(nums,size=sig_num, replace=False)
    bg_rand = rng.choice(nums,size=bg_num, replace=False)

    sig_path = ["data/jet_flavor/signal/delphes._%04d.ntuple.txt.gz"%(i) for i in sig_rand]
    bg_path = ["data/jet_flavor/bg/delphes._%04d.ntuple.txt.gz"%(i) for i in bg_rand]

    loader = load_df_tv if tv_series else load_df

    # Collect the per-file frames and concatenate once at the end; growing a
    # DataFrame with pd.concat inside the loop re-copies all earlier rows on
    # every iteration.
    frames = []
    for path in tqdm(sig_path):
        frames.append(data_cleaning(loader(path), tv_series=tv_series))

    for path in tqdm(bg_path):
        frame = data_cleaning(loader(path), tv_series=tv_series)
        if customize:
            frame = bg_charm_deleted(frame)
        frames.append(frame)

    if not frames:
        # pd.concat([]) raises; keep returning an empty frame in this case.
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)


In [181]:
# NOTE(review): the string below looks like a reminder listing file indices
# that had to be created by copying a neighbouring file - confirm with the author.
'''
copy the previous file to the following:
50, 404, 504, 1666, *8044, 9829, 7898, 7899, 4091, 6899, 0476, 8174,9062,5934,8611,5127,0928,5458, 3039, 5486, 9330, 4230, 3393, 6078, 5053, 555, 4953
'''

# Build the ~500k-jet dataset with the customized class mix, keeping column 17
# (tv_series=True). The files are drawn at random, so the result is not
# reproducible across runs.
data_500k_with_tv = concat(5e5,customize=True,tv_series=True)
# data_1500k = concat(15e5,True)

100%|██████████| 72/72 [00:19<00:00,  3.71it/s]
100%|██████████| 554/554 [03:03<00:00,  3.01it/s]


In [143]:
def return_distr(df,ret=False):
    '''
    Report how many Light, Charm and Bottom jets a cleaned frame contains.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the indicator columns "Light_Jet", "Charm_Jet" and
        "Bottom_Jet".
    ret : bool, default False
        When True, return the three counts instead of printing them.

    Returns
    -------
    list or None
        [light, charm, bottom] column sums when `ret` is True. Otherwise
        None: the three counts are printed, one per line, followed by their
        share of the total as truncated integer percentages.
    '''
    L_j = df["Light_Jet"].sum()
    C_j = df["Charm_Jet"].sum()
    B_j = df["Bottom_Jet"].sum()

    if ret:
        return [L_j, C_j, B_j]

    print(L_j)
    print(C_j)
    print(B_j)

    total = L_j + C_j + B_j
    if total == 0:
        # No jets at all: report zero shares instead of dividing by zero.
        shares = (0, 0, 0)
    else:
        shares = (L_j/total*100, C_j/total*100, B_j/total*100)

    # %d truncates, so the printed percentages may add up to less than 100.
    print("Ratio L:C:B = %d : %d : %d" % shares)

In [183]:
# Print the Light/Charm/Bottom jet counts of the assembled dataset and their
# percentage ratio.
return_distr(data_500k_with_tv)

180923
47118
230679
Ratio L:C:B = 39 : 10 : 50


In [184]:
# NOTE: despite its name, `out_dir` is used as the output file path for to_hdf.
out_dir = 'data/jet_flavor/data_500k_with_tv'
# Store the frame under the key 'data'; mode='w' replaces an existing file.
data_500k_with_tv.to_hdf(out_dir,key='data',mode='w')
    

In [None]:
# NOTE(review): `data_300k` is not defined in any cell visible here, so this
# call only works if it was created elsewhere in the kernel session.
hist_plot(data_300k,50)