# DATA PREPROCESSING

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# add grbttools to path
import sys

sys.path.append("../")

In [3]:
import os
import numpy as np
import pandas as pd
from grbtools import env

Load GRBs with Extended Emissions

In [4]:
# load extended emission catalog
path_cat_grb_ee = os.path.join(env.DIR_CATALOGS, "grb_ee.xlsx")
df_ee = pd.read_excel(path_cat_grb_ee, sheet_name="ee")

# show random rows
df_ee.sample(5)

Unnamed: 0,catalog,name,trigger_num
26,FERMI,GRB080807993,
44,SWIFT,GRB100522A,
14,BATSE,GRB 960906-,5592.0
45,SWIFT,GRB110207A,
18,BATSE,4B 931031,2611.0


How many EE GRBs for each catalog?

In [5]:
df_ee["catalog"].value_counts()

BATSE    19
SWIFT    16
FERMI    14
Name: catalog, dtype: int64

## BATSE

Load BATSE catalog

In [6]:
# load batse catalog
path_cat_batse = os.path.join(env.DIR_CATALOGS, "batse_catalog.xlsx")
df_batse = pd.read_excel(path_cat_batse, sheet_name="batsegrb")

# set trigger_num as index
df_batse.set_index("trigger_num", inplace=True)

# show random rows
df_batse.sample(5)

Unnamed: 0_level_0,name,ra,dec,lii,bii,day_trigger,time,flux_64,flux_256,flux_1024,...,t90,fluence_1,fluence_2,fluence_3,fluence_4,comments_quality,comments_otherobs,comments_general,comments_position,comments_duration
trigger_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7657,GRB 990716-,337.26,-44.61,351.87,-56.66,11375,51375.26984,2.789,2.5875,2.3832,...,51.84,1.3953e-06,1.4707e-06,3.2333e-06,2.0097e-06,,,,,
5556,4B 960731-,199.53,-18.29,311.77,44.12,10295,50295.140891,1.584,1.299,0.891,...,1.511,4.9773e-08,6.0206e-08,1.4857e-07,0.0,Data gap during burst accumulation interval,,,,Data gap from T+124 to T+245 s.
3751,4B 950810,30.38,5.76,152.78,-53.0,9939,49939.125098,2.486,1.983,0.87,...,0.535,4.8612e-09,3.2657e-08,1.9034e-07,3.8145e-07,,,Noisy background due to CYG X-1 in LAD 5; Occu...,,
1695,4B 920711-,276.54,73.38,104.2,27.71,8814,48814.672923,,,,...,,,,,,Limited datatypes available.,"Ulysses, PVO, DMS rate increase, WATCH/GRANAT,...",,Location Derived using only max rates,"Visual duration estimate of > 110 s, using MER..."
3914,4B 951120-,136.91,-33.19,258.24,9.64,10041,50041.141792,0.499,0.323,0.269,...,54.432,1.2991e-07,2.009e-07,4.9625e-07,5.6811e-09,Data gap during burst accumulation interval,,,,Data gap from T+170 to T+287 s.


Load redshift values for BATSE catalog

In [7]:
# load redshift values for batse catalog
path_cat_batse_redshift = os.path.join(env.DIR_CATALOGS, "batse_redshift.xlsx")
df_batse_redshift = pd.read_excel(path_cat_batse_redshift, sheet_name="redshift")

# set trigger_num as index
df_batse_redshift.set_index("trigger_num", inplace=True)

# show random rows
df_batse_redshift.sample(5)

Unnamed: 0_level_0,p256,t90,V,z,L4pi
trigger_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
907,3.57,158.1,0.0386,3.9,7.34e+52
2922,2.85,160.8,0.0399,4.6,8.19e+52
3523,21.57,59.1,0.0208,0.7,9.21e+51
1623,2.98,66.1,0.0291,2.7,2.85e+52
2897,2.94,28.0,0.0178,1.3,5.44e+51


Set EE flag

In [8]:
# add column for EE flag
df_batse["ee"] = False

# get name of BATSE GRBs with EE
batse_grbs_with_ee = df_ee.loc[df_ee["catalog"] == "BATSE", "name"].values

# set EE flag to True for BATSE GRBs with EE
df_batse.loc[df_batse["name"].isin(batse_grbs_with_ee), "ee"] = True

# how many BATSE GRBs with EE?
df_batse["ee"].value_counts()

False    2666
True       36
Name: ee, dtype: int64

Set *redshift* and *luminosity* values

In [9]:
# add missing columns
df_batse["redshift"] = np.nan
df_batse["luminosity"] = np.nan
# fill missing values
for triggerr_num, row in df_batse_redshift.iterrows():
    # make sure that GRB is in BATSE catalog
    assert (
        triggerr_num in df_batse.index
    ), "Could not find GRB in BATSE catalog for {}".format(triggerr_num)

    # get t90 from BATSE redshift catalog
    t90_1 = row["t90"]
    # get t90 from original BATSE
    t90_2 = df_batse.loc[triggerr_num, "t90"]

    # check if they are close to each other
    assert np.isclose(
        t90_1, t90_2, rtol=1e-2
    ), "t90 values are not close to each other for {}".format(triggerr_num)

    # fill redshift
    df_batse.loc[triggerr_num, "redshift"] = row["z"]
    # fill luminosity
    df_batse.loc[triggerr_num, "luminosity"] = row["L4pi"]

Calculate features

In [10]:
# calculate intrinsic t90
df_batse["t90_intrinsic"] = df_batse["t90"] / (df_batse["redshift"] + 1)
# take log of intrinsic t90
df_batse["lgT90i"] = df_batse["t90_intrinsic"].apply(np.log10)
# take log of t90
df_batse["lgT90"] = df_batse["t90"].apply(np.log10)

# calculate hardness
df_batse["hardness"] = df_batse["fluence_3"] / df_batse["fluence_1"]
# take log of hardness
df_batse["lgHrd"] = df_batse["hardness"].apply(np.log10)

# take log of luminosity
df_batse["lgLum"] = df_batse["luminosity"].apply(np.log10)

Save dataset

In [11]:
# save BATSE catalog
path_data_batse = os.path.join(env.DIR_DATASETS, "batse.xlsx")
df_batse.to_excel(
    path_data_batse, sheet_name="data", freeze_panes=(1, 0), engine="xlsxwriter"
)

## FERMI

Load FERMI catalog

In [12]:
# load fermi catalog
path_cat_fermi = os.path.join(env.DIR_CATALOGS, "fermi_catalog.xlsx")
df_fermi = pd.read_excel(path_cat_fermi, sheet_name="fermigbrst")

# show random rows
df_fermi.sample(5)

Unnamed: 0,name,ra,dec,trigger_time,t90,fluence,flux_1024,flux_1024_time,flux_64,lii,bii,t50,fluence_batse,flux_256,flux_batse_1024,flux_batse_64,flux_batse_256,flnc_comp_ampl,flnc_comp_epeak,flnc_comp_index
2519,GRB171212222,61.49,24.22,58099.22239,19.968,2.8735e-06,6.8823,0.32,8.78,169.8546,-20.4485,5.632,1.3505e-06,7.4117,1.6119,2.6338,1.9057,0.014127,53.32233,-1.226308
627,GRB120919309,214.768,-45.564,56189.308819,21.248,1.678e-05,24.5176,2.624,28.1425,318.5949,14.6258,2.816,9.515e-06,27.5658,11.8929,13.779,13.2647,0.024419,221.2393,-0.995339
1808,GRB140724533,314.73,-1.85,56862.533206,0.896,1.2694e-07,1.6223,-0.256,4.566,46.9493,-28.9726,0.128,8.2537e-08,3.1034,0.5297,2.5667,1.2351,0.041516,155.477,-0.185447
1207,GRB131119781,47.96,-24.01,56615.78111,34.816,1.8487e-06,2.277,8.0,4.0633,215.2597,-58.2851,20.224,1.1114e-06,3.3146,0.983,1.8252,1.3904,0.013101,96.6423,-0.573525
606,GRB100718160,121.83,-46.18,55395.159833,32.641,2.7472e-06,4.0487,3.664,6.6872,261.6109,-7.3932,8.576,1.5921e-06,5.8744,1.5549,2.9767,2.7037,0.005334,109.9589,-1.19258


Set EE flag

In [13]:
# add column for EE flag
df_fermi["ee"] = False

# get name of FERMI GRBs with EE
fermi_grbs_with_ee = df_ee.loc[df_ee["catalog"] == "FERMI", "name"].values

# set EE flag to True for FERMI GRBs with EE
df_fermi.loc[df_fermi["name"].isin(fermi_grbs_with_ee), "ee"] = True

# how many FERMI GRBs with EE?
df_fermi["ee"].value_counts()

False    3555
True       14
Name: ee, dtype: int64

Calculate hardness ratio

In [14]:
from scipy.integrate import quad

# calculate hardness
df_fermi["hardness"] = np.nan
for index, row in df_fermi.iterrows():
    epeak = row["flnc_comp_epeak"]
    alpha = row["flnc_comp_index"]
    amplitude = row["flnc_comp_ampl"]

    if np.isnan(epeak) or np.isnan(alpha) or np.isnan(amplitude):
        continue

    flux = (
        lambda energy: energy
        * amplitude
        * ((energy / 100) ** alpha)
        * np.exp((-1 * (alpha + 2) * energy / epeak))
    )

    fluence3, err = quad(flux, 100, 300)
    fluence1, err = quad(flux, 25, 50)

    # hardness
    df_fermi.loc[index, "hardness"] = fluence3 / fluence1

Log transform of features

In [15]:
df_fermi["lgT90"] = df_fermi["t90"].apply(np.log10)
df_fermi["lgHrd"] = df_fermi["hardness"].apply(np.log10)

Save dataset

In [16]:
# save FERMI catalog
path_data_fermi = os.path.join(env.DIR_DATASETS, "fermi.xlsx")
df_fermi.to_excel(
    path_data_fermi, sheet_name="data", freeze_panes=(1, 0), engine="xlsxwriter"
)

## SWIFT

Load SWIFT Catalog

In [17]:
# load fermi catalog
path_cat_swift = os.path.join(env.DIR_CATALOGS, "swift_catalog.xlsx")
df_swift = pd.read_excel(path_cat_swift)

# show random rows
df_swift.sample(5)

Unnamed: 0,name,z,z_comment,t90,t1s_best_model,t1s_pl_alpha,t1s_pl_norm,t1s_cpl_alpha,t1s_cpl_norm,t1s_cpl_epeak,...,flux,model_used,t100s_best_model,t100s_pl_fluence_25_50_kev,t100s_pl_fluence_100_150_kev,t100s_cpl_fluence_25_50_kev,t100s_cpl_fluence_100_150_kev,t100s_pl_hardness,t100s_cpl_hardness,hardness_ratio
924,GRB110519A,,,27.184,,-1.93051,0.03159,-1.17117,0.08331,48.388802,...,,,PL,1.224779e-06,6.42078e-07,,,0.52424,,0.52424
956,GRB110102A,,,265.915985,PL,-1.1817,0.06632,-1.1668,0.067323,9887.129883,...,,,PL,3.913973e-06,4.245589e-06,3.914334e-06,4.242264e-06,1.084726,1.083777,1.084726
665,GRB140331A,4.65,,209.656006,,-1.37838,0.001856,-1.41346,0.001851,9999.360352,...,1.801741e-08,P,PL,2.122432e-07,1.302458e-07,2.273284e-07,7.917604e-08,0.613663,0.348289,0.613663
635,GRB140703A,3.14,,68.643997,PL,-1.17099,0.023658,-0.518898,0.044838,137.886002,...,2.420305e-07,P,PL,1.049984e-06,8.911743e-07,1.048583e-06,8.967527e-07,0.84875,0.855204,0.84875
104,GRB201229A,,,53.248001,PL,-1.29081,0.014234,-0.635939,0.028492,116.556,...,,,PL,2.957551e-07,3.638499e-07,3.340967e-07,2.175515e-07,1.230241,0.651163,1.230241


Set EE flag

In [18]:
# add column for EE flag
df_swift["ee"] = False

# get name of FERMI GRBs with EE
swift_grbs_with_ee = df_ee.loc[df_ee["catalog"] == "SWIFT", "name"].values

# set EE flag to True for SWIFT GRBs with EE
df_swift.loc[df_swift["name"].isin(swift_grbs_with_ee), "ee"] = True

# how many SWIFT GRBs with EE?
df_swift["ee"].value_counts()

False    1509
True       16
Name: ee, dtype: int64

In [19]:
df_swift

Unnamed: 0,name,z,z_comment,t90,t1s_best_model,t1s_pl_alpha,t1s_pl_norm,t1s_cpl_alpha,t1s_cpl_norm,t1s_cpl_epeak,...,model_used,t100s_best_model,t100s_pl_fluence_25_50_kev,t100s_pl_fluence_100_150_kev,t100s_cpl_fluence_25_50_kev,t100s_cpl_fluence_100_150_kev,t100s_pl_hardness,t100s_cpl_hardness,hardness_ratio,ee
0,GRB220715B,,,40.408001,,-0.78456,0.011339,-0.794540,0.011458,9999.360352,...,,,2.745653e-07,3.548589e-07,2.841552e-07,3.089055e-07,1.292439,1.087101,1.292439,False
1,GRB220714B,,,49.040001,PL,-1.64781,0.029445,-1.448870,0.037503,131.455994,...,,PL,1.304739e-06,8.542476e-07,1.301649e-06,8.842270e-07,0.654727,0.679313,0.654727,False
2,GRB220711B,,,87.056000,,-1.43626,0.024784,0.162297,0.137160,76.114601,...,,PL,1.585870e-06,1.284126e-06,1.624006e-06,1.146457e-06,0.809730,0.705944,0.809730,False
3,GRB220708A,,,4.000000,PL,-1.91564,0.004836,-1.925260,0.004807,9995.040039,...,,PL,3.786866e-08,1.671822e-08,4.478472e-08,4.349708e-09,0.441479,0.097125,0.441479,False
4,GRB220706A,,,85.996002,PL,-1.63229,0.006975,-1.643620,0.006933,9999.360352,...,,PL,3.715433e-07,2.936091e-07,3.685780e-07,3.140360e-07,0.790242,0.852020,0.790242,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1520,GRB041220,,,5.584000,PL,-1.31802,0.014645,-1.106840,0.018479,227.095993,...,,PL,9.797765e-08,8.605123e-08,9.951924e-08,8.014509e-08,0.878274,0.805323,0.878274,False
1521,GRB041219C,,,10.000000,PL,-1.70236,0.017536,-1.275860,0.028918,85.261497,...,,PL,3.452719e-07,2.003278e-07,,,0.580203,,0.580203,False
1522,GRB041219B,,,9.856000,PL,-1.34096,0.011391,-0.633827,0.025028,101.986000,...,,PL,8.485209e-08,4.795263e-08,8.094358e-08,3.807786e-08,0.565132,0.470425,0.565132,False
1523,GRB041219A,,,,,,,,,,...,,,,,,,,,,False


Set 'Magnetar' flag

In [20]:
# first, load magnetar catalog
path_cat_magnetar = os.path.join(env.DIR_CATALOGS, "swift_magnetars.xlsx")
df_magnetar = pd.read_excel(path_cat_magnetar, sheet_name="magnetars")

# get name of magnetars
magnetar_names = df_magnetar["name"].values

# add column for magnetar flag
df_swift["magnetar"] = False

# set magnetar flag to True for SWIFT GRBs with magnetar
df_swift.loc[df_swift["name"].isin(magnetar_names), "magnetar"] = True

# how many SWIFT GRBs with magnetar?
df_swift["magnetar"].value_counts()

False    1513
True       12
Name: magnetar, dtype: int64

Calculate features

In [21]:
# compute intrinsic
df_swift["t90_intrinsic"] = df_swift["t90"] / (df_swift["z"] + 1)

# take logs
df_swift["lgT90"] = df_swift["t90"].apply(np.log10)
# take log of intrinsic t90
df_swift["lgT90i"] = df_swift["t90_intrinsic"].apply(np.log10)
# take log of hardness
df_swift["lgHrd"] = df_swift["hardness_ratio"].apply(np.log10)
# take log of luminosity
df_swift["lgLum"] = df_swift["lum_kcorr"].apply(np.log10)

Save dataset

In [22]:
# save SWIFT catalog
path_data_swift = os.path.join(env.DIR_DATASETS, "swift.xlsx")
df_swift.to_excel(
    path_data_swift, sheet_name="data", freeze_panes=(1, 0), engine="xlsxwriter"
)