# DATA PREPROCESSING

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# add grbttools to path
import sys
sys.path.append("../")

In [3]:
import os
import numpy as np
import pandas as pd
from grbtools import env

Load GRBs with Extended Emissions

In [4]:
# load extended emission catalog
path_cat_grb_ee = os.path.join(env.DIR_CATALOGS, "grb_ee.xlsx")
df_ee = pd.read_excel(path_cat_grb_ee, sheet_name="ee")

# show random rows
df_ee.sample(5)

Unnamed: 0,catalog,name,trigger_num
16,BATSE,4B 920722,1719.0
36,SWIFT,GRB061006,
41,SWIFT,GRB090531B,
18,BATSE,4B 931031,2611.0
11,BATSE,GRB 961017-,5634.0


How many EE GRBs for each catalog?

In [5]:
df_ee['catalog'].value_counts()

BATSE    19
SWIFT    16
FERMI    14
Name: catalog, dtype: int64

## BATSE

Load BATSE catalog

In [6]:
# load batse catalog
path_cat_batse = os.path.join(env.DIR_CATALOGS, "batse_catalog.xlsx")
df_batse = pd.read_excel(path_cat_batse, sheet_name="batsegrb")

# set trigger_num as index
df_batse.set_index("trigger_num", inplace=True)

# show random rows
df_batse.sample(5)

Unnamed: 0_level_0,name,ra,dec,lii,bii,day_trigger,time,flux_64,flux_256,flux_1024,...,t90,fluence_1,fluence_2,fluence_3,fluence_4,comments_quality,comments_otherobs,comments_general,comments_position,comments_duration
trigger_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5434,4B 960418-,91.12,7.91,200.47,-6.7,10191,50191.176281,0.95,0.92,0.791,...,35.04,2.9848e-07,2.2453e-07,7.8282e-07,1.8691e-06,,,,,
2152,4B 930131-,79.83,-2.6,204.41,-21.62,9018,49018.882769,0.922,0.741,0.421,...,4.416,1.6617e-07,1.5585e-07,4.4889e-08,0.0,,,Background source in both triggered detectors.,,
6386,GRB 970918-,29.39,-67.58,293.54,-48.32,10709,50709.669713,2.0008,1.5024,0.9752,...,0.832,2.1449e-08,2.1905e-08,2.1693e-07,9.7353e-07,,,,,
7272,GRB 981221-,286.58,-9.87,25.78,-7.76,11168,51168.041395,,,,...,,,,,,,,,,
2798,4B 940206-,144.2,-59.96,280.74,-5.76,9389,49389.005993,24.192,23.748,22.372,...,49.152,4.0022e-06,1.3225e-05,6.5986e-05,0.00014808,,"Ulysses, DMS rate increase, SIGMA/GRANAT, PHEB...",,,


Load redshift values for BATSE catalog

In [7]:
# load redshift values for batse catalog
path_cat_batse_redshift = os.path.join(env.DIR_CATALOGS, "batse_redshift.xlsx")
df_batse_redshift = pd.read_excel(path_cat_batse_redshift, sheet_name="redshift")

# set trigger_num as index
df_batse_redshift.set_index("trigger_num", inplace=True)

# show random rows
df_batse_redshift.sample(5)

Unnamed: 0_level_0,p256,t90,V,z,L4pi
trigger_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
222,3.99,73.1,0.038,3.6,6.92e+52
3345,6.76,40.4,0.0359,2.6,5.72e+52
2984,4.61,32.8,0.0302,2.4,3.21e+52
3352,3.71,46.3,0.0047,0.2,6.2e+49
3056,2.41,36.3,0.0196,1.7,7.58e+51


Set EE flag

In [8]:
# add column for EE flag
df_batse["ee"] = False

# get name of BATSE GRBs with EE
batse_grbs_with_ee = df_ee.loc[df_ee["catalog"] == "BATSE", 'name'].values

# set EE flag to True for BATSE GRBs with EE
df_batse.loc[df_batse['name'].isin(batse_grbs_with_ee), "ee"] = True

# how many BATSE GRBs with EE?
df_batse["ee"].value_counts()


False    2666
True       36
Name: ee, dtype: int64

Set *redshift* and *luminosity* values

In [9]:
# add missing columns 
df_batse["redshift"] = np.nan
df_batse["luminosity"] = np.nan
# fill missing values
for triggerr_num, row in df_batse_redshift.iterrows():
    
    # make sure that GRB is in BATSE catalog
    assert triggerr_num in df_batse.index, "Could not find GRB in BATSE catalog for {}".format(triggerr_num)

    # get t90 from BATSE redshift catalog
    t90_1 = row["t90"]
    # get t90 from original BATSE
    t90_2 = df_batse.loc[triggerr_num, "t90"]
    
    # check if they are close to each other
    assert np.isclose(t90_1, t90_2, rtol=1e-2), "t90 values are not close to each other for {}".format(triggerr_num) 

    # fill redshift
    df_batse.loc[triggerr_num, "redshift"] = row["z"]
    # fill luminosity
    df_batse.loc[triggerr_num, "luminosity"] = row["L4pi"]


Calculate features

In [10]:

# calculate intrinsic t90
df_batse["t90_intrinsic"] = df_batse["t90"]/(df_batse["redshift"]+1)
# take log of intrinsic t90
df_batse["lgT90i"] = df_batse["t90_intrinsic"].apply(np.log10) 
# take log of t90
df_batse["lgT90"] = df_batse["t90"].apply(np.log10)

# calculate hardness
df_batse["hardness"] = df_batse["fluence_3"]/df_batse["fluence_1"]
# take log of hardness
df_batse["lgHrd"] = df_batse["hardness"].apply(np.log10)

# take log of luminosity
df_batse["lgLum"] = df_batse["luminosity"].apply(np.log10)


Detect Outliers

Save dataset

In [11]:
# save BATSE catalog
path_data_batse = os.path.join(env.DIR_DATASETS, "batse.xlsx")
df_batse.to_excel(path_data_batse, sheet_name="data", freeze_panes=(1,0), engine="xlsxwriter")

## FERMI

Load FERMI catalog

In [12]:
# load fermi catalog
path_cat_fermi = os.path.join(env.DIR_CATALOGS, "fermi_catalog.xlsx")
df_fermi = pd.read_excel(path_cat_fermi, sheet_name="fermigbrst")

# show random rows
df_fermi.sample(5)

Unnamed: 0,name,ra,dec,trigger_time,t90,fluence,flux_1024,flux_1024_time,flux_64,lii,bii,t50,fluence_batse,flux_256,flux_batse_1024,flux_batse_64,flux_batse_256,flnc_comp_ampl,flnc_comp_epeak,flnc_comp_index
946,GRB110301214,229.35,29.4,55621.214387,5.693,3.6e-05,100.726,1.92,130.233,45.7119,57.9092,2.304,2.2e-05,119.832,35.7646,48.6915,44.7773,0.153943,117.2617,-1.013
1753,GRB110706202,100.08,6.14,55748.20213,12.032,3e-06,2.5973,2.624,4.0139,206.2026,0.3358,5.888,2e-06,3.4425,1.3741,2.3929,1.8404,0.005846,489.671,-0.977343
822,GRB170710340,43.122,42.679,57944.34037,42.24,3e-06,2.7198,5.376,4.7857,145.4682,-14.8338,17.152,2e-06,3.2529,0.8187,1.8798,1.1303,0.005393,237.8485,-1.003138
1399,GRB090117335,227.3,-41.5,54848.334748,27.264,1e-06,3.674,24.384,5.3805,328.8562,14.3343,25.152,1e-06,4.3677,1.2374,1.9683,1.5474,0.005656,233.5418,-1.383541
2309,GRB100718160,121.83,-46.18,55395.159833,32.641,3e-06,4.0487,3.664,6.6872,261.6109,-7.3932,8.576,2e-06,5.8744,1.5549,2.9767,2.7037,0.005334,109.9589,-1.19258


Set EE flag

In [13]:
# add column for EE flag
df_fermi["ee"] = False

# get name of FERMI GRBs with EE
fermi_grbs_with_ee = df_ee.loc[df_ee["catalog"] == "FERMI", 'name'].values

# set EE flag to True for FERMI GRBs with EE
df_fermi.loc[df_fermi['name'].isin(fermi_grbs_with_ee), "ee"] = True

# how many FERMI GRBs with EE?
df_fermi["ee"].value_counts()


False    2859
True       14
Name: ee, dtype: int64

Calculate hardness ratio

In [14]:
from scipy.integrate import quad

# calculate hardness
df_fermi["hardness"] = np.nan
for index, row in df_fermi.iterrows():
    epeak = row["flnc_comp_epeak"]
    alpha = row["flnc_comp_index"]
    amplitude = row["flnc_comp_ampl"] 
    
    if np.isnan(epeak) or np.isnan(alpha) or np.isnan(amplitude):
        continue
    
    flux = lambda energy: energy * amplitude * ((energy / 100)**alpha) * \
    np.exp((-1*(alpha + 2) * energy / epeak))
    
    fluence3, err = quad(flux, 100, 300)
    fluence1, err = quad(flux, 25, 50)
    
    # hardness
    df_fermi.loc[index, "hardness"] = fluence3/fluence1

Log transform of features

In [15]:
df_fermi["lgT90"] = df_fermi["t90"].apply(np.log10)
df_fermi["lgHrd"] = df_fermi["hardness"].apply(np.log10)

Save dataset

In [16]:
# save FERMI catalog
path_data_fermi = os.path.join(env.DIR_DATASETS, "fermi.xlsx")
df_fermi.to_excel(path_data_fermi, sheet_name="data", freeze_panes=(1,0), engine="xlsxwriter")

## SWIFT

Load SWIFT Catalog

In [17]:
# load fermi catalog
path_cat_swift = os.path.join(env.DIR_CATALOGS, "swift_catalog.xlsx")
df_swift = pd.read_excel(path_cat_swift)

# show random rows
df_swift.sample(5)

Unnamed: 0,name,z,z_comment,t90,t1s_best_model,t1s_pl_alpha,t1s_pl_norm,t1s_cpl_alpha,t1s_cpl_norm,t1s_cpl_epeak,...,flux,model_used,t100s_best_model,t100s_pl_fluence_25_50_kev,t100s_pl_fluence_100_150_kev,t100s_cpl_fluence_25_50_kev,t100s_cpl_fluence_100_150_kev,t100s_pl_hardness,t100s_cpl_hardness,hardness_ratio
1225,GRB080310,2.42743,,363.212006,PL,-1.91347,0.009161,-0.718262,0.045839,43.813999,...,8.423367e-08,P,PL,7.334445e-07,3.271653e-07,7.774766e-07,3.275271e-07,0.446067,0.421269,0.446067
135,GRB200901A,,,20.351999,CPL,-1.6938,0.057759,-0.977827,0.131943,73.811096,...,,,PL,7.877328e-07,6.636499e-07,6.344814e-07,4.202221e-07,0.842481,0.662308,0.842481
1290,GRB070529,2.4996,,108.900002,PL,-1.4869,0.01172,-0.236899,0.047629,73.667503,...,1.114228e-07,P,PL,5.711871e-07,7.312205e-07,5.748815e-07,7.156132e-07,1.280177,1.244801,1.280177
473,GRB160417A,,,14.552,PL,-1.49949,0.00772,-0.595282,0.021955,77.389503,...,,,PL,1.75377e-07,1.124663e-07,1.668755e-07,1.106248e-07,0.641283,0.662918,0.641283
1282,GRB070704,,,384.936005,PL,-0.868598,0.017423,-0.474669,0.025804,229.442993,...,,,PL,1.563761e-06,1.417393e-06,1.568737e-06,1.396498e-06,0.9064,0.890205,0.9064


Set EE flag

In [18]:
# add column for EE flag
df_swift["ee"] = False

# get name of FERMI GRBs with EE
swift_grbs_with_ee = df_ee.loc[df_ee["catalog"] == "SWIFT", 'name'].values

# set EE flag to True for SWIFT GRBs with EE
df_swift.loc[df_swift['name'].isin(swift_grbs_with_ee), "ee"] = True

# how many SWIFT GRBs with EE?
df_swift["ee"].value_counts()

False    1509
True       16
Name: ee, dtype: int64

In [19]:
df_swift

Unnamed: 0,name,z,z_comment,t90,t1s_best_model,t1s_pl_alpha,t1s_pl_norm,t1s_cpl_alpha,t1s_cpl_norm,t1s_cpl_epeak,...,model_used,t100s_best_model,t100s_pl_fluence_25_50_kev,t100s_pl_fluence_100_150_kev,t100s_cpl_fluence_25_50_kev,t100s_cpl_fluence_100_150_kev,t100s_pl_hardness,t100s_cpl_hardness,hardness_ratio,ee
0,GRB220715B,,,40.408001,,-0.78456,0.011339,-0.794540,0.011458,9999.360352,...,,,2.745653e-07,3.548589e-07,2.841552e-07,3.089055e-07,1.292439,1.087101,1.292439,False
1,GRB220714B,,,49.040001,PL,-1.64781,0.029445,-1.448870,0.037503,131.455994,...,,PL,1.304739e-06,8.542476e-07,1.301649e-06,8.842270e-07,0.654727,0.679313,0.654727,False
2,GRB220711B,,,87.056000,,-1.43626,0.024784,0.162297,0.137160,76.114601,...,,PL,1.585870e-06,1.284126e-06,1.624006e-06,1.146457e-06,0.809730,0.705944,0.809730,False
3,GRB220708A,,,4.000000,PL,-1.91564,0.004836,-1.925260,0.004807,9995.040039,...,,PL,3.786866e-08,1.671822e-08,4.478472e-08,4.349708e-09,0.441479,0.097125,0.441479,False
4,GRB220706A,,,85.996002,PL,-1.63229,0.006975,-1.643620,0.006933,9999.360352,...,,PL,3.715433e-07,2.936091e-07,3.685780e-07,3.140360e-07,0.790242,0.852020,0.790242,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1520,GRB041220,,,5.584000,PL,-1.31802,0.014645,-1.106840,0.018479,227.095993,...,,PL,9.797765e-08,8.605123e-08,9.951924e-08,8.014509e-08,0.878274,0.805323,0.878274,False
1521,GRB041219C,,,10.000000,PL,-1.70236,0.017536,-1.275860,0.028918,85.261497,...,,PL,3.452719e-07,2.003278e-07,,,0.580203,,0.580203,False
1522,GRB041219B,,,9.856000,PL,-1.34096,0.011391,-0.633827,0.025028,101.986000,...,,PL,8.485209e-08,4.795263e-08,8.094358e-08,3.807786e-08,0.565132,0.470425,0.565132,False
1523,GRB041219A,,,,,,,,,,...,,,,,,,,,,False


Set 'Magnetar' flag

In [20]:
# first, load magnetar catalog
path_cat_magnetar = os.path.join(env.DIR_CATALOGS, "swift_magnetars.xlsx")
df_magnetar = pd.read_excel(path_cat_magnetar, sheet_name="magnetars")

# get name of magnetars
magnetar_names = df_magnetar["name"].values

# add column for magnetar flag
df_swift["magnetar"] = False

# set magnetar flag to True for SWIFT GRBs with magnetar
df_swift.loc[df_swift['name'].isin(magnetar_names), "magnetar"] = True

# how many SWIFT GRBs with magnetar?
df_swift["magnetar"].value_counts()

False    1513
True       12
Name: magnetar, dtype: int64

Calculate features

In [21]:
# compute intrinsic 
df_swift["t90_intrinsic"] =  df_swift["t90"]/(df_swift["z"]+1)

# take logs
df_swift["lgT90"] = df_swift["t90"].apply(np.log10)
# take log of intrinsic t90
df_swift["lgT90i"] = df_swift["t90_intrinsic"].apply(np.log10)
# take log of hardness
df_swift["lgHrd"] = df_swift["hardness_ratio"].apply(np.log10)
# take log of luminosity
df_swift["lgLum"] = df_swift["lum_kcorr"].apply(np.log10)

Save dataset

In [22]:
# save SWIFT catalog
path_data_swift = os.path.join(env.DIR_DATASETS, "swift.xlsx")
df_swift.to_excel(path_data_swift, sheet_name="data", freeze_panes=(1,0), engine="xlsxwriter")