# DATA PREPROCESSING

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# add grbttools to path
import sys
sys.path.append("../")

In [3]:
import os
import numpy as np
import pandas as pd
from grbtools import env

Load GRBs with Extended Emissions

In [4]:
# load extended emission catalog
path_cat_grb_ee = os.path.join(env.DIR_CATALOGS, "grb_ee.xlsx")
df_ee = pd.read_excel(path_cat_grb_ee, sheet_name="ee")

# show random rows
df_ee.sample(5)

Unnamed: 0,catalog,name,trigger_num
40,SWIFT,GRB080503,
22,FERMI,GRB090131090,
46,SWIFT,GRB110402A,
15,BATSE,GRB 990605,7599.0
23,FERMI,GRB100522157,


How many EE GRBs for each catalog?

In [5]:
df_ee['catalog'].value_counts()

BATSE    19
SWIFT    16
FERMI    14
Name: catalog, dtype: int64

## BATSE

Load BATSE catalog

In [6]:
# load batse catalog
path_cat_batse = os.path.join(env.DIR_CATALOGS, "batse_catalog.xlsx")
df_batse = pd.read_excel(path_cat_batse, sheet_name="batsegrb")

# set trigger_num as index
df_batse.set_index("trigger_num", inplace=True)

# show random rows
df_batse.sample(5)

Unnamed: 0_level_0,name,ra,dec,lii,bii,day_trigger,time,flux_64,flux_256,flux_1024,...,t90,fluence_1,fluence_2,fluence_3,fluence_4,comments_quality,comments_otherobs,comments_general,comments_position,comments_duration
trigger_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7060,GRB 980903,139.91,-41.54,266.04,5.68,11059,51059.149346,3.688,1.3895,0.3676,...,0.146,1.153e-08,1.0782e-08,7.2158e-08,1.2181e-07,,,,,
7899,GRB 991210-,302.63,-45.58,354.23,-32.44,11522,51522.436158,2.0091,0.7682,0.2044,...,,8.0764e-09,3.2943e-09,4.2179e-08,0.0,,,,,
7601,GRB 990610-,62.35,-41.25,245.43,-47.33,11339,51339.517911,2.1264,1.6549,0.5429,...,0.276,6.1151e-09,1.4796e-08,1.1601e-07,1.2283e-07,,,,,
5671,GRB 961116,61.27,60.66,144.56,6.2,10403,50403.683342,,,,...,,,,,,,,,,
2380,4B 930607,163.44,35.64,186.8,63.6,9145,49145.880668,1.198,0.962,0.855,...,81.984,3.7078e-07,6.5001e-07,2.9055e-06,2.0131e-06,,,Occultation step (decay) at ~T+300s in LAD 3 (...,,


Load redshift values for BATSE catalog

In [7]:
# load redshift values for batse catalog
path_cat_batse_redshift = os.path.join(env.DIR_CATALOGS, "batse_redshift.xlsx")
df_batse_redshift = pd.read_excel(path_cat_batse_redshift, sheet_name="redshift")

# set trigger_num as index
df_batse_redshift.set_index("trigger_num", inplace=True)

# show random rows
df_batse_redshift.sample(5)

Unnamed: 0_level_0,p256,t90,V,z,L4pi
trigger_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5450,4.15,172.2,0.0712,10.3,5.69e+53
2061,2.19,174.7,0.0185,1.6,6.229999999999999e+51
4157,2.27,20.0,0.013,0.9,1.91e+51
3057,32.36,34.9,0.0256,0.8,1.84e+52
3015,1.75,26.8,0.0453,7.3,1.25e+53


Set EE flag

In [8]:
# add column for EE flag
df_batse["ee"] = False

# get name of BATSE GRBs with EE
batse_grbs_with_ee = df_ee.loc[df_ee["catalog"] == "BATSE", 'name'].values

# set EE flag to True for BATSE GRBs with EE
df_batse.loc[df_batse['name'].isin(batse_grbs_with_ee), "ee"] = True

# how many BATSE GRBs with EE?
df_batse["ee"].value_counts()


False    2666
True       36
Name: ee, dtype: int64

Set *redshift* and *luminosity* values

In [9]:
# add missing columns 
df_batse["redshift"] = np.nan
df_batse["luminosity"] = np.nan
# fill missing values
for triggerr_num, row in df_batse_redshift.iterrows():
    
    # make sure that GRB is in BATSE catalog
    assert triggerr_num in df_batse.index, "Could not find GRB in BATSE catalog for {}".format(triggerr_num)

    # get t90 from BATSE redshift catalog
    t90_1 = row["t90"]
    # get t90 from original BATSE
    t90_2 = df_batse.loc[triggerr_num, "t90"]
    
    # check if they are close to each other
    assert np.isclose(t90_1, t90_2, rtol=1e-2), "t90 values are not close to each other for {}".format(triggerr_num) 

    # fill redshift
    df_batse.loc[triggerr_num, "redshift"] = row["z"]
    # fill luminosity
    df_batse.loc[triggerr_num, "luminosity"] = row["L4pi"]


Calculate features

In [10]:

# calculate intrinsic t90
df_batse["t90_intrinsic"] = df_batse["t90"]/(df_batse["redshift"]+1)
# take log of intrinsic t90
df_batse["lgT90i"] = df_batse["t90_intrinsic"].apply(np.log10) 
# take log of t90
df_batse["lgT90"] = df_batse["t90"].apply(np.log10)

# calculate hardness
df_batse["hardness"] = df_batse["fluence_3"]/df_batse["fluence_1"]
# take log of hardness
df_batse["lgHrd"] = df_batse["hardness"].apply(np.log10)

# take log of luminosity
df_batse["lgLum"] = df_batse["luminosity"].apply(np.log10)


Save dataset

In [11]:
# save BATSE catalog
path_data_batse = os.path.join(env.DIR_DATASETS, "batse.xlsx")
df_batse.to_excel(path_data_batse, sheet_name="data", freeze_panes=(1,0), engine="xlsxwriter")

## FERMI

Load FERMI catalog

In [12]:
# load fermi catalog
path_cat_fermi = os.path.join(env.DIR_CATALOGS, "fermi_catalog.xlsx")
df_fermi = pd.read_excel(path_cat_fermi, sheet_name="fermigbrst")

# show random rows
df_fermi.sample(5)

Unnamed: 0,name,ra,dec,trigger_time,t90,fluence,flux_1024,flux_1024_time,flux_64,lii,bii,t50,fluence_batse,flux_256,flux_batse_1024,flux_batse_64,flux_batse_256,flnc_comp_ampl,flnc_comp_epeak,flnc_comp_index
87,GRB160424492,319.485,-60.6148,57502.492436,6.592,2.7276e-06,7.3002,0.832,9.6573,334.4563,-41.1356,1.536,2e-06,8.1606,3.5522,4.5438,3.8365,0.011253,204.4114,-1.005932
22,GRB141022087,119.39,-75.17,56952.086577,9.216,8.9994e-05,62.897,6.272,97.2009,287.5767,-22.085,4.608,4e-05,84.1565,33.0719,53.7563,46.1982,0.127996,503.7627,-0.840119
2170,GRB140408553,290.716,-12.5937,56755.552723,7.68,6.571e-07,2.3897,-0.32,5.0282,25.0861,-12.6003,3.072,0.0,3.11,0.7276,1.3932,0.866,0.003567,415485.3,-1.51726
2838,GRB191111547,239.41,-70.42,58798.546653,158.212,8.6363e-06,10.29,6.208,12.4335,317.3897,-12.9736,119.811,5e-06,11.3471,3.3653,4.3972,3.6091,,,
2784,GRB090720276,203.694,-10.335,55032.276485,4.48,2.914e-06,9.7817,0.512,13.5526,320.0546,51.0902,1.597,2e-06,10.7591,4.1331,5.7647,4.6536,0.045486,106.8206,-0.60863


Set EE flag

In [13]:
# add column for EE flag
df_fermi["ee"] = False

# get name of FERMI GRBs with EE
fermi_grbs_with_ee = df_ee.loc[df_ee["catalog"] == "FERMI", 'name'].values

# set EE flag to True for FERMI GRBs with EE
df_fermi.loc[df_fermi['name'].isin(fermi_grbs_with_ee), "ee"] = True

# how many FERMI GRBs with EE?
df_fermi["ee"].value_counts()


False    2859
True       14
Name: ee, dtype: int64

Calculate hardness ratio

In [14]:
from scipy.integrate import quad

# calculate hardness
df_fermi["hardness"] = np.nan
for index, row in df_fermi.iterrows():
    epeak = row["flnc_comp_epeak"]
    alpha = row["flnc_comp_index"]
    amplitude = row["flnc_comp_ampl"] 
    
    if np.isnan(epeak) or np.isnan(alpha) or np.isnan(amplitude):
        continue
    
    flux = lambda energy: energy * amplitude * ((energy / 100)**alpha) * \
    np.exp((-1*(alpha + 2) * energy / epeak))
    
    fluence3, err = quad(flux, 100, 300)
    fluence1, err = quad(flux, 25, 50)
    
    # hardness
    df_fermi.loc[index, "hardness"] = fluence3/fluence1

Log transform of features

In [15]:
df_fermi["lgT90"] = df_fermi["t90"].apply(np.log10)
df_fermi["lgHrd"] = df_fermi["hardness"].apply(np.log10)

Save dataset

In [16]:
# save FERMI catalog
path_data_fermi = os.path.join(env.DIR_DATASETS, "fermi.xlsx")
df_fermi.to_excel(path_data_fermi, sheet_name="data", freeze_panes=(1,0), engine="xlsxwriter")

## SWIFT

Load SWIFT-t1 data and calculate luminosity

In [26]:
from grbtools.swift.luminosity import calculate_luminosity

# load csv data
path_swift_t1 = os.path.join(env.DIR_CATALOGS, "swift_t1s.xlsx")
df_swift_t1 = pd.read_excel(path_swift_t1)

# calculate luminosity
df_swift_t1 = calculate_luminosity(df_swift_t1)

Load SWIFT-t100 data and calculate hardness ratio

In [27]:
from grbtools.swift.hardness import calculate_hardness

# load csv data
path_swift_t100 = os.path.join(env.DIR_CATALOGS, "swift_t100s.xlsx")
df_swift_t100 = pd.read_excel(path_swift_t100)

# calculate luminosity
df_swift_t100 = calculate_hardness(df_swift_t100)

# select only data column
df_swift_t100 = df_swift_t100[["grbname", "hardness_ratio"]]

Merge datasets and create SWIFT catalog

In [28]:
# merge dataframes
df_swift = pd.merge(df_swift_t1, df_swift_t100, on="grbname", how="left")

# rename first column to 'name'
df_swift.rename(columns={'grbname': 'name'}, inplace=True)

# show random rows
df_swift.sample(5)


Unnamed: 0,name,z,z_comment,t90,t1s_best_model,t1s_pl_alpha,t1s_pl_norm,t1s_cpl_alpha,t1s_cpl_norm,t1s_cpl_epeak,lum_kcorr,kcorr,flux,model_used,hardness_ratio
176,GRB180402A,,,0.18,PL,-0.878314,0.015836,-0.76844,0.017592,681.428,,,,,3.594927
1072,GRB080430,0.767,,13.872,PL,-1.75064,0.019695,-1.64516,0.022479,151.587,4.290552e+50,0.867659,1.817248e-07,PL,0.813655
779,GRB110709B,,,810.472,PL,-1.26216,0.03196,-0.945692,0.045509,175.264,,,,,1.24334
562,GRB131103A,0.5955,,15.208,PL,-1.37038,0.012904,-0.209332,0.052205,74.815,1.3636240000000002e+50,0.745164,1.25464e-07,PL,0.755719
784,GRB110530A,,,55.728,PL,-1.25195,0.003652,-1.24704,0.003668,9999.36,,,,,0.688256


Set EE flag

In [20]:
# add column for EE flag
df_swift["ee"] = False

# get name of FERMI GRBs with EE
swift_grbs_with_ee = df_ee.loc[df_ee["catalog"] == "SWIFT", 'name'].values

# set EE flag to True for SWIFT GRBs with EE
df_swift.loc[df_swift['name'].isin(swift_grbs_with_ee), "ee"] = True

# how many SWIFT GRBs with EE?
df_swift["ee"].value_counts()

False    1372
True       16
Name: ee, dtype: int64

Set 'Magnetar' flag

In [21]:
# first, load magnetar catalog
path_cat_magnetar = os.path.join(env.DIR_CATALOGS, "swift_magnetars.xlsx")
df_magnetar = pd.read_excel(path_cat_magnetar, sheet_name="magnetars")

# get name of magnetars
magnetar_names = df_magnetar["name"].values

# add column for magnetar flag
df_swift["magnetar"] = False

# set magnetar flag to True for SWIFT GRBs with magnetar
df_swift.loc[df_swift['name'].isin(magnetar_names), "magnetar"] = True

# how many SWIFT GRBs with magnetar?
df_swift["magnetar"].value_counts()

False    1376
True       12
Name: magnetar, dtype: int64

Calculate features

In [22]:
# compute intrinsic 
df_swift["t90_intrinsic"] =  df_swift["t90"]/(df_swift["z"]+1)

# take logs
df_swift["lgT90"] = df_swift["t90"].apply(np.log10)
# take log of intrinsic t90
df_swift["lgT90i"] = df_swift["t90_intrinsic"].apply(np.log10)
# take log of hardness
df_swift["lgHrd"] = df_swift["hardness_ratio"].apply(np.log10)
# take log of luminosity
df_swift["lgLum"] = df_swift["lum_kcorr"].apply(np.log10)

Save dataset

In [23]:
# save SWIFT catalog
path_data_swift = os.path.join(env.DIR_DATASETS, "swift.xlsx")
df_swift.to_excel(path_data_swift, sheet_name="data", freeze_panes=(1,0), engine="xlsxwriter")