## Scrape GCN circulars

**Caden Gobat, The George Washington University**

This notebook scrapes [GCN Circular announcements](https://gcn.gsfc.nasa.gov/) to flag potentially short GRBs for further analysis.

In [1]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
import numpy as np
# Phrases that mark a burst as a potential short GRB. All entries are lowercase
# because they are matched against the lowercased text of the circulars.
keywords = [
    "short burst",
    "short-burst",
    "short-hard",
    "short/hard",
    "short hard",
    "short grb",
    "short gamma",
]

In [2]:
# Walk the per-year GCN index pages, record every GRB identifier found there in
# `grbs`, and add to `short_flagged` those whose circular archive contains any
# of the phrases in `keywords`.
grbs = []
short_flagged = []
for y in range(2004,2020):
    year = str(y)
    # each year has a main page that lists all GCN events
    main_page = requests.get(f"https://gcn.gsfc.nasa.gov/selected_{year}.html", timeout=30)
    soup = bs(main_page.text, 'html.parser')
    events = soup.find_all('b') # event titles are looked up via their <b> tags
    for event in events:
        if "GRB " not in event.text: # not all events are GRBs; skip the others
            continue
        GRB_ID = event.text.split()[1].replace(":","")
        grbs.append(GRB_ID)
        target_url = f"https://gcn.gsfc.nasa.gov/other/{GRB_ID}.gcn3"
        try:
            # A timeout keeps one stalled connection from hanging the whole scrape.
            response = requests.get(target_url, timeout=30)
            # requests does not raise on 4xx/5xx by itself; without this check an
            # error page would be keyword-searched as if it were the circulars.
            response.raise_for_status()
        except requests.RequestException:
            # Only network/HTTP failures are reported here, so Ctrl-C and genuine
            # programming errors are no longer silently swallowed.
            print("Couldn't get circulars for GRB",GRB_ID)
            continue
        circulars = response.text # the text of the GCN page for that particular GRB
        circulars_lower = circulars.lower() # lowercase once instead of once per keyword
        if any(search_term in circulars_lower for search_term in keywords): # if any keywords are present,
            short_flagged.append(GRB_ID) # treat this as a candidate short GRB and flag it as such

    print("Finished "+year+".")

Finished 2004.
Finished 2005.
Finished 2006.
Finished 2007.
Finished 2008.
Finished 2009.
Finished 2010.
Finished 2011.
Finished 2012.
Finished 2013.
Finished 2014.
Finished 2015.
Finished 2016.
Finished 2017.
Finished 2018.
Finished 2019.


For some reason, events since 2020 have not been compiled into their own pages, so we have to do 2020 and 2021 separately. I copied and pasted a list of events into [GCN_events_2020-2021.csv](./products/GCN_events_2020-2021.csv), which I use below to repeat the procedure from above.

In [4]:
# Read the hand-compiled event list (no header row) and flatten it to a 1-D array.
event_table = pd.read_csv("./products/GCN_events_2020-2021.csv", header=None)
new_ones = event_table.values.flatten() # events in 2020 and 2021

# Keep only the entries that name a GRB, then echo the part before the first colon.
recent_grbs = [item for item in new_ones if "GRB " in item]
for entry in recent_grbs:
    print(entry.split(":")[0])

GRB 211231A
GRB 211229C
GRB 211229B
GRB 211229A
GRB 211227A
GRB 211226A
GRB 211225C
GRB 211225B
GRB 211225A
GRB 211224A
GRB 211223C
GRB 211223B
GRB 211223A
GRB 211222A
GRB 211221A
GRB 211219B
GRB 211219A
GRB 211218A
GRB 211217A
GRB 211216B
GRB 211216A
GRB 211212B
GRB 211212A
GRB 211211B
GRB 211211A
GRB 211207A
GRB 211206B
GRB 211206A
GRB 211204C
GRB 211204B
GRB 211204A
GRB 211203A
GRB 211201A
GRB 211130A
GRB 211129A
GRB 211124A
GRB 211120B
GRB 211120A
GRB 211118A
GRB 211115A
GRB 211112A
GRB 211110A
GRB 211109C
GRB 211109B
GRB 211109A
GRB 211107B
GRB 211107A
GRB 211106A
GRB 211105A
GRB 211104A
GRB 211102B
GRB 211102A
GRB 211031A
GRB 212102A
GRB 211027A
GRB 211025A
GRB 211024B
GRB 211024A
GRB 211023B
GRB 211023A
GRB 211022A
GRB 211021A
GRB 211018A
GRB 211019A
GRB 211017A
GRB 211016A
GRB 211010A
GRB 211008A
GRB 210928A
GRB 210927B
GRB 211001A
GRB 210930A
GRB 210929B
GRB 210929A
GRB 210925B
GRB 210926A
GRB 210927A
GRB 210925A
GRB 210924A
GRB 210923B
GRB 210923A
GRB 210919B
GRB 210919A
GRB 

In [5]:
# now repeat the process from the previous years: record each recent GRB in
# `grbs` and flag it in `short_flagged` if its circulars contain any keyword
for event in recent_grbs:
    GRB_ID = event.split()[1].replace(":","")
    grbs.append(GRB_ID)
    target_url = f"https://gcn.gsfc.nasa.gov/other/{GRB_ID}.gcn3"
    try:
        # A timeout keeps one stalled connection from hanging the whole loop.
        response = requests.get(target_url, timeout=30)
        # requests does not raise on 4xx/5xx by itself; without this check an
        # error page would be keyword-searched as if it were the circulars.
        response.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are reported here, so Ctrl-C and genuine
        # programming errors are no longer silently swallowed.
        print("Couldn't get circulars for GRB",GRB_ID)
        continue
    circulars = response.text
    circulars_lower = circulars.lower() # lowercase once instead of once per keyword
    if any(search_term in circulars_lower for search_term in keywords):
        short_flagged.append(GRB_ID)

Everything below this point deals with cross-referencing the candidate list against events that are already in my sample for some other reason. It is not directly relevant to the GCN-scraping part of this notebook.

In [6]:
# Load the three reference tables and pull out the "GRB" column of each.
xrt_table = pd.read_csv("./products/all_XRT_observations.csv")
optical_table = pd.read_excel("./data/newData.xlsx")
catalog_table = pd.read_csv("./products/Swift_sGRB_catalog.csv")

XRT_obs = xrt_table["GRB"].tolist()
already_opt = optical_table["GRB"].dropna().tolist() # blank GRB cells are dropped
current_sample = catalog_table["GRB"].tolist()

# Candidates are flagged GRBs that appear in BOTH the XRT list and the catalog.
candidates = np.intersect1d(np.intersect1d(short_flagged, XRT_obs), current_sample)
# ['050202', '050509B', '050709', '050724', '050724A', '050813', '050906', '050925', '051105A', '051114', '051210', '051221A', '060121', '060313', '060502B', '060801', '061006', '061201', '061210', '061217', '070209', '070406', '070429B', '070707', '070714A', '070714B', '070724A', '070729', '070809', '070810B', '070923', '071112B', '071227', '080121', '080123', '080426', '080503', '080702A', '080905A', '080913', '080919', '081024A', '081024B', '081101', '081211B', '081226A', '081226B', '090305', '090305A', '090417A', '090426', '090510', '090515', '090531B', '090607', '090621B', '090715A', '090815C', '090916', '091109B', '091117', '100117A', '100206A', '100213A', '100216A', '100625A', '100628A', '100702A', '100724A', '100816A', '101129A', '101219A', '101224A', '110112A', '110112B', '110402A', '110420B', '111020A', '111117A', '111126A', '120229A', '120305A', '120403A', '120521A', '120630A', '120804A', '120817B', '121226A', '130313A', '130515A', '130603B', '130626A', '130716A', '130822A', '130912A', '131004A', '131125A', '131126A', '140129B', '140320A', '140402A', '140414A', '140516A', '140606A', '140611A', '140619B', '140622A', '140903A', '140930B', '141205A', '141212A', '150101A', '150101B', '150120A', '150301A', '150423A', '150424A', '150710A', '150728A', '150831A', '151127A', '151205B', '151228A', '151229A', '160303A', '160408A', '160410A', '160411A', '160525B', '160601A', '160612A', '160624A', '160709A', '160714A', '160726A', '160821B', '160927A', '161001A', '161104A', '170112A', '170127B', '170325A', '170428A', '170524A', '170728A', '170728B', '171103A', '180204A', '180402A', '180418A', '180715A', '180718A', '180727A', '180805A', '180805B', '181123B', '181126A', '190326A', '190427A', '190610A', '190627A', '191031D', '200219A', '200325A', '200405B', '200411A', '200522A', '200623A', '200716C', '200907B', '201006A', '201214B', '201221D', '210119A', '210323A', '210413B'])

In [8]:
# Remove candidates that are already listed in `already_opt`.
not_already = np.setdiff1d(candidates, already_opt)

# manually verified as NOT short GRBs
ruled_out = [
    "050603", "050815", "051227", "060717", "061021",
    "070208", "080426", "100724A", "110715A", "131002A",
    "140129B", "140209A", "160228A", "191031C", "201221A",
]

# Whatever survives both filters still needs to be looked at.
to_do = np.setdiff1d(not_already, ruled_out)
print(to_do.size)
to_do

28


array(['131004A', '140611A', '160303A', '160410A', '160425A', '161001A',
       '161129A', '171007A', '171211A', '180805B', '200409A', '200512A',
       '200517A', '200729A', '200917A', '201015A', '210217A', '210618A',
       '210704A', '210708A', '210725B', '210726A', '210919A', '211023B',
       '211106A', '211207A', '211221A', '211227A'], dtype='<U7')

In [None]:
# For each remaining candidate, fetch its circular archive and print the text
# that follows the first separator line, so it can be inspected by eye.
for grb in to_do:
    target_url = f"https://gcn.gsfc.nasa.gov/other/{grb}.gcn3"
    try:
        # A timeout keeps one stalled connection from hanging the whole loop.
        response = requests.get(target_url, timeout=30)
        # requests does not raise on 4xx/5xx by itself; check the status explicitly.
        response.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures land here; Ctrl-C and programming errors
        # are no longer silently swallowed.
        print("Couldn't get circulars for GRB",grb)
    else:
        circulars = response.text
        sections = circulars.split("////////////////////////////////////////////////////////////////////////")
        if len(sections) > 1:
            print(sections[1])
        else:
            # No separator in the page: report it instead of raising IndexError.
            print("Couldn't get circulars for GRB",grb)
    print("\n")

In [24]:
# Load the classification table; later cells read its "GRB" and "status" columns.
classifications = pd.read_csv(filepath_or_buffer="./products/new2.csv")
classifications

Unnamed: 0,GRB,status,optical
0,150906B,real,no
1,150922A,real,no
2,151221A,possible,no
3,160111A,real,no
4,160219A,real,no
...,...,...,...
137,210528A,possible,yes
138,210529A,real,limit
139,210529B,real,yes
140,210601A,real,limit


In [19]:
# GRBs whose status in the classification table is "real".
real_status_grbs = classifications.loc[classifications["status"]=="real", "GRB"].tolist()

# Hand-entered GRB names to merge in (duplicates are removed by np.unique below).
hand_entered_grbs = [
    '150424A', '150922A', '151221A', '170219A', '170708A', '170817A', '170825A',
    '180418A', '181121A', '181225A', '190121A', '190606A', '190831B', '050509B',
    '050724', '051103', '051105', '051114', '051211A', '051221', '060427B',
    '060429', '060502B', '070124', '070201', '070429B', '070714B', '070810B',
    '071112B', '080905', '080913', '081211B', '081216', '081223', '090531B',
    '090715A', '090916A', '090927A', '091117A', '091126B', '100213A', '100216A',
    '100816A', '110402A', '110802A', '120811B', '120830A', '140428B', '140604A',
    '150118C', '150906B', '150922A', '160111A', '160219A', '160406A', '160620A',
    '160709A', '160820A', '160822A', '160825A', '170127C', '170206A', '170219A',
    '170222A', '170403A', '170616A', '170708A', '170728B', '170805B', '170816A',
    '170817A', '170825A', '171103A', '180317A', '180529A', '181121A', '181225A',
    '190121A', '190206A', '190606A', '190831B', '200128B', '200219A', '200325A',
    '200327A', '200405B', '200521A', '200623B', '200716C', '200805A', '200815A',
    '200907A', '201214B', '201227A', '210205B', '210307B', '210424B', '210425A',
    '210506A',
]

# Sorted, de-duplicated union, printed as a plain Python list.
merged_unique = np.unique(real_status_grbs + hand_entered_grbs)
print(merged_unique.tolist())

['050509B', '050724', '051103', '051105', '051114', '051211A', '051221', '060427B', '060429', '060502B', '070124', '070201', '070429B', '070714B', '070810B', '071112B', '080905', '080913', '081211B', '081216', '081223', '090531B', '090715A', '090916A', '090927A', '091117A', '091126B', '100213A', '100216A', '100816A', '110402A', '110802A', '120811B', '120830A', '140428B', '140604A', '150118C', '150424A', '150906B', '150922A', '151221A', '160111A', '160219A', '160406A', '160620A', '160709A', '160820A', '160822A', '160825A', '160829A', '161004A', '170127C', '170206A', '170219A', '170220A', '170222A', '170403A', '170616A', '170708A', '170728B', '170805A', '170805B', '170816A', '170817A', '170822A', '170825A', '170826A', '170827A', '170827B', '170921B', '171030A', '171103A', '171106A', '171223A', '180317A', '180418A', '180529A', '180618A', '180626C', '180715B', '180716A', '180728B', '180824A', '181121A', '181126B', '181222B', '181225A', '190121A', '190206A', '190331C', '190606A', '190626B',

In [20]:
# Hard-coded list of GRB names, stored in ascending order.
valid = sorted(['050509B', '050724', '051103', '051105', '051114', '051211A', '051221', '060427B', '060429', '060502B', '070124', '070201', '070429B', '070714B', '070810B', '071112B', '080905', '080913', '081211B', '081216', '081223', '090531B', '090715A', '090916A', '090927A', '091117A', '091126B', '100213A', '100216A', '100816A', '110402A', '110802A', '120811B', '120830A', '140428B', '140604A', '150118C', '150424A', '150906B', '150922A', '151221A', '160111A', '160219A', '160406A', '160620A', '160709A', '160820A', '160822A', '160825A', '160829A', '161004A', '170127C', '170206A', '170219A', '170220A', '170222A', '170403A', '170616A', '170708A', '170728B', '170805A', '170805B', '170816A', '170817A', '170822A', '170825A', '170826A', '170827A', '170827B', '170921B', '171030A', '171103A', '171106A', '171223A', '180317A', '180418A', '180529A', '180618A', '180626C', '180715B', '180716A', '180728B', '180824A', '181121A', '181126B', '181222B', '181225A', '190121A', '190206A', '190331C', '190606A', '190626B', '190630A', '190719C', '190724A', '190810A', '190813A', '190830B', '190831B', '190903A', '190913A', '191017C', '191101B', '191116A', '191203A', '191221A', '200103A', '200128A', '200128B', '200129A', '200212A', '200219A', '200221A', '200224C', '200306B', '200307A', '200308A', '200313B', '200325A', '200327A', '200401A', '200405B', '200420A', '200423A', '200501A', '200506B', '200509B', '200514B', '200521A', '200605A', '200623B', '200626A', '200703A', '200706A', '200710A', '200714B', '200716C', '200718A', '200805A', '200815A', '200817A', '200824A', '200826A', '200903C', '200907A', '200908A', '200916B', '200920A', '200920B', '200923A', '200928A', '201103A', '201108A', '201109A', '201111A', '201130A', '201214B', '201221B', '201222A', '201227A', '210124B', '210205B', '210307B', '210326A', '210410A', '210421C', '210424B', '210425A', '210506A', '210510A', '210529A', '210529B', '210601A', '210605A'])