In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
import re
from tqdm import tqdm
import os
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

from recon_files import Recon

In [2]:
def scrape_all_GTO():
    """
    Gets all science targets of all JWST GTO programs that have hit phase 2.
    Takes around 4 minutes on Arjun's mac.
    
    outputs:
        
    """
    URL = 'http://www.stsci.edu/jwst/observing-programs/approved-gto-programs'
    page = requests.get(URL)

    soup = BeautifulSoup(page.content, 'html.parser')

    all_targets = []

    gto_pages = []
    for link in soup.find_all('a'):
        if link.has_attr('href'):
            str_begin = '/jwst/observing-programs/program-information?id='
            if link.attrs['href'][:48] == str_begin:
                gto_page = 'http://www.stsci.edu/' + link.attrs['href'] # give better name
                gto_pages.append(gto_page)

    for gto_page in tqdm(gto_pages, position=0, leave=True):
        ID = gto_page[-4:]
        pdf_link = f'http://www.stsci.edu/jwst/phase2-public/{ID}.pdf'
        urlretrieve(pdf_link, "tmp.pdf")
        text = convert_pdf_to_txt("tmp.pdf")
        start = text.find("Science Target") + len("Science Target")
        end = text.find("ABSTRACT")
        target_table = text[start:end]
        targets = list(set(re.findall(r"""\(\d\)\ (\w+)""", target_table)))
#         targets += list(set(re.findall(r"""\(\d\)(\w+)""", target_table)))
        targets += list(set(re.findall(r"""\(\d\)\ (\w+-\w+)""", target_table)))
        targets += list(set(re.findall(r"""\(\d\)\ (\w+-\w+-\w+)""", target_table))) # for HAT-P-35, for example
        targets += list(set(re.findall(r"""\(\d\)\ (\w+ \w+)""", target_table)))
        all_targets += targets
        os.remove('tmp.pdf')
    return list(set(all_targets))

In [69]:
scrape_all_GTO()

100%|██████████| 120/120 [04:12<00:00,  2.11s/it]


['I-ZW-18',
 'J004629',
 'CHARIKLO',
 'HAT-P-1',
 'MRK231',
 '2003AZ84',
 '2MJ0443',
 'HFLS3',
 'HR8799',
 'MARS',
 'HD-15407',
 'I-ZW',
 '18660-GS-3',
 'B1A',
 'ARP220-BACKGROUND',
 'SCULPTOR-F1',
 '17264-COS',
 'J11193254-1137466',
 'URANUS-BACKGROUND',
 'SER-SMM',
 'LDN-663',
 'NGC-1514-SHELL1',
 'NGC-7293',
 'NAME-HH',
 'AA-TAU',
 '23170-GS-2',
 'MIRI-GC',
 'G-7-34',
 'SPRITE-TARGET-1',
 'TRAPPIST-1B',
 'W',
 'GRIEKE',
 'HD-30562',
 'WD0806-661B',
 'HH212',
 'GU-PSC-B',
 'MSD-MIRI-SURVEY',
 'DRACO-F3',
 'BRICK',
 'VEGA',
 'ULASJ1342',
 'VV',
 'GN-Z11',
 '10376',
 'J1148',
 'NGC2547',
 '5001',
 'MSD-MIRI',
 '2',
 'SPITZER',
 'GC-NIRSPEC-2',
 'DELSJ0411',
 'LBQS0302-0019',
 'CENA',
 'HH30',
 'AEGIS3_NEW',
 'GN20-NIRSPEC',
 'QUAOAR',
 'HH211-SW',
 'G',
 'HD-18511',
 'HZ2',
 'TWA',
 'HORSEHEAD',
 'MWC-758',
 'LQAC',
 'ALLSTARS',
 'PER',
 'MRK231-NUC',
 'TINYCAT-HST',
 '4891-GS-3',
 'DRACO-F0',
 'HAT-P-26',
 'LAWD',
 'NGC-1514-RINGS3',
 'HD149026',
 '2005RR43',
 'UCAC4',
 'ARP220-BACKGR

In [153]:
test = Recon("HD 219134")
test.scrape_all()
test.webb_approved, test.hst_approved

scraping arxiv: 100%|██████████| 127/127 [00:00<00:00, 187892.98it/s]


Scraping webb MAST
Scraping HST MAST


(False, True)

In [125]:
obs = Observations.query_object("GJ 357", radius=".02 deg")

In [128]:
obs[obs['obs_collection']=='JWST']

intentType,obs_collection,provenance_name,instrument_name,project,filters,wavelength_region,target_name,target_classification,obs_id,s_ra,s_dec,dataproduct_type,proposal_pi,calib_level,t_min,t_max,t_exptime,em_min,em_max,obs_title,t_obs_release,proposal_id,proposal_type,sequence_number,s_region,jpegURL,dataURL,dataRights,mtFlag,srcDen,obsid,distance
str7,str11,str12,str11,str4,str17,str16,str32,str11,str65,float64,float64,str10,str20,int64,float64,float64,float64,float64,float64,str95,float64,str10,str4,int64,str166,str172,str186,str6,bool,float64,str11,float64
SCIENCE,JWST,APT,NIRISS,JWST,;GR700XD,INFRARED,GJ-357,--,jw01201201001_xx102_00001_niriss,144.0074645833333,-21.66506388888888,spectrum,"Lafreniere, David",-1,,,10144.548,,,NIRISS Exploration of the Atmospheric diversity of Transiting exoplanets (NEAT),,1201,GTO,--,POLYGON ICRS 144.01105865126212 -21.70115867639322 144.00199669888482 -21.663984163423095 144.0080935315118 -21.662728515167714 144.0171563291471 -21.699903058429072,--,--,--,--,,10000003663,7.700469138581743
