In [35]:
from bs4 import BeautifulSoup
import urllib
import requests
import pandas as pd
import numpy as np
from IPython.lib import backgroundjobs as bg

In [50]:
def log_progress(sequence, every=None, size=None, name='Items'):
    """Yield the items of ``sequence`` while updating a progress widget.

    Adapted from https://github.com/alexanderkuk/log-progress

    sequence -- iterable to walk through.
    every    -- refresh the widget every ``every`` items. When omitted it is
                1 for up to 200 items and ``int(size / 200)`` otherwise; it
                is mandatory when the length of ``sequence`` is unknown.
    size     -- total number of items; defaults to ``len(sequence)``.
    name     -- prefix used in the text label.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    unsized = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            # No len(): treat the input as an open-ended iterator.
            unsized = True

    if size is None:
        assert every is not None, 'sequence is iterator, set every'
    elif every is None:
        every = 1 if size <= 200 else int(size / 200)     # every 0.5%

    if unsized:
        # Total is unknown, so show a full "info" bar instead of a fraction.
        bar = IntProgress(min=0, max=1, value=1)
        bar.bar_style = 'info'
    else:
        bar = IntProgress(min=0, max=size, value=0)
    caption = HTML()
    display(VBox(children=[caption, bar]))

    count = 0
    try:
        for record in sequence:
            count += 1
            # Always refresh on the first item, then once per `every` items.
            if count == 1 or count % every == 0:
                if unsized:
                    caption.value = '{name}: {index} / ?'.format(
                        name=name, index=count)
                else:
                    bar.value = count
                    caption.value = u'{name}: {index} / {size}'.format(
                        name=name, index=count, size=size)
            yield record
    except:
        # Anything raised while iterating turns the bar red, then propagates.
        bar.bar_style = 'danger'
        raise
    else:
        bar.bar_style = 'success'
        bar.value = count
        caption.value = "{name}: {index}".format(
            name=name, index=str(count or '?'))

def scrape_explorer(catalog, initial_iloc=0, final_iloc=None):
    """Scrape SDSS DR8 Explorer tables for a positional slice of ``catalog``.

    For each selected row the object's Explorer page is requested, the frame
    named ``OETOC`` is followed, and the ``PhotoTag``, ``photoz`` and
    ``galSpecLine`` tables linked from that frame are parsed with
    ``pandas.read_html``.

    Parameters
    ----------
    catalog : pandas.DataFrame
        Each row must unpack as ``(mjd, plate, fiber)``, in that order.
    initial_iloc : int
        Position of the first row to scrape (inclusive).
    final_iloc : int or None
        Position to stop at (exclusive). ``None`` scrapes to the end of the
        catalog.

    Returns
    -------
    tuple
        ``(last_iloc, table_train, itable_error)`` where ``last_iloc`` is the
        position of the last row visited (``None`` if no row was visited),
        ``table_train`` maps each table name to the list of parsed frames and
        ``itable_error`` maps each table name to the row positions for which
        that table could not be retrieved.

    Raises
    ------
    ValueError
        If ``final_iloc`` is given and is not greater than ``initial_iloc``.
    """
    try:  # Python 3
        from urllib.parse import urljoin
    except ImportError:  # Python 2, where urllib.basejoin aliases this function
        from urlparse import urljoin

    # Only validate an explicit upper bound; None means "to the end".
    if final_iloc is not None and not final_iloc > initial_iloc:
        raise ValueError("final_iloc must be greater than initial_iloc")
    base_url = "http://skyserver.sdss.org/dr8/en/tools/explore/"
    para_sum = "obj.asp?plate={}&mjd={}&fiber={}"
    tables = "PhotoTag", "photoz", "galSpecLine"
    table_train = {kw: [] for kw in tables}
    itable_error = {kw: [] for kw in tables}
    # Slice row *positions* (not index labels) because rows are read with
    # .iloc below; slicing also clamps an out-of-range final_iloc.
    positions = range(len(catalog))[initial_iloc:final_iloc]

    def mark_all_failed(position):
        # The object page itself was unusable, so every table is missing.
        for table in tables:
            itable_error[table].append(position)

    i = None  # stays None when the selected slice is empty
    with requests.Session() as session:

        for i in log_progress(positions, every=1, name="# galaxies"):
            try:
                mjd, plate, fiber = catalog.iloc[i]
                summary_url = urljoin(base_url, para_sum.format(plate, mjd, fiber))
                try:
                    sdss_explorer = session.get(summary_url)
                except requests.RequestException:
                    mark_all_failed(i)
                    continue
                soup = BeautifulSoup(sdss_explorer.content, "html.parser")

                frame = soup.select_one("[name=OETOC]")
                if frame is None or not frame.get("src"):
                    # Page came back without the expected frame; record the
                    # failure instead of aborting the whole run.
                    mark_all_failed(i)
                    continue
                frame_url = urljoin(summary_url, frame["src"])
                try:
                    frame_resp = session.get(frame_url)
                except requests.RequestException:
                    mark_all_failed(i)
                    continue
                frame_soup = BeautifulSoup(frame_resp.content, "html.parser")

                for table in tables:
                    try:
                        link = frame_soup.select_one("[href*={}]".format(table))
                        table_url = urljoin(base_url, link.get("href"))
                        table_train[table] += pd.read_html(table_url, flavor="bs4", attrs={"cellpadding": 2})
                    except Exception:
                        # Best effort per table (missing link, no matching
                        # HTML table, download error). KeyboardInterrupt is
                        # not an Exception, so it still reaches the handler
                        # below.
                        itable_error[table].append(i)
            except KeyboardInterrupt:
                # Stop early but still return what has been collected so far.
                break
    return i, table_train, itable_error

def process_row(row):
    """Pivot a parsed table so the values of column ``0`` become column labels.

    Column ``0`` of ``row`` is used as the index and removed from the data;
    the result is then transposed, so every remaining column of ``row``
    becomes one row of the returned DataFrame. The input frame is left
    unmodified.
    """
    # set_index returns a new frame, so the caller's DataFrame keeps its
    # original index (assigning to row.index would alter it in place).
    return row.set_index(0).transpose()

In [3]:
# Read the sample catalog, discard its "row" column, then preview the result.
sdss_sed_catalog = pd.read_csv("../data/sample-catalog.csv")
sdss_sed_catalog = sdss_sed_catalog.drop("row", axis=1)
sdss_sed_catalog.head(n=5)

Unnamed: 0,mjd,plate,fiberID
0,51630,266,2
1,51630,266,4
2,51630,266,11
3,51630,266,13
4,51630,266,14


In [46]:
N_jobs = 5
job_size = 100
# Half-open (start, stop) row ranges, one per background job.
# range() instead of xrange() so the cell runs on Python 2 and Python 3.
jobs_args = [(i * job_size, (i + 1) * job_size) for i in range(N_jobs)]

# Launch one scrape_explorer call per range; IPython starts each job in a
# separate thread.
jobs = bg.BackgroundJobManager()
for ini, fin in jobs_args:
    jobs.new(scrape_explorer, sdss_sed_catalog, ini, fin)

Starting job # 0 in a separate thread.
Starting job # 2 in a separate thread.
Starting job # 3 in a separate thread.
Starting job # 4 in a separate thread.
Starting job # 5 in a separate thread.


A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

In [31]:
# `tables` and `table_train` are not defined anywhere at notebook level
# (`tables` is local to scrape_explorer), so rebuild them here from the
# background jobs instead of relying on leftover kernel state.
tables = "PhotoTag", "photoz", "galSpecLine"

# Merge the per-table frame lists returned by every finished job, in job-number
# order. NOTE: jobs that are still running or that died are not included.
table_train = {table: [] for table in tables}
for job in sorted(jobs.completed, key=lambda finished: finished.num):
    job_tables = job.result[1]  # scrape_explorer returns (i, table_train, itable_error)
    for table in tables:
        table_train[table] += job_tables[table]

# One DataFrame per table: pivot every scraped table into a single row and stack them.
data_frames = {}
for table in tables:
    stacked = pd.concat([process_row(raw) for raw in table_train[table]])
    data_frames[table] = stacked.reset_index(drop=True)

In [32]:
# Preview the first rows of the stacked PhotoTag table.
data_frames["PhotoTag"].head(n=5)

Unnamed: 0,objID,skyVersion,run,rerun,camcol,field,obj,mode,nChild,type,...,cz,extinction_u,extinction_g,extinction_r,extinction_i,extinction_z,htmID,fieldID,specObjID,size
0,1.237651e+18,2.0,1239.0,301.0,2.0,176.0,291.0,1.0,0.0,3.0,...,-0.011383,0.432246,0.318042,0.230671,0.174911,0.124014,10552750000000.0,1.237651e+18,2.9949e+17,2.397614
1,1.237651e+18,2.0,1239.0,301.0,2.0,175.0,231.0,1.0,0.0,3.0,...,-0.013354,0.383948,0.282505,0.204896,0.155367,0.110157,10570140000000.0,1.237651e+18,2.994905e+17,3.661087
2,1.237651e+18,2.0,1239.0,301.0,2.0,177.0,219.0,1.0,0.0,3.0,...,-0.011701,0.434649,0.319811,0.231953,0.175883,0.124703,10552980000000.0,1.237651e+18,2.994924e+17,2.399178
3,1.237651e+18,2.0,1239.0,301.0,2.0,178.0,124.0,1.0,0.0,3.0,...,-0.013252,0.378597,0.278568,0.202041,0.153201,0.108622,10552840000000.0,1.237651e+18,2.99493e+17,2.254911
4,1.237652e+18,2.0,1473.0,301.0,2.0,26.0,184.0,1.0,0.0,3.0,...,-0.010335,0.529002,0.389235,0.282306,0.214064,0.151774,10552990000000.0,1.237652e+18,2.994933e+17,3.385857


In [33]:
# Preview the first rows of the stacked photoz table.
data_frames["photoz"].head(n=5)

Unnamed: 0,objID,z,zErr,nnCount,nnVol,nnIsInside,nnObjID,nnSpecz,nnFarObjID,nnAvgZ,...,kcorrU01,kcorrG01,kcorrR01,kcorrI01,kcorrZ01,absMagU,absMagG,absMagR,absMagI,absMagZ
0,1.237651e+18,0.20926,0.013156,98.0,4e-06,1.0,1.23765e+18,0.198423,1.237653e+18,0.209602,...,0.451902,0.437862,0.0719,-0.048938,0.006186,-19.330799,-21.335501,-22.1014,-22.4765,-22.876101
1,1.237651e+18,0.062211,0.020261,99.0,1e-06,1.0,1.237665e+18,0.059433,1.237668e+18,0.064034,...,-0.201464,-0.224901,-0.164264,-0.143322,-0.133694,-17.542999,-19.299801,-20.053699,-20.4513,-20.8043
2,1.237651e+18,0.13553,0.008466,98.0,0.0,1.0,1.237658e+18,0.121902,1.237651e+18,0.135621,...,0.040522,0.053775,-0.038589,-0.078072,-0.070342,-19.2381,-21.0464,-21.824699,-22.186199,-22.5898
3,1.237651e+18,0.086761,0.025552,99.0,1e-06,1.0,1.237679e+18,0.124682,1.237662e+18,0.07891,...,-0.143679,-0.144754,-0.120678,-0.113365,-0.106596,-17.560801,-19.204201,-19.882401,-20.2029,-20.4639
4,1.237652e+18,0.068979,0.020641,99.0,2e-06,1.0,1.237661e+18,0.076338,1.237674e+18,0.069126,...,-0.160786,-0.163684,-0.127071,-0.122969,-0.10581,-17.402,-18.6476,-19.1364,-19.4496,-19.652


In [34]:
# Preview the first rows of the stacked galSpecLine table.
data_frames["galSpecLine"].head(n=5)

Unnamed: 0,specObjID,sigma_balmer,sigma_balmer_err,sigma_forbidden,sigma_forbidden_err,v_off_balmer,v_off_balmer_err,v_off_forbidden,v_off_forbidden_err,oii_3726_cont,...,oii_flux,oii_flux_err,oii_voff,oii_chi2,oiii_sigma,oiii_flux,oiii_flux_err,oiii_voff,oiii_chi2,spectofiber
0,2.9949e+17,1.0,0.0,1.0,0.0,14.759637,22.297083,53.125938,17.558704,3.792109,...,6.378588,3.439324,-35.159153,1.311002,38.119461,6.280257,2.43767,28.616022,0.980717,0.743768
1,2.994905e+17,98.050514,5.49248,115.732117,8.159125,28.086563,4.634161,20.281176,7.486269,7.589058,...,59.000435,7.624006,46.035427,1.319861,182.796005,27.404892,4.606291,85.032967,0.912974,0.756515
2,2.994924e+17,500.0,0.0,1.0,0.0,300.0,0.0,-13.196837,19.569338,7.650619,...,-6.564786,4.405112,-65.687569,0.399078,1.0,3.393926,2.260247,10.685112,0.367328,0.748085
3,2.99493e+17,1.0,0.0,34.766685,14.21303,-2.143243,6.909191,-25.382641,6.782214,6.48743,...,16.806694,5.098092,-19.457685,0.918896,46.432987,18.564543,2.752133,-27.014681,1.142547,0.737778
4,2.994933e+17,77.683304,2.026539,75.587021,3.761252,-1.080791,1.661206,-0.767384,2.991486,6.926846,...,107.588768,7.85816,11.031679,1.698479,92.667694,32.873699,3.518138,39.648891,1.239122,0.739431
