# WD Database for Python

The goal is to download the following database: http://vizier.u-strasbg.fr/viz-bin/VizieR?-source=J%2FMNRAS%2F455%2F3413 for use in a machine-learning-inspired scheme.

In [1]:
#Preamble. Standard packages to load
import astropy
from astropy.table import Table, Column, MaskedColumn, vstack 
import numpy as np
from astroquery.vizier import Vizier
import matplotlib.pyplot as plt
import urllib2
# special IPython command to prepare the notebook for matplotlib
%matplotlib inline 
from astroquery.sdss import SDSS
from astropy import units as u
from astropy import coordinates as coords
from astropy.io import fits
import astropy.io.ascii as ascii
import os
import random



In [2]:
# Look up the three white dwarf catalogs on VizieR, in the same order as the
# numbered result variables.
catalog_queries = (
    'New white dwarf SDSS DR12',
    'J/ApJS/204/5',
    'J/MNRAS/446/4078',
)
catalog_list_1, catalog_list_2, catalog_list_3 = map(Vizier.find_catalogs,
                                                     catalog_queries)

In [3]:
# Print an identifier -> description dictionary for each search result.
for found in (catalog_list_1, catalog_list_2, catalog_list_3):
    descriptions = {}
    for key, entry in found.items():
        descriptions[key] = entry.description
    print(descriptions)

{u'J/MNRAS/455/3413': u'New white dwarf and subdwarf stars in SDSS DR12 (Kepler+, 2016)'}
{u'J/ApJS/204/5': u'SDSS DR7 white dwarf catalog (Kleinman+, 2013)'}
{u'J/MNRAS/446/4078': u'New white dwarf stars in SDSS DR10 (Kepler+, 2015)'}


In [4]:
# A row limit of -1 asks VizieR for complete tables instead of a truncated
# preview.
Vizier.ROW_LIMIT = -1

# Hand astroquery real lists of catalog identifiers: on Python 3 ``keys()`` is
# a view object rather than a list, while on Python 2 ``list()`` is a no-op copy.
catalogs_1 = Vizier.get_catalogs(list(catalog_list_1.keys()))
catalogs_2 = Vizier.get_catalogs(list(catalog_list_2.keys()))
catalogs_3 = Vizier.get_catalogs(list(catalog_list_3.keys()))



In [5]:
# Summarise the tables contained in each downloaded catalog.
for table_list in (catalogs_1, catalogs_2, catalogs_3):
    print(table_list)

TableList with 1 tables:
	'0:J/MNRAS/455/3413/table6' with 34 column(s) and 6647 row(s) 
TableList with 1 tables:
	'0:J/ApJS/204/5/table2' with 45 column(s) and 20407 row(s) 
TableList with 1 tables:
	'0:J/MNRAS/446/4078/table6' with 30 column(s) and 9112 row(s) 


In [6]:
# Stack the first table of every catalog into one combined table.
first_tables = [catalogs_1[0], catalogs_2[0], catalogs_3[0]]
catalogs = vstack(first_tables)



In [7]:
# Bare expression: the notebook renders the stacked table as the cell output.
catalogs

_RAJ2000,_DEJ2000,PMF,SDSS,S_N,umag,e_umag,gmag,e_gmag,rmag,e_rmag,imag,e_imag,zmag,e_zmag,E_B-V_,pm,GLON,GLAT,SpType,Teff,e_Teff,logg,e_logg,Teff_3D_,e_Teff_3D_,logg_3D_,e_logg_3D_,Mass,e_Mass,Com,Sp,_RA.icrs,_DE.icrs,Plate,MJD,Fiber,RAJ2000,DEJ2000,SNg,f_umag,f_gmag,f_rmag,f_imag,f_zmag,f_pm,pmPA,Ag,GMT,Atype,log_g_,e_log_g_,chi2,m_Nsp,Nsp,q_Nsp,Type,WDcat,DR7,Sloan,Simbad,PMJ,_RA,_DE
deg,deg,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,mag,mag,mag,mag,mag,mag,mag,mag,mag,mag,mag,0.01 / yr,deg,deg,Unnamed: 19_level_1,K,K,[cm/s2],[cm/s2],K,K,[cm/s2],[cm/s2],Msun,Msun,Unnamed: 30_level_1,Unnamed: 31_level_1,deg,deg,Unnamed: 34_level_1,d,Unnamed: 36_level_1,deg,deg,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,deg,mag,"""datime""",Unnamed: 49_level_1,[cm/s2],[cm/s2],Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,deg,deg
float64,float64,str15,str19,int16,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,str9,int32,int32,float32,float32,int32,int32,float32,float32,float64,float64,str69,str2,float64,float64,int16,int32,int16,float64,float64,float32,int64,int64,int64,int64,int64,uint8,float32,float32,str20,str14,float32,float32,float32,str4,int16,str1,str19,int16,str3,str5,str6,str15,float64,float64
192.84360,-2.67170,0337-51997-0195,125122.06-023737.21,22,18.12,0.02,17.57,0.02,17.55,0.01,17.57,0.02,17.60,0.02,0.021,5.10,302.9,60.2,sdB,29108,463,5.409,0.087,--,--,--,--,--,--,,Sp,192.8436,-2.6717,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--
232.78030,55.01910,0614-53437-0049,153112.62+550139.60,22,17.61,0.01,17.83,0.02,18.32,0.02,18.67,0.02,18.96,0.04,0.018,8.30,88.1,49.9,sdB,29876,374,5.292,0.080,--,--,--,--,--,--,,Sp,232.7803,55.0191,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--
221.71480,58.15350,0790-52346-0634,144657.14+580920.12,46,15.84,0.01,15.67,0.01,15.67,0.02,15.70,0.02,15.76,0.02,0.010,3.90,97.9,53.0,sdB,30893,165,5.033,0.032,--,--,--,--,--,--,,Sp,221.7148,58.1535,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--
117.37760,28.08670,1059-52618-0515,074915.66+280641.01,11,18.91,0.02,19.10,0.03,19.52,0.02,19.86,0.03,20.15,0.11,0.035,4.60,192.5,24.3,sdB,29404,704,5.450,0.142,--,--,--,--,--,--,,Sp,117.3776,28.0867,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--
326.60840,-7.21670,1177-52824-0636,214627.88-071117.59,45,16.16,0.01,15.94,0.03,15.86,0.01,15.86,0.01,15.90,0.02,0.035,7.00,48.6,-41.9,sdB,30503,191,5.019,0.036,--,--,--,--,--,--,,Sp,326.6084,-7.2167,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--
63.90770,25.61750,1254-52972-0163,041536.05+253857.11,41,16.14,0.01,16.27,0.01,16.46,0.01,16.65,0.01,16.83,0.02,0.483,6.10,170.4,-17.9,He-sdO,64329,1429,6.118,0.073,--,--,--,--,--,--,,Sp,63.9077,25.6175,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--
251.22790,21.59420,1569-53168-0370,164453.35+213711.54,20,17.75,0.02,17.82,0.01,18.22,0.01,18.55,0.01,18.81,0.04,0.050,3.00,40.5,37.0,sdB,26349,889,5.114,0.104,--,--,--,--,--,--,,Sp,251.2279,21.5942,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--
244.92500,24.15590,1574-53476-0225,161942.83+240715.70,46,16.79,0.02,16.42,0.02,16.46,0.02,16.48,0.01,16.57,0.02,0.067,11.90,41.5,43.3,sdB,27741,342,5.227,0.048,--,--,--,--,--,--,,Sp,244.9250,24.1559,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--
242.94240,29.00110,1577-53495-0460,161132.90+290038.99,37,16.46,0.02,16.58,0.02,17.00,0.02,17.35,0.01,17.64,0.02,0.037,10.30,47.6,46.1,sdB,28344,511,6.000,0.500,--,--,--,--,--,--,,Sp,242.9424,29.0011,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--,--
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [8]:
#This is a way to add coordinates if we need to. I don't think we need to right now.
#catalogs['Coordinates'] = coords.SkyCoord(catalogs['_RAJ2000'], catalogs['_DEJ2000'], frame='icrs')

In [9]:
#Here we do clean-up trying to merge those columns which were not properly merged
#because they were named different things in different catalogs. These include
#SDSS identifiers, a weird underscore for a log(g) parameter, different ways of
#specifying spectral type, and different ways of calibrating signal to noise.

def _is_masked(value):
    """Return True when *value* is numpy's masked-element placeholder."""
    return isinstance(value, np.ma.core.MaskedConstant)


def _split_spectrum_id(table, id_column):
    """Fill 'Plate', 'MJD' and 'Fiber' from a dash-separated identifier column.

    Every unmasked entry of ``table[id_column]`` is split on '-' and its first
    three pieces are written, in that order, to the 'Plate', 'MJD' and 'Fiber'
    columns of the same row. Rows whose identifier is masked are left alone.
    """
    plate_col = table['Plate']
    mjd_col = table['MJD']
    fiber_col = table['Fiber']
    for row, spec_id in enumerate(table[id_column]):
        if _is_masked(spec_id):
            continue
        parts = spec_id.split('-')
        plate_col[row] = parts[0]
        mjd_col[row] = parts[1]
        fiber_col[row] = parts[2]


def _fill_masked(table, target, source, companions=()):
    """Copy ``table[source]`` into the masked entries of ``table[target]``.

    A row is updated only when its target entry is masked and its source entry
    is not. *companions* is a sequence of ``(target_name, source_name)`` pairs
    that are copied for the same row whenever the main value is copied.
    """
    target_col = table[target]
    for row, value in enumerate(table[source]):
        if not _is_masked(target_col[row]) or _is_masked(value):
            continue
        target_col[row] = value
        for extra_target, extra_source in companions:
            table[extra_target][row] = table[extra_source][row]


# The module-level column handles below are kept so that any later cell that
# refers to them keeps working.
PMF = catalogs['PMF']
_split_spectrum_id(catalogs, 'PMF')

PMJ = catalogs['PMJ']
_split_spectrum_id(catalogs, 'PMJ')

# log g and its uncertainty travel together.
log_g_ah = catalogs['log_g_']
_fill_masked(catalogs, 'logg', 'log_g_', companions=[('e_logg', 'e_log_g_')])

Types = catalogs['SpType']
_fill_masked(catalogs, 'Type', 'SpType')

SN = catalogs['SNg']
_fill_masked(catalogs, 'S_N', 'SNg')

### Let's select a quality sample of WD spectra

In [10]:
# Start from every object whose merged 'Type' entry is exactly 'DA'.
WD = catalogs[catalogs['Type'] == 'DA']
n_initial = len(WD)
print("We start with", n_initial, "WDs")

# Drop rows with a NaN log g (the author notes NaNs only occur in log g).
logg_is_nan = np.isnan(WD['logg'])
good_WD = WD[np.where(~logg_is_nan)]
n_nan = len(WD[logg_is_nan])
print("We removed", n_nan, "systems with NaNs")

# An uncertainty of exactly 0.0 marks a log g that was assumed, not measured.
has_fitted_logg = good_WD['e_logg'] != 0.0
good_WD = good_WD[has_fitted_logg]
print("Number with determined log g", len(good_WD))

# Keep spectra with signal-to-noise above 10.
bright_enough = good_WD['S_N'] > 10
good_WD = good_WD[bright_enough]
print("Number with S/N > 10", len(good_WD))

# Keep objects whose log g uncertainty is below 0.2.
well_constrained = good_WD['e_logg'] < 0.2
good_WD = good_WD[well_constrained]
print("Number with log g error less than 0.2", len(good_WD))

# Optional, currently disabled: the same idea for T_eff, limited to 15% of T_eff.
# good_WD = good_WD[good_WD['e_Teff']<0.15*good_WD['Teff']]
# print("Number with Teff uncertainties less than 15%",len(good_WD))

# Typical (median) uncertainties of the surviving sample.
median_teff_error = np.median(good_WD['e_Teff'])
print("Median T_eff error:", median_teff_error)

median_logg_error = np.median(good_WD['e_logg'])
print("Median log g error:", median_logg_error)

('We start with', 21539, 'WDs')
('We removed', 0, 'systems with NaNs')
('Number with determined log g', 21505)
('Number with S/N > 10', 8746)
('Number with log g error less than 0.2', 8664)
('Median T_eff error:', 161.0)
('Median log g error:', 0.05299999937415123)


In [37]:
def download_data(cat, directory='../data/'):
    """Download the SDSS spectrum of every row of *cat* and record its path.

    A 'file' column is added to *cat* (the table is modified in place). For each
    row the spectrum is looked up by plate, MJD and fiber; when the file is
    already present in *directory* it is reused, otherwise it is downloaded and
    written there. Rows whose spectrum cannot be found or downloaded keep the
    column's initial value and a message is printed.

    Parameters
    ----------
    cat : table
        Must provide 'Plate', 'MJD' and 'Fiber' columns.
    directory : str, optional
        Folder in which the FITS files are stored; created when missing.

    Returns
    -------
    None
    """
    if directory and not os.path.isdir(directory):
        os.makedirs(directory)

    # The previous 32-byte width cut the last character off
    # '../data/spec-PPPP-MMMMM-FFFF.fits' (33 characters), so leave ample room.
    cat['file'] = MaskedColumn(length=len(cat), dtype='S256')

    for ind, plate in enumerate(cat['Plate']):
        # Always read the identifiers from *cat* itself so they belong to the
        # row being processed.
        mjd = cat['MJD'][ind]
        fiber = cat['Fiber'][ind]

        try:
            spec = SDSS.get_spectra_async(plate=plate, mjd=mjd, fiberID=fiber)
            # NOTE(review): relies on the URL being the fifth whitespace-separated
            # token of the async result's string form - confirm against astroquery.
            url_of_interest = str(spec[0]).split()[4]
        except Exception:
            print("No spectra found in database: %s %s %s" % (plate, mjd, fiber))
            # Without a URL there is no filename to check or write for this row.
            continue

        filename = os.path.join(directory, url_of_interest.split('/')[-1])
        if os.path.exists(filename):
            cat['file'][ind] = filename
            continue

        try:
            spec = SDSS.get_spectra(plate=plate, mjd=mjd, fiberID=fiber)
            spec[0].writeto(filename)
        except Exception:
            print("Could not download spectra: %s %s %s" % (plate, mjd, fiber))
            continue
        cat['file'][ind] = filename

In [None]:
# Fetch the spectra of the quality-selected sample; this also adds the 'file'
# column to good_WD.
download_data(good_WD)



No spectra found in database: 3766 55213 262
Downloading http://data.sdss3.org/sas/dr12/boss/spectro/redux/v5_7_0/spectra/4394/spec-4394-55924-0504.fits [Done]
Downloading http://data.sdss3.org/sas/dr12/boss/spectro/redux/v5_7_0/spectra/4410/spec-4410-56187-0154.fits [Done]
Downloading http://data.sdss3.org/sas/dr12/boss/spectro/redux/v5_7_0/spectra/4720/spec-4720-55691-0422.fits [Done]
Downloading http://data.sdss3.org/sas/dr12/boss/spectro/redux/v5_7_0/spectra/4742/spec-4742-55660-0098.fits [Done]
Downloading http://data.sdss3.org/sas/dr12/boss/spectro/redux/v5_7_0/spectra/5042/spec-5042-55856-0164.fits [Done]
Downloading http://data.sdss3.org/sas/dr12/boss/spectro/redux/v5_7_0/spectra/5048/spec-5048-56218-0670.fits [Done]
Downloading http://data.sdss3.org/sas/dr12/boss/spectro/redux/v5_7_0/spectra/5057/spec-5057-56209-0258.fits [Done]
Downloading http://data.sdss3.org/sas/dr12/boss/spectro/redux/v5_7_0/spectra/5058/spec-5058-56208-0140.fits [Done]
Downloading http://data.sdss3.org/s

In [None]:
def _lookup_column(table, names):
    """Return the first column of *table* stored under one of *names*, else None."""
    for name in names:
        try:
            return table[name]
        except (KeyError, ValueError):
            continue
    return None


def get_filename(plate, mjd, fiber, wd):
    """Return the spectrum file recorded in *wd* for one plate/MJD/fiber.

    The table is narrowed by plate, then MJD, then fiber. The catalog's column
    names ('Plate', 'MJD', 'Fiber') are tried first and the lowercase spellings
    are accepted as a fallback.

    Parameters
    ----------
    plate, mjd, fiber : values compared for equality with the table columns
    wd : table with plate, MJD, fiber and 'file' columns

    Returns
    -------
    str
        The 'file' entry of the first matching row, or '' (after printing which
        criterion failed) when a column is missing or nothing matches.
    """
    criteria = (
        (plate, ('Plate', 'plate'), 'No plate number'),
        (mjd, ('MJD', 'mjd'), 'No mjd date'),
        (fiber, ('Fiber', 'fiber'), 'No fiber number'),
    )
    matches = wd
    for wanted, column_names, message in criteria:
        column = _lookup_column(matches, column_names)
        if column is not None:
            matches = matches[column == wanted]
        if column is None or len(matches) == 0:
            print(message)
            return ''
    return str(matches['file'][0])

In [None]:
def plot_spec(plate,mjd,fiber,wd):
    """Plot the spectrum stored for one plate/MJD/fiber combination.

    The file is located with ``get_filename``. Two panels are drawn from the
    'loglam' and 'flux' columns of the file's second HDU: the full spectrum on
    the left and the same curve restricted to 3800-4400 on the x axis on the
    right.

    Parameters
    ----------
    plate, mjd, fiber : identifiers forwarded to ``get_filename``
    wd : table forwarded to ``get_filename``

    Raises
    ------
    IOError
        When no file name is recorded for the requested spectrum.
    """
    filename = get_filename(plate,mjd,fiber,wd)
    if not filename:
        raise IOError("No spectrum file recorded for plate %s, mjd %s, fiber %s"
                      % (plate, mjd, fiber))

    # Close the FITS file as soon as the two arrays have been extracted.
    with fits.open(filename) as fits_spec:
        spectrum = fits_spec[1].data
        wavelength = 10**spectrum['loglam']
        # Copy so the values do not depend on the file staying open.
        flux = np.array(spectrum['flux'])

    fig, ax = plt.subplots(1, 2, figsize=(12,4))
    ax[0].plot(wavelength, flux)
    ax[1].plot(wavelength, flux)
    ax[1].set_xlim(3800, 4400)
    plt.show()

In [None]:
# Plot the spectrum with plate 337, MJD 51997, fiber 195.
# NOTE(review): the 'file' column is added by download_data(good_WD); confirm
# that WD (rather than good_WD) carries it and contains this object.
plot_spec(337,51997,195,WD)

In [None]:
# Histogram of the signal-to-noise values of the DA selection.
signal_to_noise = WD['S_N']
plt.hist(signal_to_noise)
plt.xlabel('Signal to Noise')

### Create training, test, and validation sets

In [None]:
# Randomly shuffle indices
indices = np.arange(len(good_WD))
np.random.shuffle(indices)
good_shuffle_WD = good_WD[indices]

# Determine training, test, and validation sets
validation_WD = good_shuffle_WD[0:300]
test_WD = good_shuffle_WD[300:600]
training_WD = good_shuffle_WD[600:]

### Plot up systems in T_eff and log g space to see where they lie

In [None]:
# Draw each subset with its error bars, log g on x and T_eff on y.
labelled_subsets = (
    (training_WD, 'train'),
    (test_WD, 'test'),
    (validation_WD, 'val'),
)
for subset, subset_label in labelled_subsets:
    plt.errorbar(subset['logg'], subset['Teff'],
                 xerr=subset['e_logg'], yerr=subset['e_Teff'],
                 ls='none', fmt='', capsize=0, label=subset_label)
plt.legend()

plt.ylabel(r'T$_{\rm eff}$')
plt.xlabel(r'Log $g$')

# Logarithmic temperature axis, limited to 5e3 - 1e5.
plt.yscale('log')
plt.ylim(5.0e3, 1.0e5)

plt.show()

Now we will look at just the DAs.

In [None]:
# Distinct 'SpType' entries present in the quality sample.
set(good_WD['SpType'])

In [None]:
# Rows of the quality sample whose 'SpType' entry is exactly 'DA'.
# NOTE(review): the earlier clean-up merges 'SpType' into 'Type' - confirm
# that 'SpType' (not 'Type') is the intended column here.
is_da = good_WD['SpType'] == 'DA'
DA_good = good_WD[is_da]

print("Number of DAs in sample", len(DA_good))

In [None]:
# T_eff (x) against log g (y) for the DA subset, with error bars only.
errorbar_style = dict(ls='none', fmt='', capsize=0)
plt.errorbar(DA_good['Teff'], DA_good['logg'],
             yerr=DA_good['e_logg'], xerr=DA_good['e_Teff'],
             **errorbar_style)

plt.xlabel(r'T$_{\rm eff}$')
plt.ylabel(r'Log $g$')

# Logarithmic temperature axis, limited to 5e3 - 1e5.
plt.xscale('log')
plt.xlim(5.0e3, 1.0e5)

plt.show()

In [None]:
# Write the positional, identifier, S/N, photometric, reddening and proper
# motion columns of the DA subset to a CSV file.
export_columns = [
    '_RAJ2000', '_DEJ2000', 'SDSS', 'S_N',
    'umag', 'e_umag', 'gmag', 'e_gmag', 'rmag', 'e_rmag',
    'imag', 'e_imag', 'zmag', 'e_zmag',
    'E_B-V_', 'pm',
]
ascii.write(DA_good, 'DA_good.csv', format='csv', include_names=export_columns)

In [None]:
# Row numbers of every 'SpType' entry that contains the letter 'A'.
ind = [num for num, thing in enumerate(good_WD['SpType']) if 'A' in thing]

All_A = good_WD[ind]
print("Number of As in sample", len(All_A))

In [None]:
# Bare expression: the notebook renders the selected rows as the cell output.
All_A

In [None]:
# T_eff (x) against log g (y) for every type containing 'A', drawn in red.
errorbar_style = dict(ls='none', fmt='', capsize=0, color='red')
plt.errorbar(All_A['Teff'], All_A['logg'],
             yerr=All_A['e_logg'], xerr=All_A['e_Teff'],
             **errorbar_style)

plt.xlabel(r'T$_{\rm eff}$')
plt.ylabel(r'Log $g$')

# Logarithmic temperature axis, limited to 5e3 - 1e5.
plt.xscale('log')
plt.xlim(5.0e3, 1.0e5)

plt.show()