# Pre-process test data

This notebook walks through the steps needed to preprocess a high S/N test set and a low S/N test set.
* required packages: numpy, h5py, vos
* required data files: apStar_combined_main.h5

In [4]:
import numpy as np
import h5py
import os
import vos

In [6]:
# Directory that downloaded files are copied into and that output files are written to.
datadir = '/home/ubuntu/starnet_data/'  # or "/path/to/my/starnet/directory"

def starnet_download_file(filename):
    """Copy one file from the public StarNet VOSpace area into ``datadir``.

    The remote location is ``vos:starnet/public/<filename>`` and the local
    copy is written to ``datadir + filename``; a confirmation line is printed
    once the copy call returns.

    Parameters
    ----------
    filename : str
        Name of the file to fetch; it is appended verbatim to both the remote
        prefix and ``datadir``.
    """
    client = vos.Client()
    remote_path = 'vos:starnet/public/' + filename
    local_path = datadir + filename
    client.copy(remote_path, local_path)
    print(filename + ' downloaded')

# Fetch the combined APOGEE file, then open the local copy read-only.
main_filename = 'apStar_combined_main.h5'
starnet_download_file(main_filename)
f = h5py.File(datadir + main_filename, 'r')

apStar_combined_main.h5 downloaded


In [3]:
# Show the name of every dataset stored in the opened file.
print("Dataset keys in file: \n")
f.keys()

Dataset keys in file: 

0_H
0_H_ERR
ALPHA_M
AL_H
AL_H_ERR
CA_H
CA_H_ERR
C_H
C_H_ERR
FE_H
FE_H_ERR
IDs
K_H
K_H_ERR
LOGG
LOGG_ERR
MG_H
MG_H_ERR
MN_H
MN_H_ERR
NA_H
NA_H_ERR
NI_H
NI_H_ERR
N_H
N_H_ERR
SI_H
SI_H_ERR
S_H
S_H_ERR
TEFF
TEFF_ERR
TI_H
TI_H_ERR
VRAD
VRAD_ERR
VSCATTER
V_H
V_H_ERR
aspcap_flag
error_spectrum
num_visits
spectrum
stacked_snr
star_flag
targ1_flag
targ2_flag


**Load the data into memory**

To test StarNet we need the spectra, error spectra, combined S/N, and labels. The test set must also be cut down to the stars whose labels are the most reliable to compare against, so we additionally load the quantities used for those cuts. In total we load the APOGEE_IDs, the spectra, the error spectra, the S/N of the combined spectra, $T_{\mathrm{eff}}$, $\log(g)$, [Fe/H], $V_{scatter}$, STARFLAGs, and ASPCAPFLAGs.

In [4]:
# The ID dataset is indexed as a 2-D table; only its first column is kept.
ap_id = f['IDs'][:, 0]

# Spectra and the matching error spectra
spectra, error_spectra = f['spectrum'][:], f['error_spectrum'][:]

# Quantities used later for the quality and S/N cuts
combined_snr = f['stacked_snr'][:]
starflag, aspcapflag = f['star_flag'][:], f['aspcap_flag'][:]

# Stellar labels and the radial-velocity scatter
teff, logg, fe_h = f['TEFF'][:], f['LOGG'][:], f['FE_H'][:]
vscatter = f['VSCATTER'][:]

# Report the number of distinct IDs (a set drops repeated IDs).
print('Obtainined spectra and data for '+str(len(set(ap_id)))+' stars.')

# Everything needed has been read into memory, so release the file handle.
f.close()

Obtainined spectra and data for 142333 stars.


**Normalize the data**

Create a file that contains the mean and standard deviation of $T_{\mathrm{eff}}$, $\log(g)$, and [Fe/H], which are used to normalize the labels during training and testing. Values equal to -9999 are ignored when computing these statistics.

In [5]:
# -9999. marks a missing label; such entries are excluded from the statistics.
missing_value = -9999.
valid_labels = [label[label != missing_value] for label in (teff, logg, fe_h)]

# Column order is Teff, log(g), [Fe/H]; row 0 holds the means, row 1 the
# standard deviations.
mean = np.array([np.mean(values) for values in valid_labels])
std = np.array([np.std(values) for values in valid_labels])

# np.vstack replaces np.row_stack, which is only a deprecated alias of it
# (it emits a DeprecationWarning on NumPy >= 2.0).
mean_and_std = np.vstack((mean, std))

# np.save appends the ".npy" extension to the given name.
np.save(datadir+'mean_and_std', mean_and_std)

print('mean_and_std.npy saved')

mean_and_std.npy saved


**Separate out a dataset with good labels**. 
- STARFLAGs = 0
- ASPCAPFLAGs = 0
- 4000K < $T_{\mathrm{eff}}$ < 5500K
- -3.0 < [Fe/H]
- $\log(g)$ $\neq$ -9999. (value defined by ASPCAP when no ASPCAP labels are given)
- $V_{scatter}$ < 1.0 km/s

In [6]:
# Thresholds for the quality cuts applied in the next cell
teff_min, teff_max = 4000., 5500.   # K
fe_h_min = -3.
vscatter_max = 1.                   # km/s

In [7]:
# Build the selection one named condition at a time.
flags_clean = (aspcapflag == 0.) & (starflag == 0.)
rv_stable = vscatter < vscatter_max
metallicity_ok = fe_h > fe_h_min
teff_in_range = (teff > teff_min) & (teff < teff_max)
# -9999. is the value ASPCAP assigns when no labels are given.
logg_available = (logg != -9999.).reshape(len(ap_id), 1)

# The combined mask is two-dimensional, so np.where returns row and column
# positions; only the row positions are used to select stars.
indices, cols = np.where(flags_clean & rv_stable & metallicity_ok & teff_in_range & logg_available)

ap_id = ap_id[indices]
spectra = spectra[indices]
error_spectra = error_spectra[indices]
teff = teff[indices]
logg = logg[indices]
fe_h = fe_h[indices]
combined_snr = combined_snr[indices]

# Filter the quantities used for the cut as well.  Otherwise they keep their
# original length, fall out of alignment with the arrays above, and make this
# cell fail with a shape mismatch if it is run a second time.
starflag = starflag[indices]
aspcapflag = aspcapflag[indices]
vscatter = vscatter[indices]

print(str(len(set(ap_id)))+' stars remain.')

34484 stars remain.


**Load high S/N APOGEE IDs**

Load a file that contains the APOGEE IDs of the high S/N spectra that will be processed into the high S/N test set.

In [8]:
# The ID list is read from a path relative to the current working directory
# (not from datadir).
high_snr_id_file = 'high_snr_test_apids.npy'
high_snr_test_ap_ids = np.load(high_snr_id_file)

**Separate data for High S/N test set**

In [9]:
# Row positions *within ap_id* whose APOGEE ID appears in the High S/N list.
# The positions must refer to ap_id (not to the ID list), because they are
# used below to slice ap_id and every array aligned with it.  Enumerating the
# ID list instead would pick unrelated rows from the start of the arrays.
# np.isin (NumPy >= 1.13) does the membership test in one vectorized call.
indices = np.where(np.isin(ap_id, high_snr_test_ap_ids))[0]

high_snr_ap_id = ap_id[indices]
high_snr_spectra = spectra[indices]
high_snr_error_spectra = error_spectra[indices]
high_snr_teff = teff[indices]
high_snr_logg = logg[indices]
high_snr_fe_h = fe_h[indices]
high_snr_combined_snr = combined_snr[indices]

print('High S/N test set includes '+str(len(high_snr_ap_id))+' combined spectra')


High S/N test set includes 2780 combined spectra


**Normalize spectra**:
1. separate into three chips
2. divide by median value in each chip
3. recombine each spectrum into a vector of 7214 flux values
4. Error spectra must also be normalized with the same median values for error propagation

In [10]:
# Pixel ranges of the three detector chips; each pair is used as a
# begin:end slice (end excluded) into the full spectrum
blue_chip_begin, blue_chip_end = 322, 3242
green_chip_begin, green_chip_end = 3648, 6048
red_chip_begin, red_chip_end = 6412, 8306

In [11]:
# Cut every spectrum into its three chips.
# Note: high_snr_spectra is replaced at the end of this cell, so the cell is
# meant to be run once on the un-normalized spectra.

blue_sp = high_snr_spectra[:, blue_chip_begin:blue_chip_end]
green_sp = high_snr_spectra[:, green_chip_begin:green_chip_end]
red_sp = high_snr_spectra[:, red_chip_begin:red_chip_end]

# One median per spectrum for each chip; these are reused to scale the
# error spectra

blue_sp_med = np.median(blue_sp, axis=1)
green_sp_med = np.median(green_sp, axis=1)
red_sp_med = np.median(red_sp, axis=1)

# Divide each row by its own median (the added axis lines the medians up
# with the rows)

blue_sp = blue_sp / blue_sp_med[:, np.newaxis]
green_sp = green_sp / green_sp_med[:, np.newaxis]
red_sp = red_sp / red_sp_med[:, np.newaxis]

# Join the chips back into one array, blue then green then red

high_snr_spectra = np.column_stack((blue_sp, green_sp, red_sp))

print('High S/N spectra dataset now contains '+str(high_snr_spectra.shape[0])
      +' spectra, each with '+str(high_snr_spectra.shape[1])+' wavelength bins')

High S/N spectra dataset now contains 2780 spectra, each with 7214 wavelength bins


In [12]:
# Cut every error spectrum into the same three chips

blue_sp = high_snr_error_spectra[:, blue_chip_begin:blue_chip_end]
green_sp = high_snr_error_spectra[:, green_chip_begin:green_chip_end]
red_sp = high_snr_error_spectra[:, red_chip_begin:red_chip_end]

# Scale by the medians of the *flux* chips (not medians of the errors), so
# the errors are divided by the same factors as the spectra

blue_sp = blue_sp / blue_sp_med[:, np.newaxis]
green_sp = green_sp / green_sp_med[:, np.newaxis]
red_sp = red_sp / red_sp_med[:, np.newaxis]

# Join the chips back into one array, blue then green then red

high_snr_error_spectra = np.column_stack((blue_sp, green_sp, red_sp))

print('High S/N error spectra dataset now contains '+str(high_snr_error_spectra.shape[0])
      +' error spectra, each with '+str(high_snr_error_spectra.shape[1])+' wavelength bins')

High S/N error spectra dataset now contains 2780 error spectra, each with 7214 wavelength bins


**Save the new High S/N test data file**

with APOGEE IDs, spectra, error spectra, combined S/N and labels

In [14]:
savename = datadir + 'high_snr_test_data.h5'

# Mode "w" creates the file, or truncates one left over from an earlier run,
# so this cell can be re-run without deleting the file by hand.  (With mode
# "a", create_dataset fails when the datasets already exist.)
with h5py.File(savename, "w") as h5_out:

    # Float datasets: the shape is taken from the array and the values are
    # stored with dtype "f".
    h5_out.create_dataset('spectra', data=high_snr_spectra, dtype="f")
    h5_out.create_dataset('error_spectra', data=high_snr_error_spectra, dtype="f")
    h5_out.create_dataset('TEFF', data=high_snr_teff, dtype="f")
    h5_out.create_dataset('LOGG', data=high_snr_logg, dtype="f")
    h5_out.create_dataset('FE_H', data=high_snr_fe_h, dtype="f")
    h5_out.create_dataset('combined_snr', data=high_snr_combined_snr, dtype="f")

    # IDs are stored as fixed-length 18-byte strings.
    ap_id_ds = h5_out.create_dataset('Ap_IDs', high_snr_ap_id.shape, dtype="S18")
    ap_id_ds[:] = high_snr_ap_id.tolist()

print(savename+' has been saved as the High S/N test set to be used in 5_Test_Model.ipynb')

high_snr_test_data.h5 has been saved as the High S/N test set to be used in 5_Test_Model.ipynb


**Create a low S/N test set**

1. Add a cut to combined S/N < 200
2. Normalize the spectra as before
3. Save the spectra just like above.

In [15]:
# Upper limit on the combined S/N for the low S/N test set
snr_max = 200.0

In [16]:
# Keep the stars whose combined S/N lies below the cut.  The mask is shaped
# as one column per star, so np.where yields row and column positions; only
# the rows are used.
low_snr_mask = (combined_snr < snr_max).reshape(len(ap_id), 1)
indices, cols = np.where(low_snr_mask)

# Identifiers and spectra
low_snr_ap_id = ap_id[indices]
low_snr_spectra = spectra[indices]
low_snr_error_spectra = error_spectra[indices]

# Labels and S/N
low_snr_teff = teff[indices]
low_snr_logg = logg[indices]
low_snr_fe_h = fe_h[indices]
low_snr_combined_snr = combined_snr[indices]

print('Low S/N test set includes '+str(len(low_snr_ap_id))+' combined spectra')

Low S/N test set includes 17506 combined spectra


In [17]:
# Pixel ranges of the three detector chips; each pair is used as a
# begin:end slice (end excluded) into the full spectrum
blue_chip_begin, blue_chip_end = 322, 3242
green_chip_begin, green_chip_end = 3648, 6048
red_chip_begin, red_chip_end = 6412, 8306

In [18]:
# Cut every spectrum into its three chips.
# Note: low_snr_spectra is replaced at the end of this cell, so the cell is
# meant to be run once on the un-normalized spectra.

blue_sp = low_snr_spectra[:, blue_chip_begin:blue_chip_end]
green_sp = low_snr_spectra[:, green_chip_begin:green_chip_end]
red_sp = low_snr_spectra[:, red_chip_begin:red_chip_end]

# One median per spectrum for each chip; these are reused to scale the
# error spectra

blue_sp_med = np.median(blue_sp, axis=1)
green_sp_med = np.median(green_sp, axis=1)
red_sp_med = np.median(red_sp, axis=1)

# Divide each row by its own median (the added axis lines the medians up
# with the rows)

blue_sp = blue_sp / blue_sp_med[:, np.newaxis]
green_sp = green_sp / green_sp_med[:, np.newaxis]
red_sp = red_sp / red_sp_med[:, np.newaxis]

# Join the chips back into one array, blue then green then red

low_snr_spectra = np.column_stack((blue_sp, green_sp, red_sp))
print('Low S/N spectra dataset now contains '+str(low_snr_spectra.shape[0])
      +' spectra, each with '+str(low_snr_spectra.shape[1])+' wavelength bins')

Low S/N spectra dataset now contains 17506 spectra, each with 7214 wavelength bins


In [19]:
# Cut every error spectrum into the same three chips

blue_sp = low_snr_error_spectra[:, blue_chip_begin:blue_chip_end]
green_sp = low_snr_error_spectra[:, green_chip_begin:green_chip_end]
red_sp = low_snr_error_spectra[:, red_chip_begin:red_chip_end]

# Scale by the medians of the *flux* chips (not medians of the errors), so
# the errors are divided by the same factors as the spectra

blue_sp = blue_sp / blue_sp_med[:, np.newaxis]
green_sp = green_sp / green_sp_med[:, np.newaxis]
red_sp = red_sp / red_sp_med[:, np.newaxis]

# Join the chips back into one array, blue then green then red

low_snr_error_spectra = np.column_stack((blue_sp, green_sp, red_sp))

print('Low S/N error spectra dataset now contains '+str(low_snr_error_spectra.shape[0])
      +' error spectra, each with '+str(low_snr_error_spectra.shape[1])+' wavelength bins')

Low S/N error spectra dataset now contains 17506 error spectra, each with 7214 wavelength bins


In [21]:
savename = datadir+'low_snr_test_data.h5'

# Mode "w" creates the file, or truncates one left over from an earlier run,
# so this cell can be re-run without deleting the file by hand.  (With mode
# "a", create_dataset fails when the datasets already exist.)
with h5py.File(savename, "w") as h5_out:

    # Float datasets: the shape is taken from the array and the values are
    # stored with dtype "f".
    h5_out.create_dataset('spectra', data=low_snr_spectra, dtype="f")
    h5_out.create_dataset('error_spectra', data=low_snr_error_spectra, dtype="f")
    h5_out.create_dataset('TEFF', data=low_snr_teff, dtype="f")
    h5_out.create_dataset('LOGG', data=low_snr_logg, dtype="f")
    h5_out.create_dataset('FE_H', data=low_snr_fe_h, dtype="f")
    h5_out.create_dataset('combined_snr', data=low_snr_combined_snr, dtype="f")

    # IDs are stored as fixed-length 18-byte strings.
    ap_id_ds = h5_out.create_dataset('Ap_IDs', low_snr_ap_id.shape, dtype="S18")
    ap_id_ds[:] = low_snr_ap_id.tolist()

print(savename + ' has been saved as the Low S/N test set to be used in 5_Test_Model.ipynb')

low_snr_test_data.h5 has been saved as the Low S/N test set to be used in 5_Test_Model.ipynb
