# Obtain training data and separate out a High S/N Test set

## This notebook takes you through the steps of how to preprocess the training data into the form necessary for training StarNet
## required packages:
### - numpy
### - h5py
## required data files:
### - apStar_visits_main.h5 (can be downloaded in $1\_Download\_Data.ipynb$)

In [2]:
import numpy as np
import h5py
import os

## Load apStar_visits_main.h5, a file that contains individual visit spectra along with APOGEE data associated with each star.  File can be downloaded in $1\_Download\_Data.ipynb$

In [3]:
# Open the visit-level HDF5 file read-only; the handle `f` is reused by the next cells.
filename = "apStar_visits_main.h5"
f = h5py.File(filename, "r")

In [4]:
# List the name of every dataset stored in the file.
# print() is called as a function so the cell runs on both Python 2 and Python 3
# (the bare `print key` statement form is a SyntaxError on Python 3).
print('Dataset keys in file: \n')
for key in f.keys():
    print(key)

Dataset keys in file: 

0_H
0_H_ERR
ALPHA_M
AL_H
AL_H_ERR
CA_H
CA_H_ERR
C_H
C_H_ERR
FE_H
FE_H_ERR
IDs
K_H
K_H_ERR
LOGG
LOGG_ERR
MG_H
MG_H_ERR
MN_H
MN_H_ERR
NA_H
NA_H_ERR
NI_H
NI_H_ERR
N_H
N_H_ERR
SI_H
SI_H_ERR
S_H
S_H_ERR
TEFF
TEFF_ERR
TI_H
TI_H_ERR
VRAD
VRAD_ERR
VSCATTER
V_H
V_H_ERR
aspcap_flag
bluegreen_persist
error_spectrum
greenred_persist
num_visits
spectrum
stacked_snr
star_flag
targ1_flag
targ2_flag
visit_snr


### For the training of StarNet, it is only necessary to obtain the spectra and labels. However, we need to restrict the training set to the labels of highest validity, so we will first load the $APOGEE\_IDs$, the spectra, the $S/N$ of the combined spectra, $T_{\mathrm{eff}}$, $\log(g)$, $[Fe/H]$, $V_{scatter}$, $STARFLAGs$, and $ASPCAPFLAGs$.

In [5]:
# Identifiers and spectra (only the first column of the IDs dataset is kept)
ap_id = f['IDs'][:, 0]
spectra = f['spectrum'][:]

# Quantities used only for the quality cuts
combined_snr = f['stacked_snr'][:]
starflag = f['star_flag'][:]
aspcapflag = f['aspcap_flag'][:]
vscatter = f['VSCATTER'][:]

# Stellar labels
teff = f['TEFF'][:]
logg = f['LOGG'][:]
fe_h = f['FE_H'][:]

# Everything needed has been copied into memory, so release the file handle.
f.close()

n_spectra = len(ap_id)
n_stars = len(set(ap_id))  # distinct IDs
print('Obtained spectra and data for ' + str(n_spectra) + ' from ' + str(n_stars) + ' stars.')


Obtained spectra and data for 559359 from 142333 stars.


# Separate out a dataset with good labels:
## Default restrictions:
### - combined spectral S/N $\geq$ 200
### - STARFLAG = 0
### - ASPCAPFLAG = 0
### - 4000K < $T_{\mathrm{eff}}$ < 5500K
### - -3.0 dex < $[Fe/H]$
### - $\log(g)$ $\neq$ -9999. (value defined by ASPCAP when no ASPCAP labels are given)
### - $V_{scatter}$ < 1.0 km/s

In [15]:
# Thresholds for the default quality cuts listed above
snr_min = 200.0       # minimum combined spectral S/N
teff_min = 4000.0     # lower T_eff bound (K), exclusive
teff_max = 5500.0     # upper T_eff bound (K), exclusive
vscatter_max = 1.0    # maximum V_scatter (km/s), exclusive
fe_h_min = -3.0       # minimum [Fe/H] (dex), exclusive

In [8]:
# Build the quality mask one criterion at a time.
n_visits = len(ap_id)

# Only this term is reshaped to a column: a method call binds tighter than `&`,
# so the reshape never applied to the combined mask.
# NOTE(review): this presumes the other arrays are already (n_visits, 1) -- confirm.
has_logg = (logg != -9999.).reshape(n_visits, 1)

unflagged = (aspcapflag == 0.) & (starflag == 0.)
high_snr = combined_snr >= snr_min
low_scatter = vscatter < vscatter_max
fe_h_ok = fe_h > fe_h_min
teff_ok = (teff > teff_min) & (teff < teff_max)

good = unflagged & high_snr & low_scatter & fe_h_ok & teff_ok & has_logg

# np.where on a 2-D mask returns (row, column) index arrays; rows select the visits.
indices, cols = np.where(good)

# Keep only the visits that pass every cut.
ap_id = ap_id[indices]
spectra = spectra[indices]
teff = teff[indices]
logg = logg[indices]
fe_h = fe_h[indices]

print(str(len(ap_id)) + ' spectra remain from ' + str(len(set(ap_id))) + ' stars.')

64076 spectra remain from 22773 stars.


## Select the first $num\_train$ visits for the reference set (later to be split into training and cross-validation sets)
### default:
#### - num_train = 50000

In [9]:
# Number of leading visits that make up the reference set
num_train = 50000

# The same leading slice is applied to the IDs, the spectra and every label.
reference = slice(0, num_train)
ap_id_train = ap_id[reference]
spectra = spectra[reference]
teff = teff[reference]
logg = logg[reference]
fe_h = fe_h[reference]

n_ref_visits = len(ap_id_train)
n_ref_stars = len(set(ap_id_train))
print('Reference set includes ' + str(n_ref_visits) + ' individual visit spectra from ' + str(n_ref_stars) + ' stars.')


Reference set includes 50000 individual visit spectra from 17841 stars


# Separate a test set of $APOGEE\_IDs$ to be processed in $3\_Preprocessing\_of\_Test\_Data.ipynb$ as StarNet's $High\_S/N\_Test\_Set$
## make sure there are no duplicates from test set that are also in training set (this is necessary because there are some duplicates in the APOGEE v603.fits file)

In [10]:
# Every visit after the first num_train belongs to a candidate test star.
# Slice with num_train (not a hard-coded 50000) so the test split always starts
# exactly where the reference set ends, whatever value num_train is given.
ap_id_test = ap_id[num_train:]

# Drop any star that also contributes visits to the reference set, and sort the
# result so the saved ID list is identical from run to run (set order is arbitrary).
ap_id_test = sorted(set(ap_id_test) - set(ap_id_train))
print(str(len(ap_id_test))+' stars to be processed for the High S/N test set.')

4932 stars to be processed for the High S/N test set


## save APOGEE IDs for High S/N Test set

In [11]:
# Write the test-set IDs to disk in NumPy's .npy format.
test_ids_path = 'high_snr_test_apids'
np.save(test_ids_path, ap_id_test)
print('APOGEE IDs for the high S/N test set are saved to be used in '
      '3_Preprocessing_of_Test_Data.ipynb')

APOGEE IDs for the high S/N test set are saved to be used in 3_Preprocessing_of_Test_Data.ipynb


# Normalize spectra:
### 1. separate into three chips
### 2. divide by median value in each chip
### 3. recombine each spectrum into a vector of 7214 flux values

In [12]:
# Define edges of detectors
# Pixel index range of each detector chip, used as [begin:end) slices
blue_chip_begin, blue_chip_end = 322, 3242
green_chip_begin, green_chip_end = 3648, 6048
red_chip_begin, red_chip_end = 6412, 8306

In [13]:
# Separate spectra into chips

# Cut each spectrum into its three chips, divide every chip by that spectrum's
# own median flux in the chip, then join the chips back into one vector.
chip_bounds = [
    (blue_chip_begin, blue_chip_end),
    (green_chip_begin, green_chip_end),
    (red_chip_begin, red_chip_end),
]

normalized_chips = []
for begin, end in chip_bounds:
    chip = spectra[:, begin:end]
    # One median per spectrum (row), broadcast across that row's pixels
    row_median = np.median(chip, axis=1)
    normalized_chips.append(chip / row_median[:, np.newaxis])

blue_sp, green_sp, red_sp = normalized_chips

# Concatenate the chips along the wavelength axis
spectra = np.column_stack((blue_sp, green_sp, red_sp))

n_kept, n_bins = spectra.shape[0], spectra.shape[1]
print('Reference spectra dataset now contains ' + str(n_kept) + ' spectra, each with ' + str(n_bins) + ' wavelength bins')

Reference spectra dataset now contains 50000 spectra, each with 7214 wavelength bins


# Save new training data file with APOGEE IDs, spectra, and labels

In [14]:
savename = 'training_data.h5'
# If the file already exists it must be deleted before running this cell,
# e.g. with os.remove(savename).
dt = h5py.special_dtype(vlen=bytes)

# Spectra and labels are all stored as float datasets of matching shape.
float_columns = (
    ('spectra', spectra),
    ('TEFF', teff),
    ('LOGG', logg),
    ('FE_H', fe_h),
)

with h5py.File(savename, "a") as f:
    for ds_name, values in float_columns:
        dataset = f.create_dataset(ds_name, values.shape, dtype="f")
        dataset[:] = values

    # IDs are stored as fixed-width 18-byte strings.
    id_dataset = f.create_dataset('Ap_IDs', ap_id_train.shape, dtype="S18")
    id_dataset[:] = ap_id_train.tolist()

print(savename + ' has been saved as the reference set to be used in 4_Train_Model.ipynb')

training_data.h5 has been saved as the reference set to be used in 4_Train_Model.ipynb
