# Obtain test data

## This notebook takes you through the steps of how to preprocess a high S/N and low S/N test set
## downloading apStar_combined_main.h5 is necessary before beginning this procedure
### download link: [url]
## required packages:
### - vos
### - numpy
### - h5py

In [None]:
import numpy as np
import h5py
import os

# Download apStar visits data file

In [None]:
# `vos` is listed under "required packages" at the top of this notebook but
# was never imported, so vos.Client() failed with NameError. Import it here so
# the cell runs on its own.
import vos

# NOTE(review): this cell fetches the *visits* file, while the cells below
# read 'apStar_combined_main.h5' -- confirm which file is actually needed.
filename = 'apStar_visits_main.h5'
vclient = vos.Client()
# Copy the remote file into the current working directory under the same name.
vclient.copy('vos:starnet/public/' + filename, filename)

### Obtain training data and separate out a High S/N Test set
#### apStar_combined_main.h5 is a data file created by pulling apStar combined spectra from the APOGEE v603.fits file
#### instructions on dealing with apogee data can be found here: https://github.com/jobovy/apogee

In [None]:
# Open the combined-spectra HDF5 file read-only. The handle `f` is consumed by
# the following cells and closed once the arrays have been read out.
path = filename = 'apStar_combined_main.h5'
f = h5py.File(path, mode="r")

In [None]:
# List the top-level dataset names stored in the open HDF5 file `f`.
print('Dataset keys in file: \n')
# Use the call form of print: the bare `print i` statement is a SyntaxError on
# Python 3, and every other cell in this notebook already uses print(...).
for key in f.keys():
    print(key)

### To test StarNet we need the spectra, error spectra, combined S/N, and labels. Because the test set must first be restricted to the stars with the most reliable labels to compare against, we start by loading the $APOGEE\_IDs$, the $S/N$ of the combined spectra, the spectra, the error spectra, $T_{\mathrm{eff}}$, $\log(g)$, $[Fe/H]$, $V_{scatter}$, $STARFLAGs$, and $ASPCAPFLAGs$.

In [None]:
# Pull everything needed for the cuts below out of the open HDF5 file `f`.
# Indexing a dataset with [:] reads it fully into memory, so the arrays remain
# usable after the file is closed.

# Identifiers: only the first column of the 'IDs' dataset is kept.
ap_id = f['IDs'][:,0]

# Labels
teff = f['TEFF'][:]
logg = f['LOGG'][:]
fe_h = f['FE_H'][:]

# Quality indicators used for the selection cuts
vscatter = f['VSCATTER'][:]
starflag = f['star_flag'][:]
aspcapflag = f['aspcap_flag'][:]
combined_snr = f['stacked_snr'][:]

# Flux and its uncertainty
spectra = f['spectrum'][:]
error_spectra = f['error_spectrum'][:]

# All reads are done; release the file handle.
f.close()

# Report the number of distinct IDs (the same ID may occur in several rows).
print('Obtainined spectra and data for '+str(len(set(ap_id)))+' stars.')

## Create a file that contains the mean and std for $T_{\mathrm{eff}}$, $\log(g)$, and $[Fe/H]$ in order to normalize labels during training and testing
#### ignore values equal to -9999.

In [None]:
# Compute the mean and standard deviation of each label (Teff, log g, [Fe/H])
# and store them in 'mean_and_std.npy' as a 2 x 3 array: row 0 = means,
# row 1 = standard deviations.

# -9999. marks a missing label; drop those entries once per label instead of
# rebuilding the same mask for the mean and again for the std.
missing_value = -9999.
valid_labels = [label[label != missing_value] for label in (teff, logg, fe_h)]

mean = np.array([np.mean(values) for values in valid_labels])
std = np.array([np.std(values) for values in valid_labels])

# np.vstack replaces np.row_stack, a deprecated alias of the same function.
mean_and_std = np.vstack((mean, std))
# np.save appends the '.npy' extension to the given name.
np.save('mean_and_std', mean_and_std)

print('mean_and_std.npy saved')

## separate out a dataset with good labels
## Default initial restrictions:
### - $STARFLAGs$ = 0
### -  $ASPCAPFLAGs$ = 0
### - 4000K < $T_{\mathrm{eff}}$ < 5500K
### - -3.0 < $[Fe/H]$
### - $\log(g)$ != -9999. (value defined by ASPCAP when no ASPCAP labels are given)
### - $V_{scatter}$ < 1.

In [None]:
# Thresholds for the "good label" cuts applied in the next cell.
teff_min, teff_max = 4000., 5500.   # effective-temperature window, in K
fe_h_min = -3.                      # lower bound on [Fe/H]
vscatter_max = 1.                   # upper bound on V_scatter

In [None]:
# Build the selection mask one criterion at a time.
flags_clean = (aspcapflag == 0.) & (starflag == 0.)
low_scatter = vscatter < vscatter_max
metallicity_ok = fe_h > fe_h_min
teff_in_range = (teff > teff_min) & (teff < teff_max)
# Only the log g mask is reshaped into an (N, 1) column, exactly as before.
# np.where is unpacked into two index arrays below, so the combined mask has
# to be 2-D -- presumably the other label arrays are stored as (N, 1) columns;
# TODO confirm against the HDF5 file.
logg_present = (logg != -9999.).reshape(len(ap_id), 1)

selection = flags_clean & low_scatter & metallicity_ok & teff_in_range & logg_present

# Row numbers of the stars passing every cut (`cols` is not used afterwards).
indices, cols = np.where(selection)

# Restrict every per-star array to the selected rows.
ap_id = ap_id[indices]
combined_snr = combined_snr[indices]
teff = teff[indices]
logg = logg[indices]
fe_h = fe_h[indices]
spectra = spectra[indices]
error_spectra = error_spectra[indices]

print(str(len(set(ap_id)))+' stars remain.')

### load $APOGEE\_IDs$ for High S/N test set obtained in $1\_Preprocessing\_of\_Training\_Data$

In [None]:
# Load the APOGEE IDs reserved for the High S/N test set. The .npy file is read
# from the current working directory; per the heading above it is produced by
# the 1_Preprocessing_of_Training_Data notebook.
high_snr_test_ap_ids = np.load('high_snr_test_apids.npy')

## separate data for High S/N test set

In [None]:
# Rows of the filtered catalogue whose APOGEE ID belongs to the High S/N test
# set. The positions must refer to `ap_id`, because they index `ap_id`,
# `spectra`, `teff`, ... below. The previous comprehension enumerated
# `high_snr_test_ap_ids` instead, so it picked unrelated rows. np.isin also
# replaces the per-element linear scan of `ap_id`.
indices = np.flatnonzero(np.isin(ap_id, high_snr_test_ap_ids))

high_snr_ap_id = ap_id[indices]
high_snr_spectra = spectra[indices]
high_snr_error_spectra = error_spectra[indices]
high_snr_teff = teff[indices]
high_snr_logg = logg[indices]
high_snr_fe_h = fe_h[indices]
high_snr_combined_snr = combined_snr[indices]

print('High S/N test set includes '+str(len(high_snr_ap_id))+' combined spectra')


### Normalize spectra:
#### 1. separate into three chips
#### 2. divide by median value in each chip
#### 3. recombine into vector of 7214 flux values

In [None]:
# Column ranges of the three detector chips within a spectrum. They are used
# as slice bounds (begin inclusive, end exclusive), and together they keep
# 2920 + 2400 + 1894 = 7214 bins.
blue_chip_begin, blue_chip_end = 322, 3242
green_chip_begin, green_chip_end = 3648, 6048
red_chip_begin, red_chip_end = 6412, 8306

In [None]:
# Slice every High S/N spectrum into its blue, green and red chip.
blue_sp = high_snr_spectra[:, blue_chip_begin:blue_chip_end]
green_sp = high_snr_spectra[:, green_chip_begin:green_chip_end]
red_sp = high_snr_spectra[:, red_chip_begin:red_chip_end]

# One median per spectrum and chip. These are kept at module level because the
# error spectra are scaled with the same values in the next cell.
blue_sp_med = np.median(blue_sp, axis=1)
green_sp_med = np.median(green_sp, axis=1)
red_sp_med = np.median(red_sp, axis=1)

# Divide each row by its own median; the added axis broadcasts one median
# across all bins of that row.
blue_sp = blue_sp / blue_sp_med[:, np.newaxis]
green_sp = green_sp / green_sp_med[:, np.newaxis]
red_sp = red_sp / red_sp_med[:, np.newaxis]

# Join the three normalized chips back into one row per spectrum.
high_snr_spectra = np.concatenate((blue_sp, green_sp, red_sp), axis=1)

print('High S/N spectra dataset now contains '+str(high_snr_spectra.shape[0])
      +' spectra, each with '+str(high_snr_spectra.shape[1])+' wavelength bins')

### Error spectra must also be normalized with the same median values for error propagation

In [None]:
# Slice every High S/N error spectrum into the same three chips as the flux.
blue_sp = high_snr_error_spectra[:, blue_chip_begin:blue_chip_end]
green_sp = high_snr_error_spectra[:, green_chip_begin:green_chip_end]
red_sp = high_snr_error_spectra[:, red_chip_begin:red_chip_end]

# Scale the errors by the medians of the *flux* chips (blue_sp_med,
# green_sp_med and red_sp_med from the flux-normalization cell), so flux and
# error stay on the same scale.
blue_sp = blue_sp / blue_sp_med[:, np.newaxis]
green_sp = green_sp / green_sp_med[:, np.newaxis]
red_sp = red_sp / red_sp_med[:, np.newaxis]

# Join the three scaled chips back into one row per error spectrum.
high_snr_error_spectra = np.concatenate((blue_sp, green_sp, red_sp), axis=1)

print('High S/N error spectra dataset now contains '+str(high_snr_error_spectra.shape[0])
      +' error spectra, each with '+str(high_snr_error_spectra.shape[1])+' wavelength bins')

# Save new High S/N test data file with APOGEE IDs, spectra, error spectra, combined S/N and labels

In [None]:
# Write the High S/N test set (IDs, spectra, error spectra, combined S/N and
# labels) to a fresh HDF5 file.
savename = 'high_snr_test_data.h5'
path = savename
# Start from a clean file, but delete only when a previous copy exists: an
# unconditional os.remove() raises on the very first run, when there is
# nothing to delete yet.
if os.path.exists(path):
    os.remove(path)
dt = h5py.special_dtype(vlen=bytes)  # NOTE(review): not used by the datasets below
with h5py.File(path, "a") as f:

    # Create one dataset per array: float for the numeric data, fixed-width
    # 18-byte strings for the APOGEE IDs.
    spectra_ds = f.create_dataset('spectra', high_snr_spectra.shape, dtype="f")
    error_spectra_ds = f.create_dataset('error_spectra', high_snr_error_spectra.shape, dtype="f")
    teff_ds = f.create_dataset('TEFF', high_snr_teff.shape, dtype="f")
    logg_ds = f.create_dataset('LOGG', high_snr_logg.shape, dtype="f")
    fe_h_ds = f.create_dataset('FE_H', high_snr_fe_h.shape, dtype="f")
    combined_snr_ds = f.create_dataset('combined_snr', high_snr_combined_snr.shape, dtype="f")
    ap_id_ds = f.create_dataset('Ap_IDs', high_snr_ap_id.shape, dtype="S18")

    # Fill the datasets.
    spectra_ds[:] = high_snr_spectra
    error_spectra_ds[:] = high_snr_error_spectra
    teff_ds[:] = high_snr_teff
    logg_ds[:] = high_snr_logg
    fe_h_ds[:] = high_snr_fe_h
    combined_snr_ds[:] = high_snr_combined_snr
    ap_id_ds[:] = high_snr_ap_id.tolist()

print(savename+' has been saved as the High S/N test set to be used in 4_Test_Model.ipynb')

# Now create Low S/N test set
## default additional restrictions:
### - combined S/N < 200

In [None]:
# Upper limit on the combined S/N for a spectrum to enter the Low S/N test set.
snr_max = 200.0

In [None]:
# Column-shaped mask of the spectra whose combined S/N lies below snr_max.
low_snr_mask = (combined_snr < snr_max).reshape(len(ap_id), 1)
# Row numbers of the selected spectra (`cols` is not used afterwards).
indices, cols = np.where(low_snr_mask)

# Gather the Low S/N subset of every per-star array.
low_snr_ap_id = ap_id[indices]
low_snr_combined_snr = combined_snr[indices]
low_snr_teff = teff[indices]
low_snr_logg = logg[indices]
low_snr_fe_h = fe_h[indices]
low_snr_spectra = spectra[indices]
low_snr_error_spectra = error_spectra[indices]

print('Low S/N test set includes '+str(len(low_snr_ap_id))+' combined spectra')

## Normalize spectra

In [None]:
# Detector chip boundaries, as (begin, end) slice bounds into a spectrum.
# These repeat the values already used for the High S/N set, so this cell can
# be run on its own.
blue_chip_begin, blue_chip_end = 322, 3242
green_chip_begin, green_chip_end = 3648, 6048
red_chip_begin, red_chip_end = 6412, 8306

In [None]:
# Slice every Low S/N spectrum into its blue, green and red chip.
blue_sp = low_snr_spectra[:, blue_chip_begin:blue_chip_end]
green_sp = low_snr_spectra[:, green_chip_begin:green_chip_end]
red_sp = low_snr_spectra[:, red_chip_begin:red_chip_end]

# One median per spectrum and chip. The error-spectra cell that follows
# reuses these values.
blue_sp_med = np.median(blue_sp, axis=1)
green_sp_med = np.median(green_sp, axis=1)
red_sp_med = np.median(red_sp, axis=1)

# Divide each row by its own median (one median broadcast across the row).
blue_sp = blue_sp / blue_sp_med[:, np.newaxis]
green_sp = green_sp / green_sp_med[:, np.newaxis]
red_sp = red_sp / red_sp_med[:, np.newaxis]

# Join the three normalized chips back into one row per spectrum.
low_snr_spectra = np.concatenate((blue_sp, green_sp, red_sp), axis=1)
print('Low S/N spectra dataset now contains '+str(low_snr_spectra.shape[0])
      +' spectra, each with '+str(low_snr_spectra.shape[1])+' wavelength bins')

## Normalize error spectra

In [None]:
# Slice every Low S/N error spectrum into the same three chips as the flux.
blue_sp = low_snr_error_spectra[:, blue_chip_begin:blue_chip_end]
green_sp = low_snr_error_spectra[:, green_chip_begin:green_chip_end]
red_sp = low_snr_error_spectra[:, red_chip_begin:red_chip_end]

# Scale by the flux medians of the Low S/N spectra (blue_sp_med, green_sp_med
# and red_sp_med from the preceding cell), not by medians of the errors.
blue_sp = blue_sp / blue_sp_med[:, np.newaxis]
green_sp = green_sp / green_sp_med[:, np.newaxis]
red_sp = red_sp / red_sp_med[:, np.newaxis]

# Join the three scaled chips back into one row per error spectrum.
low_snr_error_spectra = np.concatenate((blue_sp, green_sp, red_sp), axis=1)

print('Low S/N error spectra dataset now contains '+str(low_snr_error_spectra.shape[0])
      +' error spectra, each with '+str(low_snr_error_spectra.shape[1])+' wavelength bins')

# Save new Low S/N test data file with APOGEE IDs, spectra, error spectra, combined S/N and labels

In [None]:
# Write the Low S/N test set (IDs, spectra, error spectra, combined S/N and
# labels) to a fresh HDF5 file.
savename = 'low_snr_test_data.h5'
path = savename
# Remove a stale copy only if one is present. Calling os.remove() on a path
# that does not exist raises, which used to break the first run of this cell.
if os.path.exists(path):
    os.remove(path)
dt = h5py.special_dtype(vlen=bytes)  # NOTE(review): not used by the datasets below
with h5py.File(path, "a") as f:

    # Create one dataset per array: float for the numeric data, fixed-width
    # 18-byte strings for the APOGEE IDs.
    spectra_ds = f.create_dataset('spectra', low_snr_spectra.shape, dtype="f")
    error_spectra_ds = f.create_dataset('error_spectra', low_snr_error_spectra.shape, dtype="f")
    teff_ds = f.create_dataset('TEFF', low_snr_teff.shape, dtype="f")
    logg_ds = f.create_dataset('LOGG', low_snr_logg.shape, dtype="f")
    fe_h_ds = f.create_dataset('FE_H', low_snr_fe_h.shape, dtype="f")
    combined_snr_ds = f.create_dataset('combined_snr', low_snr_combined_snr.shape, dtype="f")
    ap_id_ds = f.create_dataset('Ap_IDs', low_snr_ap_id.shape, dtype="S18")

    # Fill the datasets.
    spectra_ds[:] = low_snr_spectra
    error_spectra_ds[:] = low_snr_error_spectra
    teff_ds[:] = low_snr_teff
    logg_ds[:] = low_snr_logg
    fe_h_ds[:] = low_snr_fe_h
    combined_snr_ds[:] = low_snr_combined_snr
    ap_id_ds[:] = low_snr_ap_id.tolist()

print(savename+' has been saved as the Low S/N test set to be used in 4_Test_Model.ipynb')