In [1]:
from astropy.io import fits
from astropy.table import Table, join
import numpy as np
import pylab as plt
import random
from scipy import stats
from sklearn.neighbors import KDTree
import time
from sklearn.metrics import mean_squared_error
from astropy.cosmology import FlatLambdaCDM
from os import listdir
import scipy
from desitarget.targetmask import desi_mask, bgs_mask, mws_mask
from desitarget.cmx.cmx_targetmask import cmx_mask
plt.rcParams['figure.figsize'] = [25, 10]

In [2]:
server = 1 # 0 is perlmutter, 1 is cori
server_paths = ['/pscratch/sd/a/ashodkh/', '/global/cscratch1/sd/ashodkh/']
## DATA ##
## I'm combining fastphot,fastspect, and ztile to make sure I use the same data everywhere

test = False
train = True

if test:
    sv = '3'
if train:
    sv = '1'
zall_path = "/global/cfs/cdirs/desi/spectro/redux/fuji/zcatalog/ztile-sv"+sv+"-bright-cumulative.fits"
data1 = Table.read(zall_path,hdu=1)
needed1 = ["TARGETID", "SV"+sv+"_BGS_TARGET", "SPECTYPE", "DELTACHI2", "Z", "ZWARN", "FIBER", "PETAL_LOC", "TILEID"]

fastspec_path = "/global/cfs/cdirs/desi/spectro/fastspecfit/fuji/catalogs/fastspec-fuji-sv"+sv+"-bright.fits"
data2 = Table.read(fastspec_path,hdu=1)
data2.rename_column('CONTINUUM_COEFF', 'CONTINUUM_COEFF_FASTSPEC')
data2.rename_column('CONTINUUM_AV', 'CONTINUUM_AV_FASTSPEC')

needed2 = ["TARGETID", "OII_3726_EW", "OII_3729_EW", "HGAMMA_EW", "HBETA_EW", "OIII_4959_EW", "OIII_5007_EW", "NII_6548_EW", "HALPHA_EW", "NII_6584_EW", "SII_6716_EW", "SII_6731_EW",\
           "FLUX_SYNTH_G", "FLUX_SYNTH_R", "FLUX_SYNTH_Z", 'CONTINUUM_COEFF_FASTSPEC', 'CONTINUUM_AV_FASTSPEC',\
           "OII_3726_EW_IVAR", "OII_3729_EW_IVAR", "HGAMMA_EW_IVAR", "HBETA_EW_IVAR", "OIII_4959_EW_IVAR", "OIII_5007_EW_IVAR", "NII_6548_EW_IVAR", "HALPHA_EW_IVAR", "NII_6584_EW_IVAR",\
           "SII_6716_EW_IVAR", "SII_6731_EW_IVAR", "CONTINUUM_SMOOTHCORR_B", "CONTINUUM_SMOOTHCORR_R", "CONTINUUM_SMOOTHCORR_Z"]


fastphot_path = "/global/cfs/cdirs/desi/spectro/fastspecfit/fuji/catalogs/fastphot-fuji-sv"+sv+"-bright.fits"
data3 = Table.read(fastphot_path,hdu=1)
data3.rename_column('CONTINUUM_COEFF', 'CONTINUUM_COEFF_FASTPHOT')
data3.rename_column('CONTINUUM_AV', 'CONTINUUM_AV_FASTPHOT')

needed3 = ["TARGETID", "ABSMAG_SDSS_U", "ABSMAG_SDSS_G", "ABSMAG_SDSS_R", "ABSMAG_SDSS_I", "ABSMAG_SDSS_Z", 'ABSMAG_W1', 'CONTINUUM_COEFF_FASTPHOT', 'CONTINUUM_AV_FASTPHOT']

data4 = join(data1[needed1], data2[needed2], keys="TARGETID")
data = join(data4, data3[needed3], keys="TARGETID")

N=len(data['TARGETID'])

## Adding the sum of OII doublets to use them as a single line
data.add_column(data["OII_3726_EW"]+data["OII_3729_EW"], name='OII_DOUBLET_EW')
data.add_column(1/(data["OII_3726_EW_IVAR"]+data["OII_3729_EW_IVAR"]), name='OII_DOUBLET_EW_IVAR')

not_used, ind = np.unique(data['TARGETID'], return_index=True)
data = data[ind]

  data.add_column(1/(data["OII_3726_EW_IVAR"]+data["OII_3729_EW_IVAR"]), name='OII_DOUBLET_EW_IVAR')


In [5]:
lines = ["OII_DOUBLET_EW", "HGAMMA_EW", "HBETA_EW", "OIII_4959_EW", "OIII_5007_EW", "NII_6548_EW", "HALPHA_EW", "NII_6584_EW", "SII_6716_EW", "SII_6731_EW", "test"]
lines_ivar = ["OII_DOUBLET_EW_IVAR", "HGAMMA_EW_IVAR", "HBETA_EW_IVAR", "OIII_4959_EW_IVAR", "OIII_5007_EW_IVAR", "NII_6548_EW_IVAR", "HALPHA_EW_IVAR", "NII_6584_EW_IVAR",\
              "SII_6716_EW_IVAR", "SII_6731_EW_IVAR"]

magnitude_names = ["ABSMAG_SDSS_U", "ABSMAG_SDSS_G", "ABSMAG_SDSS_R", "ABSMAG_SDSS_I", "ABSMAG_SDSS_Z", "ABSMAG_W1"]
    
# calculating minimum redshift to have de-redshifted wavelengths be in the interval w1,w2 A
w1 = 3400
w_min = 3600
z_min = w_min/w1-1
w2 = 8500
w_max = 9824
#z_max = w_max/w2-1
z_max = 0.3

# target selection index. run should be changed if anything in select changed and/or the number of data points changed
if test:
    run = 2
if train:
    run = 2 # old run=0 was sv1 with no bgs selection and n=30k. new run=0 is sv1 with bgs selection. There's only about 25k per line so I will use them for training only.
        # run 1 is sv3 with bgs selection. So far I'm only using it for testing so only l=10.
        # run 2 is no zero EWs for test set and line_ews and line_ivar are better structured.

## I am splitting data into training and testing before applying snr_cuts, and then saving them separately

    
select = ((data["SV"+sv+"_BGS_TARGET"] & bgs_mask.mask("BGS_BRIGHT"))>0)*(data["SPECTYPE"]=="GALAXY")*(data["DELTACHI2"]>=25)\
         *(data["Z"]>z_min)*(data["Z"]<z_max)*(data["ZWARN"]==0)

select_size = len(np.where(select)[0])
print('data size after selection: ' + str(select_size))
data_select = data[select]

if test:
    # no_zeros = (data_select[lines[0]] != 0)
    # for l in range(1,len(lines)-1):
    #     no_zeros = no_zeros * (data_select[lines[l]] != 0)
    
    for l in range(len(lines)-1):
        no_zeros = (data_select[lines[l]] != 0)
        data_test = data_select[no_zeros]
        print('length of data_test is: ' + str(len(data_test)))

        n_test = 20*10**3 # size of testing set I'm keeping

        target_ids = data_test["TARGETID"][:n_test]
        fiber_ids = data_test["FIBER"][:n_test]
        petal_locs = data_test["PETAL_LOC"][:n_test]
        tile_ids = data_test["TILEID"][:n_test]
        zs = data_test["Z"][:n_test]

        coeffs_fastspec = data_test['CONTINUUM_COEFF_FASTSPEC'][:n_test]
        AV_fastspec = data_test['CONTINUUM_AV_FASTSPEC'][:n_test]

        coeffs_fastphot = data_test['CONTINUUM_COEFF_FASTPHOT'][:n_test]
        AV_fastphot = data_test['CONTINUUM_AV_FASTPHOT'][:n_test]

        line_ews = np.array([data_test[ll][:n_test] for ll in lines[:-1]])
        line_ivars = np.array([data_test[ll][:n_test] for ll in lines_ivar])

        smooth_correction_b = data_test['CONTINUUM_SMOOTHCORR_B'][:n_test]
        smooth_correction_r = data_test['CONTINUUM_SMOOTHCORR_R'][:n_test]
        smooth_correction_z = data_test['CONTINUUM_SMOOTHCORR_Z'][:n_test]
        #l = 10 # l=10 corresponds to test set that has no snr cuts. This is a good way of separating those files without changing much of my code.
        np.savez_compressed(server_paths[server]+"target_selection/sv" + sv + "_target_ids_selection" + str(run) + "_" + str(lines[l]) + ".txt", target_ids)
        np.savez_compressed(server_paths[server]+"target_selection/sv" + sv + "_fiber_ids_selection" + str(run) + "_" + str(lines[l]) + ".txt", fiber_ids)
        np.savez_compressed(server_paths[server]+"target_selection/sv" + sv + "_petal_locs_selection" + str(run) + "_" + str(lines[l]) + ".txt", petal_locs)
        np.savez_compressed(server_paths[server]+"target_selection/sv" + sv + "_tile_ids_selection" + str(run) + "_" + str(lines[l]) + ".txt", tile_ids)
        np.savez_compressed(server_paths[server]+"target_selection/sv" + sv + "_zs_selection" + str(run) + "_" + str(lines[l]) + ".txt", zs)
        np.savez_compressed(server_paths[server]+"target_selection/sv" + sv + "_line_ews_selection"+str(run)+"_" + str(lines[l]) + ".txt", line_ews)
        np.savez_compressed(server_paths[server]+"target_selection/sv" + sv + "_line_ivars_selection"+str(run)+"_" + str(lines[l]) + ".txt", line_ivars)


        np.savez_compressed(server_paths[server]+"target_selection/sv" + sv + "_fastspec_coeffs_selection" + str(run) + "_" + str(lines[l]) + ".txt", coeffs_fastspec)
        np.savez_compressed(server_paths[server]+"target_selection/sv" + sv + "_fastspec_AV_selection" + str(run) + "_" + str(lines[l]) + ".txt", AV_fastspec)

        np.savez_compressed(server_paths[server]+"target_selection/sv" + sv + "_fastphot_coeffs_selection" + str(run) + "_" + str(lines[l]) + ".txt", coeffs_fastphot)
        np.savez_compressed(server_paths[server]+"target_selection/sv" + sv + "_fastphot_AV_selection" + str(run) + "_" + str(lines[l]) + ".txt", AV_fastphot)

        np.savez_compressed(server_paths[server]+"target_selection/sv" + sv + "_smooth_correction_b_selection" + str(run) + "_" + str(lines[l]) + ".txt", smooth_correction_b)
        np.savez_compressed(server_paths[server]+"target_selection/sv" + sv + "_smooth_correction_r_selection" + str(run) + "_" + str(lines[l]) + ".txt", smooth_correction_r)
        np.savez_compressed(server_paths[server]+"target_selection/sv" + sv + "_smooth_correction_z_selection" + str(run) + "_" + str(lines[l]) + ".txt", smooth_correction_z)
        # storing magnitudes to use in EW-from-ugriz-train-test
        mags = np.zeros([n_test, len(magnitude_names)])
        for i in range(len(magnitude_names)):
            mags[:,i] = data_test[magnitude_names[i]][:n_test]
            np.savez_compressed(server_paths[server]+"target_selection/sv" + sv + "_" + magnitude_names[i] + "_selection" + str(run) + "_" + str(lines[l]) + ".txt", mags[:,i])

if train:
    lines = ["OII_DOUBLET_EW", "HGAMMA_EW", "HBETA_EW", "OIII_4959_EW", "OIII_5007_EW", "NII_6548_EW", "HALPHA_EW", "NII_6584_EW", "SII_6716_EW", "SII_6731_EW"]
    
    ## choosing what snr cuts to make on data_train
    N = len(data_select)
    print('data_train size before any snr cuts: ' + str(N))
    snr_cut = 1
    # calculating snr for all lines and setting the snr cut boolean as select_snr
    snr_all = np.zeros([N,len(lines)])
    snr_all[:,0] = data_select[lines[0]]*np.sqrt(data_select[lines[0]+"_IVAR"])

    cut_Halpha = True # cut Halpha on all data and later cut snr>0 for every line separately
    cut_all = False # cut on all lines together, which extremely restricts the data
    for i in range(1,len(lines)):
        snr_all[:,i] = data_select[lines[i]]*np.sqrt(data_select[lines[i]+"_IVAR"])
        if cut_all:
            select_snr = select_snr*(snr_all[:,i]>snr_cut)
    if cut_Halpha:
        select_snr = snr_all[:,6]>snr_cut
    print('size after halpha cut: ' + str(len(np.where(select_snr)[0])))
    
    n_train = 25*10**3
    for l in range(len(lines)):
        select_snr_both = select_snr*(snr_all[:,l]>0)
        data_train = data_select[select_snr_both]
        print('size after snr cuts: ' + str(len(data_train)))

        target_ids = data_train["TARGETID"][:n_train]
        fiber_ids = data_train["FIBER"][:n_train]
        petal_locs = data_train["PETAL_LOC"][:n_train]
        tile_ids = data_train["TILEID"][:n_train]
        zs = data_train["Z"][:n_train]

        coeffs_fastspec = data_train['CONTINUUM_COEFF_FASTSPEC'][:n_train]
        AV_fastspec = data_train['CONTINUUM_AV_FASTSPEC'][:n_train]

        coeffs_fastphot = data_train['CONTINUUM_COEFF_FASTPHOT'][:n_train]
        AV_fastphot = data_train['CONTINUUM_AV_FASTPHOT'][:n_train]

        line_ews = np.array([data_train[ll][:n_train] for ll in lines])
        line_ivars = np.array([data_train[ll][:n_train] for ll in lines_ivar])
    
        np.savez_compressed(server_paths[server]+"target_selection/sv" + sv + "_target_ids_selection" + str(run) + "_" + str(lines[l]) + ".txt", target_ids)
        np.savez_compressed(server_paths[server]+"target_selection/sv" + sv + "_fiber_ids_selection" + str(run) + "_" + str(lines[l]) + ".txt", fiber_ids)
        np.savez_compressed(server_paths[server]+"target_selection/sv" + sv + "_petal_locs_selection" + str(run) + "_" + str(lines[l]) + ".txt", petal_locs)
        np.savez_compressed(server_paths[server]+"target_selection/sv" + sv + "_tile_ids_selection" + str(run) + "_" + str(lines[l]) + ".txt", tile_ids)
        np.savez_compressed(server_paths[server]+"target_selection/sv" + sv + "_zs_selection" + str(run) + "_" + str(lines[l]) + ".txt", zs)
        np.savez_compressed(server_paths[server]+"target_selection/sv" + sv + "_line_ews_selection"+str(run)+"_" + str(lines[l]) + ".txt", line_ews)
        np.savez_compressed(server_paths[server]+"target_selection/sv" + sv + "_line_ivars_selection"+str(run)+"_" + str(lines[l]) + ".txt", line_ivars)

        np.savez_compressed(server_paths[server]+"target_selection/sv" + sv + "_fastspec_coeffs_selection" + str(run) + "_" + str(lines[l]) + ".txt", coeffs_fastspec)
        np.savez_compressed(server_paths[server]+"target_selection/sv" + sv + "_fastspec_AV_selection" + str(run) + "_" + str(lines[l]) + ".txt", AV_fastspec)

        np.savez_compressed(server_paths[server]+"target_selection/sv" + sv + "_fastphot_coeffs_selection" + str(run) + "_" + str(lines[l]) + ".txt", coeffs_fastphot)
        np.savez_compressed(server_paths[server]+"target_selection/sv" + sv + "_fastphot_AV_selection" + str(run) + "_" + str(lines[l]) + ".txt", AV_fastphot)
        
        # storing magnitudes to use in EW-from-ugriz-train-test
        mags = np.zeros([n_train, len(magnitude_names)])
        for i in range(len(magnitude_names)):
            mags[:,i] = data_train[magnitude_names[i]][:n_train]
            np.savez_compressed(server_paths[server]+"target_selection/sv" + sv + "_" + magnitude_names[i] + "_selection" + str(run) + "_" + str(lines[l]) + ".txt", mags[:,i])

# if train:
#     for l in range(len(lines)):
#         # select = ((data["SV3_BGS_TARGET"] & bgs_mask.mask("BGS_BRIGHT"))>0)*(data["SPECTYPE"]=="GALAXY")*(data["DELTACHI2"]>=25)\
#         #          *(data["Z"]>z_min)*(data["Z"]<z_max)*(data["ZWARN"]==0)*(select_snr)*(snr_all[:,l]>0)
#         #snr_cut = (select_snr)*(snr_all[:,l]>0)
#         snr_cut = [True]*(len(data_train))
#         print('size after snr cuts: ' + str(len(np.where(snr_cut)[0])))

#         n_train = 25*10**3 # size of data set I'm keeping

#         target_ids = data_train["TARGETID"][snr_cut][:n_train]
#         fiber_ids = data_train["FIBER"][snr_cut][:n_train]
#         petal_locs = data_train["PETAL_LOC"][snr_cut][:n_train]
#         tile_ids = data_train["TILEID"][snr_cut][:n_train]
#         zs = data_train["Z"][snr_cut][:n_train]

#         coeffs_fastspec = data_train['CONTINUUM_COEFF_FASTSPEC'][snr_cut][:n_train]
#         AV_fastspec = data_train['CONTINUUM_AV_FASTSPEC'][snr_cut][:n_train]

#         coeffs_fastphot = data_train['CONTINUUM_COEFF_FASTPHOT'][snr_cut][:n_train]
#         AV_fastphot = data_train['CONTINUUM_AV_FASTPHOT'][snr_cut][:n_train]

#         line_ews = np.array(data_train[lines][snr_cut][:n_train])
#         line_ivars = np.array(data_train[lines_ivar][snr_cut][:n_train])

#         np.savez_compressed(server_paths[server]+"target_selection/target_ids_selection" + str(run) + "_" + str(lines[l]) + ".txt", target_ids)
#         np.savez_compressed(server_paths[server]+"target_selection/fiber_ids_selection" + str(run) + "_" + str(lines[l]) + ".txt", fiber_ids)
#         np.savez_compressed(server_paths[server]+"target_selection/petal_locs_selection" + str(run) + "_" + str(lines[l]) + ".txt", petal_locs)
#         np.savez_compressed(server_paths[server]+"target_selection/tile_ids_selection" + str(run) + "_" + str(lines[l]) + ".txt", tile_ids)
#         np.savez_compressed(server_paths[server]+"target_selection/zs_selection" + str(run) + "_" + str(lines[l]) + ".txt", zs)
#         np.savez_compressed(server_paths[server]+"target_selection/line_ews_selection"+str(run)+"_" + str(lines[l]) + ".txt", line_ews)
#         np.savez_compressed(server_paths[server]+"target_selection/line_ivars_selection"+str(run)+"_" + str(lines[l]) + ".txt", line_ivars)

#         np.savez_compressed(server_paths[server]+"target_selection/fastspec_coeffs_selection" + str(run) + "_" + str(lines[l]) + ".txt", coeffs_fastspec)
#         np.savez_compressed(server_paths[server]+"target_selection/fastspec_AV_selection" + str(run) + "_" + str(lines[l]) + ".txt", AV_fastspec)

#         np.savez_compressed(server_paths[server]+"target_selection/fastphot_coeffs_selection" + str(run) + "_" + str(lines[l]) + ".txt", coeffs_fastphot)
#         np.savez_compressed(server_paths[server]+"target_selection/fastphot_AV_selection" + str(run) + "_" + str(lines[l]) + ".txt", AV_fastphot)
        
#         # storing magnitudes to use in EW-from-ugriz-train-test
#         mags = np.zeros([n_train, len(magnitude_names)])
#         for i in range(len(magnitude_names)):
#             mags[:,i] = data_train[magnitude_names[i]][snr_cut][:n_train]
#             np.savez_compressed(server_paths[server]+"target_selection/" + magnitude_names[i] + "_selection" + str(run) + "_" + str(lines[l]) + ".txt", mags[:,i])

data size after selection: 36223
data_train size before any snr cuts: 36223
size after halpha cut: 28521
size after snr cuts: 26730


  snr_all[:,0] = data_select[lines[0]]*np.sqrt(data_select[lines[0]+"_IVAR"])


size after snr cuts: 25142
size after snr cuts: 26666
size after snr cuts: 26684
size after snr cuts: 26686
size after snr cuts: 28393
size after snr cuts: 28521
size after snr cuts: 28394
size after snr cuts: 27568
size after snr cuts: 27568


In [4]:
magnitude_names

['ABSMAG_SDSS_U',
 'ABSMAG_SDSS_G',
 'ABSMAG_SDSS_R',
 'ABSMAG_SDSS_I',
 'ABSMAG_SDSS_Z',
 'ABSMAG_W1']

In [None]:
from astropy.visualization.mpl_normalize import ImageNormalize
from astropy.visualization import *
import mpl_scatter_density

norm = ImageNormalize(vmin=0.01, vmax=40, stretch=PowerStretch(1.5))
fig = plt.figure(1)
ax = fig.add_subplot(1, 1, 1, projection='scatter_density')
density=ax.scatter_density(line_ews[6], line_ews[8], cmap=plt.cm.jet,dpi=40,norm=norm)
plt.xlim([0,25])
plt.ylim([0,20])

In [None]:
lines

In [None]:
[True]*10