In [9]:
from astropy.io import fits
from astropy.table import Table,join
import numpy as np
import pylab as plt
import random
from scipy import stats
from sklearn.neighbors import KDTree
import time
from sklearn.metrics import mean_squared_error
from desitarget.targetmask import desi_mask, bgs_mask, mws_mask
from LLR import LLR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from tensorflow import keras
from tensorflow.keras import layers
import xgboost as xgb
from LLR import LLR

  from pandas import MultiIndex, Int64Index


In [10]:
## DATA ##
## I'm combining fastphot,fastspect, and ztile to make sure I use the same data everywhere ##

# Redshift catalog: target bits, spectral type and redshift-quality columns.
zall_path="/project/projectdirs/desi/spectro/redux/everest/zcatalog/ztile-main-bright-cumulative.fits"
data1=Table.read(zall_path,hdu=1)
needed1=["TARGETID","BGS_TARGET","SPECTYPE","DELTACHI2","Z","ZWARN"]

# fastspec catalog: emission-line equivalent widths (EW) and their inverse variances (IVAR).
fastspec_path = "/project/projectdirs/desi/spectro/fastspecfit/everest/catalogs/fastspec-everest-main-bright.fits"
data2=Table.read(fastspec_path,hdu=1)
ew_columns=["OII_3726_EW","OII_3729_EW","HGAMMA_EW","HBETA_EW","OIII_4959_EW","OIII_5007_EW","NII_6548_EW","HALPHA_EW","NII_6584_EW","SII_6716_EW","SII_6731_EW"]
# Every EW column is paired with "<name>_IVAR"; deriving the names keeps the two lists in sync.
needed2=["TARGETID"]+ew_columns+[name+"_IVAR" for name in ew_columns]

# fastphot catalog: absolute magnitudes in the SDSS u,g,r,i,z bands.
file_path = "/project/projectdirs/desi/spectro/fastspecfit/everest/catalogs/fastphot-everest-main-bright.fits"
data3=Table.read(file_path,hdu=1)
needed3=["TARGETID","ABSMAG_SDSS_U","ABSMAG_SDSS_G","ABSMAG_SDSS_R","ABSMAG_SDSS_I","ABSMAG_SDSS_Z"]

# Joining on TARGETID so that only objects present in all three catalogs are kept.
data4=join(data1[needed1],data2[needed2],keys="TARGETID")
data=join(data4,data3[needed3],keys="TARGETID")

## Adding the sum of OII doublets to use them as a single line
data.add_column(data["OII_3726_EW"]+data["OII_3729_EW"],name='OII_DOUBLET_EW')
# The variance of the sum is the sum of the variances, i.e. 1/ivar = 1/ivar_1 + 1/ivar_2.
# A component with IVAR == 0 gives 1/0 = inf and therefore a combined IVAR of 0; the
# divide-by-zero warning is silenced only for this expression, the values are unchanged.
with np.errstate(divide="ignore"):
    oii_doublet_ivar=1/(1/data["OII_3726_EW_IVAR"]+1/data["OII_3729_EW_IVAR"])
data.add_column(oii_doublet_ivar,name='OII_DOUBLET_EW_IVAR')

  data.add_column(1/(1/data["OII_3726_EW_IVAR"]+1/data["OII_3729_EW_IVAR"]),name='OII_DOUBLET_EW_IVAR')


In [12]:
## Selecting data and doing LLR to predict lines ##
# Equivalent-width columns that can serve as the regression target; `ll` below picks one.
lines=["OII_DOUBLET_EW","HGAMMA_EW","HBETA_EW","OIII_4959_EW","OIII_5007_EW","NII_6548_EW","HALPHA_EW"\
       ,"NII_6584_EW","SII_6716_EW","SII_6731_EW"]

# Absolute magnitudes; adjacent pairs are differenced into colors when the features are built.
magnitude_names=["ABSMAG_SDSS_U","ABSMAG_SDSS_G","ABSMAG_SDSS_R","ABSMAG_SDSS_I","ABSMAG_SDSS_Z"]

snr_cut=1 # signal to noise ratio cut
n=30*10**3 # initial selection size. Should be smaller later after selecting flux>0 from raw data to make sure same data is used.

# calculating snr = EW*sqrt(IVAR) for all lines (one column of snr_all per entry of `lines`)
n_rows=len(data["TARGETID"])
snr_all=np.zeros([n_rows,len(lines)])
for i in range(len(lines)):
    snr_all[:,i]=data[lines[i]]*np.sqrt(data[lines[i]+"_IVAR"])

# setting the snr cut as select_snr.
# select_snr starts as "keep everything" so that it is always defined, whatever the flags are.
# cut_all requires every line to pass the cut; cut_Halpha requires only H-alpha and, as before,
# takes precedence when both flags are set.
cut_Halpha=True
cut_all=False
select_snr=np.ones(n_rows,dtype=bool)
if cut_all:
    select_snr=np.all(snr_all>snr_cut,axis=1)
if cut_Halpha:
    select_snr=snr_all[:,lines.index("HALPHA_EW")]>snr_cut

# calculating minimum redshift to have de-redshifted wavelengths be in the interval 3400,7000 A
w1=3400
w_min=3600
z_min=w_min/w1-1

#parameters
ll=6          # index into `lines` of the line to predict (6 is HALPHA_EW)
N=16          # number of bins; only used here to build the input/output file names
run_flux=1    # selection run index of the flux-cut file that is read below
run_out=2     # selection run index written into the output file names
loga=True     # if true then predicts log(EW)
m=3           # model index. 0 is LLR, 1 is RandomForest, 2 is GradientBoosting from sklearn, 3 is XGboost, 4 is neural network

# getting flux cut from everest target selection to make sure same data is used
select_fluxes=np.load("/global/cscratch1/sd/ashodkh/results/select_positive_fluxes_selection"+str(run_flux)+"_"+str(lines[ll])+"_bins"+str(N)+".txt.npz")["arr_0"]
# Sample sizes: `n` (set with the other parameters) caps the initial selection and
# n_final caps the sample that is kept after the flux cut.
n_initial=n
n_final=25*10**3
N_cv=10   # number of cross-validation folds
results_dir="/global/cscratch1/sd/ashodkh/results/"

for l in [ll]:
    # initial target selection that doesn't include flux cut
    select=((data["BGS_TARGET"] & bgs_mask.mask("BGS_BRIGHT"))>0)*(data["SPECTYPE"]=="GALAXY")*(data["DELTACHI2"]>=25)\
        *(data["Z"]>z_min)*(data["Z"]<0.3)*(data["ZWARN"]==0)*select_snr*(snr_all[:,l]>0)
    target_ids=data["TARGETID"][select]
    selected_rows=np.where(select==True)[0]
    print(len(selected_rows))
    target_pos=selected_rows[:n_initial]

    # flux cut after initial target selection and taking first n_final data.
    # n is the number of rows that actually survive, trimmed to a multiple of N_cv so that
    # np.split below produces equally sized folds instead of raising.
    target_pos=target_pos[select_fluxes][:n_final]
    n=len(target_pos)-len(target_pos)%N_cv
    if n==0:
        raise ValueError("not enough targets left after the selection and flux cuts")
    target_pos=target_pos[:n]

    # assigning features as colors and standardizing them. I also add ones to include the y-intercept as part of the parameter matrix if m==0 (LLR).
    magnitudes=np.zeros([n,len(magnitude_names)])
    for j,name in enumerate(magnitude_names):
        magnitudes[:,j]=data[name][target_pos]

    ones=np.ones([n,1])
    # color j is magnitude j minus magnitude j+1
    x=magnitudes[:,:-1]-magnitudes[:,1:]
    av_x=np.zeros(x.shape[1])
    std_x=np.zeros(x.shape[1])
    for i in range(x.shape[1]):
        av_x[i]=np.average(x[:,i])
        std_x[i]=np.std(x[:,i])
        x[:,i]=(x[:,i]-av_x[i])/std_x[i]

    if m==0:
        x=np.concatenate((ones,x),axis=1)

    # assigning outcomes as EW (equivalent width) and getting their inverse variance
    if loga:
        EW=np.log10(data[lines[l]][target_pos])
    else:
        EW=data[lines[l]][target_pos]
    ivar=data[lines[l]+"_IVAR"][target_pos]

    ## doing cross-validation by splitting data into N_cv intervals. I store all the outcomes in EW_fit_all, ivar_all, etc...
    x_split=np.split(x,N_cv)
    EW_split=np.split(EW,N_cv)
    ivar_split=np.split(ivar,N_cv)

    EW_fit_all=[]
    EW_obs_all=[]
    ivar_all=[]

    spearman_all=[]
    nmad_all=[]
    for i in range(N_cv):
        ## assigning the training and validation sets: fold i validates, the other folds train
        x_valid=x_split[i]
        EW_valid=EW_split[i]
        ivar_valid=ivar_split[i]
        x_train=np.concatenate(tuple(x_split[:i]+x_split[i+1:]),axis=0)
        EW_train=np.concatenate(tuple(EW_split[:i]+EW_split[i+1:]),axis=0)

        # Indices of validation points that the model dropped. Only LLR returns such a list;
        # resetting it every fold keeps a value from an earlier run or fold from being reused.
        zeros=None

        # predicting EWs using the model selected by m
        if m==0:
            EW_fit,zeros=LLR.LLR(x_valid, x_train, EW_train, 100, 'inverse_distance')
        elif m==1:
            model=RandomForestRegressor(n_estimators=200)
            model.fit(x_train, EW_train)
            EW_fit=model.predict(x_valid)
        elif m==2:
            model=GradientBoostingRegressor(n_estimators=100)
            model.fit(x_train, EW_train)
            EW_fit=model.predict(x_valid)
        elif m==3:
            # NOTE(review): the validation fold is also the early-stopping eval_set, so the
            # number of trees is tuned on the same data the metrics below are computed from.
            # Consider holding out part of the training folds for early stopping instead.
            model=xgb.XGBRegressor(n_estimators=1000, learning_rate=0.05)
            model.fit(x_train, EW_train, early_stopping_rounds=5, eval_set=[(x_valid,EW_valid)], verbose=False)
            EW_fit=model.predict(x_valid)
            print(model.best_ntree_limit)
        elif m==4:
            model=keras.Sequential([
                layers.Dense(units=10, activation='sigmoid', input_shape=[x.shape[1]]),
                layers.Dense(units=5, activation='sigmoid'),
                layers.Dense(units=3, activation='sigmoid'),
                layers.Dense(units=1),
            ])
            model.compile(optimizer='Adam', loss='mse')
            model.fit(x_train, EW_train,batch_size=5)
            # predict returns shape (n_valid,1); flattening avoids broadcasting the residuals
            # below to an (n_valid,n_valid) matrix.
            EW_fit=np.ravel(model.predict(x_valid))
        else:
            raise ValueError("unknown model index m="+str(m))

        # removing points that are on top of each other from y_valid and its ivar
        if zeros is not None:
            EW_valid=np.delete(EW_valid,obj=zeros,axis=0)
            ivar_valid=np.delete(ivar_valid,obj=zeros,axis=0)

        # calculating spearman coefficient and nmad for fit.
        # nmad here is 1.48 times the median absolute residual.
        nmad=np.abs(EW_fit-EW_valid)

        EW_fit_all.append(EW_fit)
        EW_obs_all.append(EW_valid)
        ivar_all.append(ivar_valid)

        spearman_all.append(stats.spearmanr(EW_fit,EW_valid)[0])
        nmad_all.append(1.48*np.median(nmad))

    print(lines[l])
    print(spearman_all)
    print("av spearman = "+str(np.average(spearman_all)))
    print(nmad_all)
    print("av nmad = "+str(np.average(nmad_all)))

    print("\n")

    # saving the per-fold predictions, observations and inverse variances.
    # File names start with "logEW" when log(EW) was predicted and with "EW" otherwise.
    prefix="logEW" if loga else "EW"
    suffix="_classical_selection"+str(run_out)+"_line"+str(lines[l])+"_bins"+str(N)+"_model"+str(m)+".txt"
    for tag,per_fold in (("fit",EW_fit_all),("obs",EW_obs_all),("ivar",ivar_all)):
        np.savez_compressed(results_dir+prefix+"_"+tag+suffix,per_fold)

450905
88
84
100
128
102
116
109
108
121
117
HALPHA_EW
[0.8286405322456807, 0.8437531784409005, 0.83503858154145, 0.8388695934183125, 0.8319204025108918, 0.8305854259888491, 0.8306374543715562, 0.8343620521136281, 0.8480888446476605, 0.854672413037204]
av spearman = 0.8376568478316134
[0.23274305880069732, 0.2445535969734192, 0.23675756335258483, 0.23262191772460938, 0.23531399309635162, 0.2277190452814102, 0.22133491277694703, 0.22613953590393066, 0.23803266525268554, 0.22177841305732726]
av nmad = 0.23169947022199633


