In [1]:
import numpy as np
import pylab as plt
from scipy import stats, signal
from sklearn.neighbors import KDTree
import time
from sklearn.metrics import mean_squared_error
from os import listdir
import scipy
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from tensorflow import keras
from tensorflow.keras import layers
from LLR import LLR
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
plt.rcParams['figure.figsize'] = [25, 10]

  from pandas import MultiIndex, Int64Index


In [14]:
def features_and_outcomes(x_in, y_in, n_out, ivar):
    """Build standardized colour features and the target equivalent width.

    Rows of ``x_in`` that contain any non-positive flux are dropped (the
    magnitudes below need log10 of a positive number); the same row mask is
    applied to ``y_in`` and ``ivar``.  Only the first ``n_out`` surviving rows
    are used.

    Besides its arguments, this function reads the notebook-level globals
    ``lines`` (list of line names), ``l`` (index of the target line), ``m``
    (model index) and ``loga`` (whether to take log10 of the target).

    Parameters
    ----------
    x_in : 2-D array, one row per galaxy and one column per flux bin.
    y_in : per-galaxy equivalent widths, indexable as ``y_in[i][j]`` with
        ``j`` running over ``lines``.  Assumed to be a 2-D numeric array --
        TODO confirm against the loader.
    n_out : number of galaxies to keep after the positive-flux cut.
    ivar : per-galaxy array that is filtered and truncated like ``y_in``.

    Returns
    -------
    x_out : array with ``n_out`` rows holding the differences between the
        magnitudes of adjacent bins, passed through ``StandardScaler``.  A
        leading column of ones is prepended when ``m`` is 0, 5, 6 or 7.
    y_out : length-``n_out`` array with column ``l`` of the equivalent
        widths, or its log10 when ``loga`` is true.
    ivar : the rows of ``ivar`` belonging to the same ``n_out`` galaxies.

    Raises
    ------
    ValueError
        If fewer than ``n_out`` rows have strictly positive flux in every bin.
    """
    # A galaxy is usable only if every bin has strictly positive flux.
    select_fluxes = np.all(x_in > 0, axis=1)
    n_valid = np.count_nonzero(select_fluxes)
    if n_out > n_valid:
        raise ValueError(
            "n_out = " + str(n_out) + " but only " + str(n_valid)
            + " rows have positive flux in every bin"
        )

    # Apply the same mask and the same truncation to all three inputs so the
    # returned arrays stay row-aligned.
    x_sel = x_in[select_fluxes][:n_out]
    y_sel = y_in[select_fluxes][:n_out]
    ivar = ivar[select_fluxes][:n_out]

    magnitudes = -2.5*np.log10(x_sel)
    EWs = np.asarray(y_sel, dtype=float)[:, :len(lines)]

    # Features are the differences between magnitudes of neighbouring bins,
    # standardized column by column.
    x_out = magnitudes[:, :-1] - magnitudes[:, 1:]
    x_out = StandardScaler().fit_transform(x_out)

    # These model indices get an explicit constant column in front.
    if m in (0, 5, 6, 7):
        x_out = np.concatenate((np.ones([n_out, 1]), x_out), axis=1)

    # NOTE: with loga set, non-positive equivalent widths become NaN/-inf here.
    if loga:
        y_out = np.log10(EWs[:, l])
    else:
        y_out = EWs[:, l]

    return x_out, y_out, ivar
        

In [15]:
# ---- Run configuration ----

# Index into server_paths: 0 is perlmutter, 1 is cori.
server = 1
server_paths = [
    '/pscratch/sd/a/ashodkh/',
    '/global/cscratch1/sd/ashodkh/',
]

# Names of the equivalent-width columns; `l` picks the one that is predicted.
lines = [
    "OII_DOUBLET_EW",
    "HGAMMA_EW",
    "HBETA_EW",
    "OIII_4959_EW",
    "OIII_5007_EW",
    "NII_6548_EW",
    "HALPHA_EW",
    "NII_6584_EW",
    "SII_6716_EW",
    "SII_6731_EW",
]
l = 6

# `run` is the selection number used in the input/output file names,
# `m` selects the regression model in the cross-validation loop and
# `loga` switches the target to log10 of the equivalent width.
run = 0
m = 7
loga = True

# Index into the two name lists below:
# 0 is raw_masked, 1 is raw_unmasked, 2 is fastspec, 3 is fastphot.
data = 0
data_file_names = [
    'raw_masked',
    'raw_unmasked',
    'fastspec',
    'fastphot',
]
data_flux_names = [
    'fluxes',
    'fluxes',
    'fluxes_fastspec',
    'fluxes_fastphot',
]

# Bin counts to loop over; each N appears as "_bins<N>" in the flux file names.
Ns = [6, 11, 16, 21, 26, 31, 41, 51]

# Number of 10k-galaxy files to load and combine.
decades = 3
# Redshifts, line equivalent widths and line inverse variances do not depend
# on the number of bins, so they are read once instead of once per flux file.
# NOTE(review): this directory is hard-coded to the cori path and ignores
# `server` -- confirm whether it should follow server_paths[server].
target_dir = "/global/cscratch1/sd/ashodkh/target_selection/"
selection_tag = "_selection" + str(run) + "_" + str(lines[l])
n_requested = decades*10*10**3
zs = np.load(target_dir + "zs" + selection_tag + ".txt.npz")["arr_0"][:n_requested]
target_lines = np.load(target_dir + "line_ews" + selection_tag + ".txt.npz")["arr_0"][:n_requested]
line_ivars_loaded = np.load(target_dir + "line_ivars" + selection_tag + ".txt.npz")["arr_0"][:n_requested]

flux_dir = server_paths[server] + "fluxes_from_spectra/" + data_file_names[data] + "/"
result_dir = server_paths[server] + "ew_results/" + data_file_names[data] + "/m" + str(m) + "/"
ew_prefix = "logEW" if loga else "EW"

# Number of neighbours passed to LLR.LLR for each LLR-based model index.
llr_neighbours = {0: 100, 5: 200, 6: 800, 7: 1000}

rows_per_file = 10*10**3   # fluxes are stored in groups of 10k galaxies
n_rows = 25*10**3          # total rows allocated for the combined fluxes
n_used = 23*10**3          # galaxies kept after the positive-flux cut
N_cv = 10                  # number of cross-validation folds

for N in Ns:
    # Combine the per-file fluxes; the last file only fills rows up to n_rows.
    fluxes_bin = np.zeros([n_rows, N-1])
    for i in range(decades):
        first = rows_per_file*i
        last = min(first + rows_per_file, n_rows)
        flux_file = flux_dir + data_flux_names[data] + str(i) + selection_tag + "_bins" + str(N) + ".txt.npz"
        fluxes_bin[first:last, :] = np.load(flux_file)["arr_0"]

    x, EW, line_ivars = features_and_outcomes(fluxes_bin, target_lines, n_used, line_ivars_loaded)

    x_split = np.split(x, N_cv)
    EW_split = np.split(EW, N_cv)

    EW_fit_all = []
    EW_obs_all = []

    spearman_all = []
    rms_all = []
    nmad_all = []
    nmad2_all = []
    for fold in range(N_cv):
        # Fold `fold` is the validation set; the remaining folds are the training set.
        x_valid = x_split[fold]
        EW_valid = EW_split[fold]
        x_train = np.concatenate(x_split[:fold] + x_split[fold+1:], axis=0)
        EW_train = np.concatenate(EW_split[:fold] + EW_split[fold+1:], axis=0)

        # Predict the equivalent widths with the model selected by `m`.
        if m in llr_neighbours:
            EW_fit, zeros = LLR.LLR(x_valid, x_train, EW_train, llr_neighbours[m], 'inverse_distance')
        elif m == 1:
            model = RandomForestRegressor(n_estimators=200)
            model.fit(x_train, EW_train)
            EW_fit = model.predict(x_valid)
        elif m == 2:
            model = GradientBoostingRegressor(n_estimators=100)
            model.fit(x_train, EW_train)
            EW_fit = model.predict(x_valid)
        elif m == 3:
            # NOTE(review): early stopping is driven by the validation fold that is
            # also used for scoring, so the reported metrics are optimistic.
            model = xgb.XGBRegressor(n_estimators=1000, learning_rate=0.05)
            model.fit(x_train, EW_train, early_stopping_rounds=5, eval_set=[(x_valid, EW_valid)], verbose=False)
            EW_fit = model.predict(x_valid)
            print(model.best_ntree_limit)
        elif m == 4:
            model_input = layers.Input(shape=(x.shape[1],))
            h1 = layers.Dense(units=100, kernel_initializer="he_normal")(model_input)
            a1 = layers.PReLU()(h1)
            h2 = layers.Dense(units=100, kernel_initializer="he_normal")(a1)
            a2 = layers.PReLU()(h2)
            h3 = layers.Dense(units=100, kernel_initializer="he_normal")(a2)
            a3 = layers.PReLU()(h3)
            output_layer = layers.Dense(1, activation='linear')(a3)
            model = keras.models.Model(inputs=model_input, outputs=output_layer)

            model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-3), loss='mse', metrics=['mse'])

            n_epochs = 100
            batch_size = 100
            history = model.fit(x_train, EW_train, batch_size=batch_size, epochs=n_epochs, verbose=0, validation_data=(x_valid, EW_valid))
            # predict() returns one column per output unit; flatten it so the
            # residuals below are element-wise instead of an (n, n) broadcast.
            EW_fit = model.predict(x_valid).ravel()
        else:
            raise ValueError("unknown model index m = " + str(m))

        # Absolute residuals for the scatter estimate.
        # NOTE(review): nmad2 was meant to include the measurement error, but it
        # is currently computed exactly like nmad.
        nmad = np.abs(EW_fit-EW_valid)
        nmad2 = np.abs(EW_fit-EW_valid)

        EW_fit_all.append(EW_fit)
        EW_obs_all.append(EW_valid)

        spearman_all.append(stats.spearmanr(EW_fit, EW_valid)[0])
        rms_all.append(np.sqrt(mean_squared_error(EW_fit, EW_valid)))
        nmad_all.append(1.48*np.median(nmad))
        nmad2_all.append(1.48*np.median(nmad2))

    print(lines[l])
    print(spearman_all)
    print('spearman_average= '+str(np.average(spearman_all)))
    # print(rms_all)
    # print(np.average(rms_all))
    print(nmad_all)
    print('nmad_average= '+str(np.average(nmad_all)))
    print("\n")

    # Save the per-fold predictions, the per-fold observations and the inverse
    # variances; only the prefix of the first two depends on `loga`.
    result_tag = "_" + data_file_names[data] + "_selection" + str(run) + \
                 "_line" + str(lines[l]) + "_bins" + str(N) + "_ML" + str(m) + ".txt"
    np.savez_compressed(result_dir + ew_prefix + "_fit" + result_tag, EW_fit_all)
    np.savez_compressed(result_dir + ew_prefix + "_obs" + result_tag, EW_obs_all)
    np.savez_compressed(result_dir + "line_ivars" + result_tag, line_ivars)


[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
HALPHA_EW
[0.8239675708710646, 0.8555385526783092, 0.8258609872179065, 0.7971838055003495, 0.8433848616573666, 0.8076193495129861, 0.8278853048521575, 0.8376198729781957, 0.8350893637466537, 0.845338375629018]
spearman_average= 0.8299488044644008
[0.27616096731961165, 0.2915424976341893, 0.28499469911660325, 0.3148310685551887, 0.28103630270653485, 0.3252640448439384, 0.30119539904259135, 0.31387914090594715, 0.2935715990286196, 0.3098011520767034]
nmad_average= 0.29922768712299275


[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
HALPHA_EW
[0.8610964232367259, 0.8878549866001573, 0.8763679483559038, 0.856240342677142, 0.8937045105219341, 0.8601805271977654, 0.8589338909795304, 0.8902040081753281, 0.8942228549940466, 0.8996695896474114]
spearman_average= 0.8778475082385946
[0.2639974233741571, 0.2477331137968838, 0.25123689902737395, 0.25944016249558816, 0.23437524140311414, 0.2847293554748391, 0.28305302264757276, 0.26381082513697807, 0.22665009439384154, 0.251903627299165

In [None]:
# Row position(s) at which the selected line's equivalent width is largest.
# NOTE(review): `EWs` is only assigned inside features_and_outcomes in the
# cells shown, so this needs a notebook-level `EWs` from elsewhere.
np.nonzero(EWs[:, l] == EWs[:, l].max())

In [None]:
# Number of entries that pass the mask.
# NOTE(review): `select_fluxes` is only assigned inside features_and_outcomes
# in the cells shown, so this needs a notebook-level copy from elsewhere.
np.count_nonzero(select_fluxes)

In [None]:
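# Scores recorded per flux source (the metric is not stated here; presumably the average Spearman coefficient -- confirm):
#   10k galaxies: fastspec=0.795, fastphot=0.8153, raw=0.785, raw_unmasked=0.898
#   30k galaxies: fastspec=0.787, fastphot=0.821, raw=0.785, raw_unmasked=0.9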

In [None]:
# Predicted against observed values for the first cross-validation fold.
plt.plot(EW_obs_all[0], EW_fit_all[0], '*', alpha=0.1)

In [None]:
# Loss curves are only available for the Keras model (m == 4); `history`
# is the one left over from the last fold that was fitted.
if m == 4:
    for curve in ("val_loss", "loss"):
        plt.plot(history.history[curve])


In [None]:
# Load the first spectra file and keep the rows selected by `select_fluxes`.
# NOTE(review): the flags perlmutter / cori / fastspec / fastphot / raw /
# raw_unmasked and the mask `select_fluxes` are not assigned at notebook level
# in the cells shown -- confirm where they come from before running this.
if perlmutter:
    # Only the raw spectra are read here; the fastphot and fastspec files live
    # under ".../spectra_from_targets/fastphot/" and ".../fastspec/".
    perlmutter_file = "/pscratch/sd/a/ashodkh/spectra_from_targets/raw/raw_spectra" \
        + str(0) + "_selection" + str(run) + "_" + str(lines[l]) + ".txt.npz"
    spectra = np.load(perlmutter_file)["arr_0"]
    # The mask's first 10k entries are applied to this file's rows.
    spectra = spectra[select_fluxes[:10*10**3], :]
if cori:
    cori_dir = "/global/cscratch1/sd/ashodkh/spectra_from_targets/"
    file_tail = str(0) + "_selection" + str(run) + "_" + str(lines[l]) + ".txt.npz"
    # The flags are checked in order, so the last one that is set wins.
    if fastspec:
        spectra = np.load(cori_dir + "fastspec/fastspec_spectra" + file_tail)["arr_0"]
    if fastphot:
        spectra = np.load(cori_dir + "fastphot/fastphot_spectra" + file_tail)["arr_0"]
    if raw:
        spectra = np.load(cori_dir + "raw/raw_spectra" + file_tail)["arr_0"]
    if raw_unmasked:
        # Reads the same file as the `raw` branch.
        spectra = np.load(cori_dir + "raw/raw_spectra" + file_tail)["arr_0"]
    spectra = spectra[select_fluxes[:10*10**3], :]

In [None]:
# Overlay the binned fluxes of one galaxy on its median-filtered spectrum.
# NOTE(review): needs `zs`, `fluxes_bin`, `spectra` and a notebook-level
# `select_fluxes`; `fluxes_bin` must have been built with Ns[0] bins for the
# two plotted arrays to have matching lengths -- confirm before running.
w1, w2 = 3400, 7000
d = np.average(.8/(1+zs))
Ns = [16]
pivot = []
effective_waves = []
wavelength = np.arange(3600, 9824+.8, .8)
c = 3*10**18  # presumably the speed of light in Angstrom/s -- confirm
for N in Ns:
    print(N)
    bin_ws = np.linspace(w1, w2, N)
    # Finely sampled wavelengths inside each of the N-1 bins.
    small_bins = [np.arange(lo, hi, d) for lo, hi in zip(bin_ws[:-1], bin_ws[1:])]
    pivot_bins = [np.sqrt(np.average(sb)/np.average(1/sb)) for sb in small_bins]
    effective_waves_bins = [np.average(sb) for sb in small_bins]

    pivot.append(pivot_bins)
    effective_waves.append(effective_waves_bins)

i = 4305

# `spectra` and `zs` are indexed in the row numbering left after the
# `select_fluxes` cut, so `fluxes_bin` has to be masked the same way;
# otherwise row i can belong to a different galaxy.
z_i = zs[select_fluxes][i]
fluxes_i = fluxes_bin[select_fluxes][i, :]

plt.figure(1)
plt.plot(wavelength/(1+z_i), signal.medfilt(spectra[i, :], kernel_size=15)*(1+z_i))
plt.plot(effective_waves[0], fluxes_i*c/np.array(pivot[0])**2, 'o')

In [None]:
# Overlay the binned fluxes of one galaxy on its median-filtered spectrum.
# NOTE(review): needs `zs`, `fluxes_bin`, `spectra` and a notebook-level
# `select_fluxes`; `fluxes_bin` must have been built with Ns[0] bins for the
# two plotted arrays to have matching lengths -- confirm before running.
w1, w2 = 3400, 7000
d = np.average(.8/(1+zs))
Ns = [16]
pivot = []
effective_waves = []
wavelength = np.arange(3600, 9824+.8, .8)
c = 3*10**18  # presumably the speed of light in Angstrom/s -- confirm
for N in Ns:
    print(N)
    bin_ws = np.linspace(w1, w2, N)
    # Finely sampled wavelengths inside each of the N-1 bins.
    small_bins = [np.arange(lo, hi, d) for lo, hi in zip(bin_ws[:-1], bin_ws[1:])]
    pivot_bins = [np.sqrt(np.average(sb)/np.average(1/sb)) for sb in small_bins]
    effective_waves_bins = [np.average(sb) for sb in small_bins]

    pivot.append(pivot_bins)
    effective_waves.append(effective_waves_bins)

i = 4999

# `spectra` and `zs` are indexed in the row numbering left after the
# `select_fluxes` cut, so `fluxes_bin` has to be masked the same way;
# otherwise row i can belong to a different galaxy.
z_i = zs[select_fluxes][i]
fluxes_i = fluxes_bin[select_fluxes][i, :]

plt.figure(1)
plt.plot(wavelength/(1+z_i), signal.medfilt(spectra[i, :], kernel_size=15)*(1+z_i))
plt.plot(effective_waves[0], fluxes_i*c/np.array(pivot[0])**2, 'o')