# LRG IR selection correlations with systematics 

Use LASSO to do variable selection and find correlations of LRG density with systematics.

In [3]:
import pandas as pd
import numpy as np
import healpy as hp

import matplotlib.pyplot as plt
import matplotlib.lines as lines

from astropy.table import Table as T
from astropy.coordinates import SkyCoord

from scipy.stats import binned_statistic, iqr

from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, median_absolute_error

Load the data

In [2]:
hpTable = T.read("/home/bid13/code/desi/DESI-LASSO/data_new/heapix_map_lrg_ir_nominal_20191024_clean_combined_128.fits")
pix_area = hp.pixelfunc.nside2pixarea(128, degrees=True)

#Moving to pandas
data=hpTable.to_pandas()
data=data.dropna()
data=data.reset_index(drop=True)
data["region"] = data["region"].str.decode("utf-8")

In [4]:
#put in galactic long and lat
coords = SkyCoord(ra = data.ra, dec =data.dec, unit = "deg")
data["cos(l)"] = coords.galactic.l.radian
data["cos(b)"] =coords.galactic.b.radian

data["cos(l)"] = np.cos(data["cos(l)"])
data["cos(b)"] = np.cos(data["cos(b)"])

#The regression is weighted using the fraction of area occupied in the pixel
data["weight"] = data["pix_frac"]/data["pix_frac"].max()

data["pix_area"] = pix_area*data["pix_frac"]
data["pix_pop"] = data["density"]*data["pix_area"]

#Columns to keep
columns = ['EBV', 'galdepth_gmag', 'galdepth_rmag', 'galdepth_zmag','psfdepth_w1mag', 'PSFSIZE_G', 'PSFSIZE_R', 'PSFSIZE_Z', 'stardens_log',"cos(l)","cos(b)"]

select the two regions

In [6]:
data_bm = data[data.region=="bm"] #Bass MzLS
array_bm = np.array(data_bm[columns])

scaler_bm=StandardScaler()
scaled_bm=scaler_bm.fit_transform(array_bm)

data_de = data[data.region!="bm"] #DES+DECaLS
array_de = np.array(data_de[columns])
array_des = np.array(data[data.region=="decals"][columns])
array_decals = np.array(data[data.region=="des"][columns])

scaler_de=StandardScaler()
scaled_de=scaler_de.fit_transform(array_de)

## Calculate the Predicted Surface Density LASSO

### Analysis for BASS+MzLS Region

In [7]:
#Weighted LASSO
lasso_bm = SGDRegressor(loss="squared_loss", penalty="l1", l1_ratio=1)

#CV
num_alphas=1000
alphas=np.logspace(-5,10,num_alphas)
cv = GridSearchCV(estimator=lasso_bm, param_grid={"alpha":alphas}, n_jobs=-1, cv=5)
cv.fit(scaled_bm, data_bm.density, sample_weight=data_bm["weight"])

GridSearchCV(cv=5, error_score=nan,
             estimator=SGDRegressor(alpha=0.0001, average=False,
                                    early_stopping=False, epsilon=0.1,
                                    eta0=0.01, fit_intercept=True, l1_ratio=1,
                                    learning_rate='invscaling',
                                    loss='squared_loss', max_iter=1000,
                                    n_iter_no_change=5, penalty='l1',
                                    power_t=0.25, random_state=None,
                                    shuffle=True, tol=0.001,
                                    validation_fraction=0.1, verbose=0,
                                    warm_start=F...
       5.18459354e+09, 5.36697695e+09, 5.55577622e+09, 5.75121707e+09,
       5.95353313e+09, 6.16296626e+09, 6.37976681e+09, 6.60419396e+09,
       6.83651600e+09, 7.07701066e+09, 7.32596543e+09, 7.58367791e+09,
       7.85045620e+09, 8.12661920e+09, 8.41249705e+09, 8.70843150e+09,
     

Select $\alpha$ which maximizes $R^2$ while decreasing the fraction of predictors chosen.

In [None]:
#Find fraction of Non zero coefficients
coeffs = np.zeros((num_alphas, len(columns)))

for i, alpha in enumerate(alphas):
    lasso = SGDRegressor(alpha=alpha, loss="squared_loss", penalty="l1", l1_ratio=1)
    lasso.fit(scaledData, data.density,sample_weight=data["weight"])
    coeffs[i] = lasso.coef_
frac = (coeffs!=0).sum(axis=1)/len(columns)

### Analysis for DES+DECaLS Region

In [8]:
#Weighted LASSO
lasso_de = SGDRegressor(loss="squared_loss", penalty="l1", l1_ratio=1)
#CV
num_alphas=1000
alphas=np.logspace(-5,10,num_alphas)
cv = GridSearchCV(estimator=lasso_de, param_grid={"alpha":alphas}, n_jobs=-1, cv=5)
cv.fit(scaled_de, data_de.density, sample_weight=data_de["weight"])

GridSearchCV(cv=5, error_score=nan,
             estimator=SGDRegressor(alpha=0.0001, average=False,
                                    early_stopping=False, epsilon=0.1,
                                    eta0=0.01, fit_intercept=True, l1_ratio=1,
                                    learning_rate='invscaling',
                                    loss='squared_loss', max_iter=1000,
                                    n_iter_no_change=5, penalty='l1',
                                    power_t=0.25, random_state=None,
                                    shuffle=True, tol=0.001,
                                    validation_fraction=0.1, verbose=0,
                                    warm_start=F...
       5.18459354e+09, 5.36697695e+09, 5.55577622e+09, 5.75121707e+09,
       5.95353313e+09, 6.16296626e+09, 6.37976681e+09, 6.60419396e+09,
       6.83651600e+09, 7.07701066e+09, 7.32596543e+09, 7.58367791e+09,
       7.85045620e+09, 8.12661920e+09, 8.41249705e+09, 8.70843150e+09,
     

Select $\alpha$ which maximizes $R^2$ while decreasing the fraction of predictors chosen.