# Measuring the Galaxy-LyaForest Cross-Correlation from CLAMATO DR2

Here, we carry out the first cross-correlation measurement between the Ly-alpha forest measured by CLAMATO DR2 and coeval galaxies from MOSDEF, 3D-HST and zCOSMOS-Deep (separately for each sample). We use the simple estimator from the Font-Ribera+ 2012 DLA-forest cross-correlation paper:

\begin{equation*}
\xi_A = \frac{\sum_{i\in A} w_i \delta_{Fi}}{\sum_{i\in A} w_i},
\end{equation*}

where 

\begin{equation*}
w_i =  \left[\sigma^2_F(z_i) + \frac{\sigma_{N,i}^2}{C_i^2 \bar{F}^2(z_i)}\right]^{-1}
\end{equation*}

and $\sigma_F^2(z_i) = 0.065 [(1+z_i)/3.25]^{3.8}$.

This uses the np.histogram2d function twice, to compute the numerator and denominator of the estimator around each galaxy.

The pixel data needs to be generated using the IDL script GEN_CROSSCORR_INPUT.PRO, and the mean flux should first be calculated with CALC_MEANFLUX.IPYNB.

### Read in Ly-a forest pixels

In [None]:
import numpy as np
import time as time
import os

import lyafxcorr_kg as xcorr
import constants
constants.DATA_VERSION = 'v0'

import pandas as pd
# Set up matplotlib and use a nicer set of plot parameters
%config InlineBackend.rc = {}
import matplotlib as mpl
mpl.rc('mathtext',fontset='stixsans')
mpl.rc('figure', facecolor="white")
#matplotlib.rc_file("../../templates/matplotlibrc")
import matplotlib.pyplot as plt
#import matplotlib.colors as colors
%matplotlib inline

import astropy.table
from astropy.cosmology import FlatLambdaCDM
from astropy.io import fits
from astropy.io import ascii
from astropy.table import Table
from astropy import units as u
from astropy.coordinates import SkyCoord

import plotting
plotting.plot_preamble()

def taueff_evo(z):
    """Return the power-law effective optical depth at redshift ``z``.

    Evaluates ``0.001845 * (1 + z)**3.924``. ``z`` may be a scalar or a
    NumPy array; the result has the same shape.
    """
    amplitude = 0.001845
    slope = 3.924
    one_plus_z = 1. + z
    return amplitude * one_plus_z**slope

# Define cosmology
cosmo = constants.COSMOLOGY
zmin = 2.0
zmid = 2.3
comdist_mean = cosmo.comoving_distance(zmid)
comdist_zmin = cosmo.comoving_distance(zmin)
# Slope d(comoving distance)/dz at zmid: inverse E(z) times c/H0 = 2998 Mpc/h.
dcomdist_dz = cosmo.inv_efunc(zmid) * 2998.  # in Mpc/h
sim_boxlen_mpch = 250


def dz_to_dmpch(z):
    """Convert a redshift interval to a comoving length in Mpc/h.

    Linear conversion using the fixed slope ``dcomdist_dz`` evaluated at the
    reference redshift, so it is only meaningful for intervals near it.
    """
    return z * dcomdist_dz


def dkms_to_dmpch(v, _zref=zmid):
    """Convert a velocity interval ``v`` (km/s) to a comoving length in Mpc/h.

    ``_zref`` is bound when the function is defined. The name ``zmid`` is
    reassigned to a table column further down the notebook, and reading the
    global at call time would silently pick up that column instead of the
    scalar reference redshift.
    """
    # .value is taken without an explicit unit conversion; presumably the
    # quotient reduces to Mpc -- TODO confirm.
    return ((v * u.km / u.s) / cosmo.H(_zref)).value * cosmo.h * (1 + _zref)

# Load the Ly-a forest pixel file for the configured data version.
pixel_path = os.path.join(constants.CLAMATO_DIR_BASE, f"pixel_radecz_cl2020_{constants.DATA_VERSION}.bin")
lyapix = xcorr.lyapix(pixel_path, cosmo=cosmo)

print("Read in %i Ly-a forest pixels" % lyapix.npix)
npix = lyapix.npix

# Histogram of the pixel distances in fixed-width bins.
fig, ax = plt.subplots()
binwidth = 50
histdata = lyapix.coord.distance.value

# np.min/np.max reduce the array in compiled code; the builtin min()/max()
# would walk every pixel in a Python-level loop.
hist_edges = np.arange(np.min(histdata), np.max(histdata) + binwidth, binwidth)
ax.hist(histdata, bins=hist_edges)
plt.show()

# Carry out mean-flux correction
# Load the measured mean-flux table for the configured data version.
fmean_path = os.path.join(constants.CLAMATO_DIR_BASE, f'fmean_measured_{constants.DATA_VERSION}.dat')
fmean_str = ascii.read(fmean_path)
F_mean = fmean_str['F_mean']
# NOTE: this rebinds the module-level name `zmid` (the scalar 2.3 set earlier)
# to a column of the mean-flux table.
zmid = fmean_str['zmid']

# The correction itself is currently disabled; the two lines below are kept for reference.
#Fcorr = np.interp(lyapix.z, zmid, F_mean) / np.exp(-taueff_evo(lyapix.z))
#lyapix.delta = ((1.+lyapix.delta)/np.exp(-taueff_evo(lyapix.z)))-1.


### Read in galaxies and generate randoms
We use the catalog created with GRAB_COEVAL_GAL.IPYNB

At the same time, also generate a mock catalog.

In [None]:
# Load the unique coeval-galaxy catalog (IPAC-format table) for the configured data version.
galfil = os.path.join(constants.GAL_DIR_BASE, f'cat_galxcorr_cl2020_uniq_{constants.DATA_VERSION}.dat')
gal = ascii.read(galfil, format='ipac')

#nonuniq_gal = ascii.read(os.path.join(constants.GAL_DIR_BASE, f'cat_galxcorr_cl2020_nonuniq_{constants.DATA_VERSION}.dat'), format='ipac')

# Keep only galaxies from the surveys listed here; every other source
# (e.g. ZFIRE, 3DHST) is dropped.
included_surveys = ['CLAMATO', 'MOSDEF', 'VUDS', 'zDeep']
# A single vectorised membership test replaces the per-row Python loop
# followed by remove_rows(); row order is unchanged.
gal = gal[np.isin(gal['source'], included_surveys)]

# indices_to_drop = []
# for i in range(len(nonuniq_gal)):
#     if nonuniq_gal[i]['source'] not in included_surveys:
#         indices_to_drop.append(i)
# nonuniq_gal.remove_rows(indices_to_drop)

print(f'Stacked catalog has {len(gal)} galaxies')

# Spectroscopic-redshift catalog that carries the stellar-mass columns.
specz_cat = ascii.read(os.path.join(constants.GAL_DIR_BASE, 'all_specz_v3_comb_COSMOS2020_v3.dat'))

# Apply the same survey selection as for the galaxy catalog, using one
# vectorised membership test instead of a per-row loop plus remove_rows().
specz_cat = specz_cat[np.isin(specz_cat['source'], included_surveys)]

# Discard this catalog's own id/zspec columns and promote ID_specz to the
# integer join key `id`.
specz_cat.remove_columns(['id', 'zspec'])
specz_cat.rename_column('ID_specz', 'id')
specz_cat['id'] = specz_cat['id'].astype(int)

# Default (inner) join on `id`. Both tables carry a `source` column, so it is
# suffixed after the join (later cells read it as `source_1`).
gal = astropy.table.join(gal, specz_cat, keys='id')

print(f'After stellar mass join, galaxy catalog has {len(gal)} galaxies.')

# Use best-fit (minimum chi2) stellar masses. KG says best-fit, and also probably not much difference.
log_smass_obs = gal['Ms_best']

# Report how many entries are unusable (non-positive or NaN log mass) before removing them.
n_nonpositive = np.sum(log_smass_obs <= 0)
n_nan = np.sum(np.isnan(log_smass_obs))
print(f'{n_nonpositive} galaxies have negative log stellar masses; dropping.')
print(f'{n_nan} galaxies have NaN stellar masses; also dropping these.')

# `NaN > 0` evaluates to False, so one positivity mask removes both kinds of entry.
has_valid_mass = log_smass_obs > 0
gal = gal[has_valid_mass]
log_smass_obs = log_smass_obs[has_valid_mass]

print(f'After drops, final stacked catalog has {len(gal)} galaxies.')

In [None]:
# print(len(nonuniq_gal))
# print(len(set(nonuniq_gal['id'])))

In [None]:
# Plot histogram of stellar masses, with vertical markers at two percentiles.
pct_lower = np.percentile(log_smass_obs, 33.3)
pct_upper = np.percentile(log_smass_obs, 66.6)

plt.hist(log_smass_obs, bins=50)
plt.xlabel('Best fit log stellar mass [M_sun]')
plt.ylabel('# galaxies')
# plt.axvline(np.percentile(log_smass_obs, 25), color='black')
# plt.axvline(np.percentile(log_smass_obs, 50), color='black')
# plt.axvline(np.percentile(log_smass_obs, 75), color='black', label='25/50/75 percentiles')

plt.axvline(pct_lower, color='black')
plt.axvline(pct_upper, color='black', label='33.3/66.6 percentiles')

print(pct_lower, pct_upper)

plt.legend()

In [None]:
# Number of galaxies whose best-fit log stellar mass is below 8.
print(np.sum(log_smass_obs < 8))

In [None]:
# Per-survey stacked histograms: r-band magnitude (left) and stellar mass (right).
fig, ax = plt.subplots(1, 2, figsize=(9, 3.5), sharey=True)
# plt.figure(figsize=(5,3.5))

survey_names = sorted(set(gal['source_1']))
survey_smass = [log_smass_obs[gal['source_1'] == s] for s in survey_names]
survey_rmag = [gal[gal['source_1'] == s]['rmag'] for s in survey_names]

plt.sca(ax[0])
print(f'{np.sum(gal["rmag"] < 0)} gals have -99 (blank) R-band magnitude.')
# Start the histogram range at the faintest-valid end so the -99 placeholders are excluded.
min_nonneg_rmag = np.min(gal[gal['rmag'] > 0]['rmag'])
plt.hist(survey_rmag, stacked=True, bins=50, alpha=0.4, label=survey_names, range=(min_nonneg_rmag, np.max(gal['rmag'])))
plt.xlabel('HSC r-band apparent magnitude')
plt.ylabel(r'$N_{gal}$')

plt.sca(ax[1])
plt.hist(survey_smass, stacked=True, bins=50, alpha=0.4, label=survey_names)
# for s in set(gal['source_1']):
#     plt.hist(log_smass_obs[gal['source_1'] == s], bins=50, alpha=0.4, label=s)
plt.xlabel(r'$\log_{10}(M_* / M_\odot)$')
# plt.axvline(np.percentile(log_smass_obs, 25), color='black')
# plt.axvline(np.percentile(log_smass_obs, 50), color='black')
# plt.axvline(np.percentile(log_smass_obs, 75), color='black', label='25/50/75 percentiles')

# Mark the stellar-mass bin edges with the same quantiles (1/3 and 2/3) that
# define smass_bin_boundaries, rather than the rounded 33.3/66.6 percentiles.
for tercile in (1/3, 2/3):
    plt.axvline(np.quantile(log_smass_obs, tercile), color='black', ls='--')

plt.legend()
plt.tight_layout()
plt.savefig(os.path.join(constants.FIG_DIR_BASE, 'split-gal-sample.png'), dpi=200)
plt.savefig(os.path.join(constants.FIG_DIR_BASE, 'split-gal-sample.pdf'))

In [None]:
# Number of galaxies with a negative r-band magnitude entry.
np.sum(gal['rmag'] < 0)

In [None]:
# Of the galaxies with a negative r-band magnitude entry, how many have source_1 == 'MOSDEF'?
print(np.sum(gal[gal['rmag'] < 0]['source_1'] == 'MOSDEF'))

In [None]:
# Three stellar-mass bins split at the 1/3 and 2/3 quantiles; the outer edges
# are open-ended so every galaxy falls in exactly one bin.
lower_tercile = np.quantile(log_smass_obs, 1/3)
upper_tercile = np.quantile(log_smass_obs, 2/3)
smass_bin_boundaries = [-np.inf, lower_tercile, upper_tercile, np.inf]

# Labels for the bins, taken from the project constants.
bin_titles = constants.STACKED_BIN_TITLES

## Approximate "combined" mock z-dispersion & offset

For the bias-HM pipeline where we use Vega mocks, since our combined bias is across surveys, draw a z-offset/disp using the mock covar parameters (close to fitted values), then try fitting a combined Gaussian to the combined unique catalog.

In [None]:
# v14-final from mocksurvey_galgen.ipynb
# Per-survey (mean, sigma) pairs, used below as loc/scale of a normal distribution.
survey_zparams = dict(
    MOSDEF=(-0.84, 1.91),
    zDeep=(-1.87, 3.88),
    # These differ from above 'converged' values since they're the output posteriors of the above values.
    VUDS=(-2.36, 2.64),
    CLAMATO=(-2.12, 2.20),
)

# Fixed seed so the random draws are reproducible.
rng = np.random.default_rng(seed=4892347589)

# Number of catalog galaxies attributed to each survey.
survey_ngal = {name: np.sum(gal['source_1'] == name) for name in survey_zparams}
print(survey_ngal)

In [None]:
def combined_survey_realization():
    """Draw one random realization of the combined-survey sample.

    For every survey in ``survey_zparams`` (in dictionary order), draws
    ``survey_ngal[survey]`` normal deviates with that survey's mean and
    sigma from the shared generator ``rng``.

    Returns:
        1-D array holding the draws of all surveys, concatenated.
    """
    draws = [
        rng.normal(loc=offset, scale=dispersion, size=survey_ngal[name])
        for name, (offset, dispersion) in survey_zparams.items()
    ]
    return np.concatenate(draws)

n_realizations = 1000

# Pool many independent realizations into a single long sample.
realization_draws = [combined_survey_realization() for _ in range(n_realizations)]
bootstrap_combined_survey_z = np.concatenate(realization_draws)

plt.hist(bootstrap_combined_survey_z, bins=100)

# Mean and standard deviation (NumPy default, ddof=0) of the pooled draws.
pooled_mean = np.mean(bootstrap_combined_survey_z)
pooled_std = np.std(bootstrap_combined_survey_z)
print(pooled_mean, pooled_std)

### Read in bin edges 

In [None]:
# Text files holding the line-of-sight (pi) and transverse (sigma) bin edges.
PiBin_fil = os.path.join(constants.XCORR_DIR_BASE, 'bins23_pi_0-30hMpc.txt')
SigBin_fil = os.path.join(constants.XCORR_DIR_BASE, 'bins10_sigma_0-30hMpc.txt')

PiBins0 = ascii.read(PiBin_fil)
SigBins0 = ascii.read(SigBin_fil)

PiEdges = PiBins0['pi_edges'].data
SigEdges = SigBins0['sigma_edges'].data

# Convert bin boundaries from Mpc/h to Mpc.
# Dividing by the scalar h broadcasts over the whole array, so there is no
# need to build a list of repeated h values as the divisor.
PiEdges = PiEdges / cosmo.h
SigEdges = SigEdges / cosmo.h

print('Pi bin edges in Mpc:')
print(PiEdges)
print('Sigma bin edges in Mpc:')
print(SigEdges)


# Smallest and largest pi edge.
PiBound = (min(PiEdges), max(PiEdges))

### Compute Cross-Correlation For Stellar Mass Bins (Stacked catalog)


In [None]:
base_dir = os.path.join(constants.XCORR_DIR_BASE, 'stacked')
# np.save() does not create missing directories, so make sure the output
# directory exists (the split-survey cell does the same for its directory).
os.makedirs(base_dir, exist_ok=True)

# One (lower edge, upper edge, mean log-mass, SkyCoord) tuple per stellar-mass bin.
binned_Coord = []

for lb, ub in zip(smass_bin_boundaries[:-1], smass_bin_boundaries[1:]):
    # Half-open interval [lb, ub) so that no galaxy is counted in two bins.
    mask = (log_smass_obs >= lb) & (log_smass_obs < ub)
    masked_cat = gal[mask]
    avg_smass = np.mean(log_smass_obs[mask])
    print(f'Log smass {lb} - {ub} | # gal {np.sum(mask)} | Average log smass {avg_smass} | Median log smass {np.median(log_smass_obs[mask])}')
    binned_Coord.append((lb, ub, avg_smass, SkyCoord(ra=masked_cat['ra'], dec=masked_cat['dec'],
                                                     distance=cosmo.comoving_distance(masked_cat['zspec']))))

# The sigma bin centres do not depend on the mass bin; compute them once.
SigCenters = (SigEdges[1:] + SigEdges[:-1]) / 2

for (smass_lb, smass_ub, avg_smass, Coord), title in zip(binned_Coord, bin_titles):
    XCorr, _ = xcorr.xcorr_gal_lya(Coord, lyapix, SigEdges, PiEdges, cosmo=cosmo)
    np.save(os.path.join(base_dir, f"xcorr_stacked_{title}_globalf_{constants.DATA_VERSION}.npy"), XCorr.value)

    # Plotting code. For 1D, since 2D cross-correlations are sus due to mixing different surveys.
    plt.plot(SigCenters, np.sum(XCorr, axis=1), label=f'{avg_smass:.2f}')

plt.xlabel(r'$\sigma\; (\mathrm{cMpc})$')
plt.ylabel('Parallel-summed cross-correlation')
plt.legend(title='Average log-smass of bin [M_sun]')

### Compute Cross-Correlation For Stellar Mass Bins (Separate surveys)

In [None]:
# Of interest because fitted bias is very low. Want to see if it lies within the COSTCO-I footprint.
mosdef_medium_mask = None

# The output directory is the same for every survey, so create it once up front.
base_dir = os.path.join(constants.XCORR_DIR_BASE, 'split')
os.makedirs(base_dir, exist_ok=True)

for survey in included_surveys:
    # File names use the lower-cased survey name, except 'zDeep', which keeps its capitalisation.
    survey_filename_fmt = survey.lower() if survey != 'zDeep' else survey

    # Survey membership and size do not change across mass bins; compute them once per survey.
    in_survey = gal['source_1'] == survey
    n_in_survey = np.sum(in_survey)

    binned_Coord = []
    bin_proportions = []
    for i, (lb, ub) in enumerate(zip(smass_bin_boundaries[:-1], smass_bin_boundaries[1:])):
        # Half-open mass interval [lb, ub), restricted to this survey.
        mask = (log_smass_obs >= lb) & (log_smass_obs < ub) & in_survey
        if i == 1 and survey == 'MOSDEF':
            # Remember the middle-mass MOSDEF selection for the protocluster checks below.
            mosdef_medium_mask = mask
        masked_cat = gal[mask]
        avg_smass = np.mean(log_smass_obs[mask])
        print(f'Survey {survey} | Log smass {lb} - {ub} | # gal {np.sum(mask)} | Median log smass {np.median(log_smass_obs[mask])} ')
        binned_Coord.append((lb, ub, avg_smass, SkyCoord(ra=masked_cat['ra'], dec=masked_cat['dec'],
                                                         distance=cosmo.comoving_distance(masked_cat['zspec']))))
        # Fraction of this survey's galaxies that fall in the current mass bin.
        bin_proportions.append(np.sum(mask) / n_in_survey)
    # Sanity check: the bins partition the survey, so the fractions must add up to one.
    assert np.isclose(np.sum(bin_proportions), 1)
    print(bin_proportions)

    for (_, _, _, Coord), bin_prop, title in zip(binned_Coord, bin_proportions, bin_titles):
        XCorr, _ = xcorr.xcorr_gal_lya(Coord, lyapix, SigEdges, PiEdges, cosmo=cosmo)
        np.save(os.path.join(base_dir, f"xcorr_{survey_filename_fmt}_{title}_globalf_{constants.DATA_VERSION}.npy"), XCorr.value)
        np.save(os.path.join(base_dir, f"binprop_{survey_filename_fmt}_{title}_{constants.DATA_VERSION}.npy"), bin_prop)

## Do MOSDEF medium-mass galaxies lie close to COSTCO-I?

We check this because the fitted bias for this specific bin/survey is very low.

In [None]:
# Load the COSTCO-I member list and make sure its ID column is integer-typed.
costco_member_path = os.path.join(constants.GAL_DIR_BASE, 'COSTCO_I_member.csv')
costco_gal_cat = pd.read_csv(costco_member_path)
costco_ids_as_int = costco_gal_cat['ID'].astype(int)
costco_gal_cat['ID'] = costco_ids_as_int

In [None]:
# Rows selected by the MOSDEF middle-mass-bin mask saved in the split-survey loop.
mosdef_medium_gal = gal[mosdef_medium_mask]

In [None]:
# Display the selected MOSDEF galaxies for inspection.
mosdef_medium_gal

In [None]:
# Display the COSTCO-I member table for inspection.
costco_gal_cat

In [None]:
# Do any IDs match?
# Prints one boolean per COSTCO-I member: True where its ID appears among the selected MOSDEF galaxy ids.
print(costco_gal_cat['ID'].isin(mosdef_medium_gal['id']))

In [None]:
# Are MOSDEF medium-mass galaxies close to the COSTCO cluster?
# NOTE(review): dz_to_dmpch(z) is z multiplied by a fixed slope evaluated at
# z = 2.3, not cosmo.comoving_distance(z) as used for the cross-correlation
# coordinates. All three coordinate sets below share this convention, but it
# also sets the scale of transverse separations -- confirm that this
# linearised distance is what is intended here.
# Hard-coded position (RA, Dec in degrees; redshift 2.298) used as the cluster centre.
costco_skycoord = SkyCoord(ra=150.11 * u.deg, dec=2.161 * u.deg, distance=dz_to_dmpch(2.298) / cosmo.h * u.Mpc)

# Positions of the individual COSTCO-I members from the csv table.
costco_gal_skycoord = SkyCoord(ra=costco_gal_cat['RA'] * u.deg, dec=costco_gal_cat['Dec'] * u.deg, distance=dz_to_dmpch(costco_gal_cat['zspec']) / cosmo.h * u.Mpc)

# Positions of the selected MOSDEF galaxies; ra/dec are passed without explicit
# units, so presumably the table columns already carry them -- TODO confirm.
mosdef_medium_skycoord = SkyCoord(ra=mosdef_medium_gal['ra'], dec=mosdef_medium_gal['dec'], distance=dz_to_dmpch(mosdef_medium_gal['zspec']) / cosmo.h * u.Mpc)

In [None]:
# Display the cluster-centre coordinate for inspection.
costco_skycoord

In [None]:
# Display the member-galaxy coordinates for inspection.
costco_gal_skycoord

In [None]:
# 3D separations between the cluster centre and each COSTCO-I member, sorted ascending.
sorted(costco_skycoord.separation_3d(costco_gal_skycoord))

In [None]:
# 3D separations (in Mpc) from the COSTCO-I centre to every selected MOSDEF
# galaxy; computed once and reused for both the printout and the histogram.
costco_mosdef_sep3d = costco_skycoord.separation_3d(mosdef_medium_skycoord).to(u.Mpc).value
print(sorted(costco_mosdef_sep3d))
plt.hist(costco_mosdef_sep3d)
plt.xlabel('3D separation between COSTCO-I protocluster center and MOSDEF medium-mass galaxies (Mpc)');

In [None]:
# On-sky (angular) separations, in degrees, from the COSTCO-I centre to every selected MOSDEF galaxy.
costco_mosdef_sep_deg = costco_skycoord.separation(mosdef_medium_skycoord).to(u.deg).value
plt.hist(costco_mosdef_sep_deg)
plt.xlabel('Angular separation between COSTCO-I protocluster center and MOSDEF medium-mass galaxies (deg)');

## ZFIRE Protocluster

In [None]:
# Hard-coded position (RA, Dec in degrees; redshift 2.095) used as the ZFIRE protocluster centre.
# NOTE(review): the distance uses dz_to_dmpch (z times a fixed slope), matching
# how mosdef_medium_skycoord is built rather than cosmo.comoving_distance -- confirm intended.
zfire_skycoord = SkyCoord(ra=150.094 * u.deg, dec=2.251 * u.deg, distance=dz_to_dmpch(2.095) / cosmo.h * u.Mpc)

In [None]:
# 3D separations (in Mpc) from the ZFIRE centre to every selected MOSDEF
# galaxy; computed once and reused for both the printout and the histogram.
zfire_mosdef_sep3d = zfire_skycoord.separation_3d(mosdef_medium_skycoord).to(u.Mpc).value
print(sorted(zfire_mosdef_sep3d))
plt.hist(zfire_mosdef_sep3d)
plt.xlabel('3D separation between ZFIRE protocluster center and MOSDEF medium-mass galaxies (Mpc)');