# ~~Exploring~~ Preparing the __SLICS-HR__ particle data
notebook by _Alex Malz (GCCL@RUB)_, (add your name here)

In [None]:
import astropy as ap
from astropy.cosmology import FlatLambdaCDM
import matplotlib as mpl
import matplotlib.pyplot as plt
import multiprocessing as mp
import numpy as np
import os
import pandas as pd
from scipy.stats import gaussian_kde

%matplotlib inline

## Some important constants

We're only considering the first 7 SLICS snapshots because the GAMA data doesn't have good enough coverage beyond that.

In [None]:
# Redshifts of the SLICS snapshots kept for this analysis.
z_SLICS = np.array([0.042, 0.080, 0.130, 0.221, 0.317, 0.418, 0.525])
#, 0.640, 0.764, 0.897, 1.041, 1.199, 1.372, 1.562, 1.772, 2.007, 2.269, 2.565, 2.899])
# Bin edges: midpoints between neighbouring snapshots, closed off by fixed outer limits.
z_mids = (z_SLICS[:-1] + z_SLICS[1:]) / 2.
z_bins = np.concatenate(([0.023], z_mids, [3.066]))

In [None]:
# number of MPI tasks per dimension
# (file indices are decoded on a nodes_dim x nodes_dim x nodes_dim grid)
nodes_dim = 4

# volume size
# side of the full volume in the units the particle positions are stored in — TODO confirm
rnc = 3072.

# subvolume size
# side of one per-task subvolume, in the same units as rnc
ncc = rnc / nodes_dim

# physical scale in Mpc/h
# side of the full volume; positions are rescaled by phys_scale / rnc
phys_scale = 505.

## How much data do we need?

The SLICS cosmology has $\Omega_{m} = 0.2905$, $\Omega_{\Lambda} = 0.7095$, $\Omega_{b} = 0.0473$, $h = 0.6898$, $\sigma_{8} = 0.826$, and $n_{s} = 0.969$.
Let's assume the naive Cartesian-to-angular coordinates and flatten along the `z` direction.

In [None]:
# depth of a single SLICS file along the line of sight: one quarter of 500e6 pc
# NOTE(review): phys_scale is set to 505. (Mpc/h) in this notebook while 500 is used here — confirm which is intended
boxdepth = 500.e6 / 4. * ap.units.pc
# boxdepth per file, not per redshift integrated!

# dimensionless Hubble parameter of the SLICS cosmology
h = 0.6898
# flat LambdaCDM model with the SLICS matter and baryon densities
cosmo = FlatLambdaCDM(H0=100.*h, Om0=0.2905, Ob0=0.0473)

### line-of-sight integration depth

We need to ensure a correspondence between the redshift range of the GAMA data about each $z_{SLICS}$ and the depth of SLICS data to integrate over when projecting the 3D density into 2D sky coordinates.
For simplicity, let's do this along one of the Cartesian axes of the SLICS data even though it can justifiably be done along any line of sight projection.
It doesn't matter which is chosen first between the GAMA redshift bin ends and the SLICS projection depth, but life is easier if we don't have to combine multiple SLICS files, meaning that depth should be $depth_{physical} \leq \frac{1}{4} \times 500 Mpc/h$.
In fact, it would be easiest to take $depth_{physical} = \frac{1}{4} \times 500 Mpc/h$.
The next step would be to calculate the redshift bin endpoints for GAMA under the SLICS cosmology.

In [None]:
# Comoving distance to every snapshot, divided by h and expressed in pc.
d_comov = np.array([cosmo.comoving_distance(float(z)).value / h for z in z_SLICS])
d_comov = d_comov * 1e6 * ap.units.pc

# Near and far faces of a slab of thickness boxdepth centred on each snapshot.
half_depth = boxdepth / 2.
d_comov_mins = d_comov - half_depth
d_comov_maxs = d_comov + half_depth

# Redshifts at which the cosmology places those faces (undoing the division by h).
min_zs = []
max_zs = []
for d_near, d_far in zip(d_comov_mins, d_comov_maxs):
    min_zs.append(ap.cosmology.z_at_value(cosmo.comoving_distance, d_near * h))
    max_zs.append(ap.cosmology.z_at_value(cosmo.comoving_distance, d_far * h))
# print(list(zip(min_zs, max_zs)))

### area

Area will be determined by the maximum distance over which colors are correlated.
We expect the characteristic scale to not be significantly larger than a galaxy cluster, $diameter \leq 10 Mpc$.
For now, let's use a placeholder.

In [None]:
# placeholder for the largest diameter over which colors are taken to be correlated, in pc
dmax = 10.e6 * ap.units.pc
# matching radius, divided by h
rmax = dmax / h / 2.
# midpoint between each slab's near and far face, divided by (1 + z) of the snapshot
d_ang = (d_comov_maxs + d_comov_mins) / 2. / (1 + z_SLICS)

### particle density field bubble locations

We only need mock galaxy positions corresponding to areas over which galaxy colors are correlated.
Let's assume we have this function $radius(z)$ evaluated at each $z_{SLICS}$ where there is sufficient GAMA data.
Further, let's assume that $radius(z) \ll \frac{1}{4} \times 500 Mpc/h$ so we don't have to combine multiple SLICS files or worry about boundaries within SLICS files.
We can choose to center our cylinders of data where the projected density field is highest, so a first step is to identify some such points.


Read in from binary float(4) format.

In [None]:
# One record is six 4-byte floats: x, y, z followed by vx, vy, vz.
dt_each = 'f' + str(4)
dt = np.dtype([(field, dt_each) for field in ('x', 'y', 'z', 'vx', 'vy', 'vz')])

In [None]:
# Side of one subvolume: first in the stored position units, then as a length in pc.
cubeside = rnc / nodes_dim
subvolume_side = phys_scale * 1.e6 * ap.units.pc / nodes_dim
# rmax rescaled by the ratio of the two subvolume sides, and half of that.
dmin = cubeside * rmax / subvolume_side
rmin = dmin / 2.
# How many lengths of rmax fit along one subvolume side; used as the coarse histogram bin count.
resolution = subvolume_side / rmax

Throw out the first 12 values (the first 2 six-float records) as unwanted header information.

In [None]:
def help_read(which_z, fn_index):
    """Load one SLICS particle file into a DataFrame without duplicate rows.

    Parameters
    ----------
    which_z : int
        Index into ``z_SLICS`` selecting the snapshot redshift.
    fn_index : int
        Number of the subvolume file at that redshift.

    Returns
    -------
    pandas.DataFrame
        Columns ``x, y, z, vx, vy, vz``; rows that exactly repeat an earlier
        row are removed (the first occurrence is kept).
    """
    # file names start with the redshift, right-padded with zeros to 5 characters
    z_str = '{:<05}'.format(str(z_SLICS[which_z]))
    fn_base = 'xv'
    fn_ext = '.dat'
    fn = z_str + fn_base + str(fn_index) + fn_ext
    data_dir = 'particle_data/cuillin.roe.ac.uk/~jharno/SLICS/SLICS_HR/LOS1'
    with open(os.path.join(data_dir, fn), 'rb') as f1:
        raw_data = np.fromfile(f1, dtype=dt)
    # the first 2 records (12 floats) are header information, not particles
    all_data = pd.DataFrame(data=raw_data[2:], columns=['x', 'y', 'z', 'vx', 'vy', 'vz'])
    # scan for duplicates once on the frame that actually exists, then reuse the
    # mask both for the report and for the removal
    is_duplicate = all_data.duplicated()
    if is_duplicate.any():
        print('duplicates found in z='+str(z_SLICS[which_z])+' box='+str(fn_index)+'!')
    loc_data = all_data[~is_duplicate]
    return(loc_data)

Sadly, throwing out duplicates is slow (why???), but it really should be done before the coarse histogram so has to happen on the whole file.

TODO: combine z coordinates over multiple files

In [None]:
# testz = 2
# testfn = 21

In [None]:
# loc_data = help_read(which_z=testz, fn_index=testfn)

We want to select in the native coordinates of the data so we only perform the conversion to physical coordinates for the portions of data we're going to use.
The following cell is a bit slow.


In [None]:
# (coarsedensity, xedges, yedges) = np.histogram2d(loc_data['x'], loc_data['y'], bins=int(resolution))

TODO: finedensity to pick sane bubble centers

In [None]:
# plt.hist(coarsedensity.flatten(), bins=int(resolution))
# plt.semilogx()
# plt.savefig('densitydist.png')

In [None]:
# extreme = np.quantile(coarsedensity.flatten(), 0.99)

In [None]:
# indcenters = np.argwhere(coarsedensity > extreme)
# xcenters = (xedges[indcenters.T[0]] + xedges[indcenters.T[0]+1]) / 2
# ycenters = (yedges[indcenters.T[1]] + yedges[indcenters.T[1]+1]) / 2

In [None]:
# fig = plt.figure(figsize=(5, 5))
# ax = fig.add_subplot(111, title='projected histogram', aspect='equal')
# X, Y = np.meshgrid(xedges, yedges)
# ax.pcolormesh(X, Y, np.log(coarsedensity))
# plt.scatter(ycenters, xcenters, c='r', s=10, alpha=0.5)
# fig.savefig('loghistogram.png')

In [None]:
# bubbles = []
# for i, center in enumerate(indcenters):
#     bubble = loc_data.loc[lambda df: (df['x'] > xcenters[i] - rmin) & (df['x'] < xcenters[i] + rmin)
#                           & (df['y'] > ycenters[i] - rmin) & (df['y'] < ycenters[i] + rmin), :]
#     assert(~bubble.duplicated().any())
#     bubbles.append(bubble)

In [None]:
# plt.hist2d(loc_data['x'], loc_data['y'], bins=(200, 200), norm=mpl.colors.LogNorm(), cmap='Spectral_r')

In [None]:
# loc_to_plot = loc_data.sample(50000)
# plt.scatter(loc_to_plot['x'], loc_to_plot['y'], marker='.', s=1, alpha=0.5)

### Convert to physical units

The particle data starts out in simulation units relative to the per-node subvolume and needs to be converted to physical units in the space of all subvolumes before the whole volume can be considered.

Note that the conversion below makes sense for `x`, `y`, and `z` but not for `vx`, `vy`, and `vz`.
Because of how the data is distributed across the files, I think 21, 22, 25, 26, 37, 38, 41, 42 are "adjacent" and free of edge effects.
_Note_: We can just have this be an automated check, knowing that files are adjacent when their `node_coords` are the same aside from being off by one in one of their dimensions.

In [None]:
def help_find_coords(fn_index):
    """Return the subvolume grid position encoded by a file index.

    The index is read as ``x + y * nodes_dim + z * nodes_dim ** 2`` with each
    coordinate in ``range(nodes_dim)``.

    Returns
    -------
    dict or None
        ``{'x': ..., 'y': ..., 'z': ...}`` for a matching index, ``None`` when
        ``fn_index`` corresponds to no subvolume.
    """
    if fn_index not in range(nodes_dim ** 3):
        return None
    yz_part, x_coord = divmod(int(fn_index), nodes_dim)
    z_coord, y_coord = divmod(yz_part, nodes_dim)
    return {'x': x_coord, 'y': y_coord, 'z': z_coord}
# print(all_nodes_coords)

In [None]:
# true_node_coords = help_find_coords(fn_index=testfn)

To get coherent coordinates across all files, we need to shift them accordingly.

In [None]:
def help_shift(true_node_coords, loc_data):
    """Shift file-local positions into the coordinate frame shared by all files.

    Parameters
    ----------
    true_node_coords : mapping
        Grid position of the file's subvolume under the keys ``'x'``, ``'y'``
        and ``'z'`` (as returned by ``help_find_coords``).
    loc_data : pandas.DataFrame
        Particle data with at least the columns ``x``, ``y`` and ``z``.

    Returns
    -------
    pandas.DataFrame
        A new frame with shifted ``x``, ``y``, ``z``; ``loc_data`` is left
        untouched, so calling this twice on the same input is safe.
    """
    # shift data
    # copy first: assigning through an alias would modify the caller's frame
    glob_data = loc_data.copy()
    for col in ['x', 'y', 'z']:
        shifted = np.remainder(loc_data[col] + true_node_coords[col] * ncc, rnc)
        # compare the largest shifted value with the volume size; an empty
        # selection has no maximum and is trivially in range
        assert len(shifted) == 0 or shifted.max() <= rnc
        glob_data[col] = shifted
    return(glob_data)

In [None]:
# globs = []
# for loc_data in bubbles:
#     glob_data = help_shift(loc_data)
#     globs.append(glob_data)

In [None]:
def help_convert(glob_data):
    """Rescale values by ``phys_scale / rnc`` to express them in Mpc/h.

    The factor is applied to everything in ``glob_data``; nothing is selected
    by column here.
    """
    # convert to Mpc/h
    scaled = glob_data * phys_scale
    return scaled / rnc

In [None]:
# pos_mpc = []
# for glob_data in globs:
#     phys_data = help_convert(glob_data)
#     pos_mpc.append(phys_data)

In [None]:
# for dim in ['x', 'y', 'z']:
#     plt.hist(phys_data[dim], density=True, alpha=0.5)
# plt.xlabel('distance (Mpc/h)')

In [None]:
# plt.hist2d(phys_data['x'], phys_data['y'], bins=(200,200), norm=mpl.colors.LogNorm(), cmap='Spectral_r')

In [None]:
# phys_to_plot = phys_data.sample(50000)
# plt.scatter(phys_to_plot['x'], phys_to_plot['y'], marker='.', s=1, alpha=0.5)

Now convert physical units to angular units:

In [None]:
# pos_ang = []
# for phys_data in pos_mpc:
#     ang_data = pd.DataFrame()
#     ang_data['RA'] = phys_data['x']# / d_ang.value[testz] * 180. / np.pi
#     ang_data['DEC'] = phys_data['y']# / d_ang.value[testz] * 180. / np.pi
#     pos_ang.append(ang_data.drop_duplicates())

In [None]:
# fig, axs = plt.subplots(1, len(pos_ang), figsize=(5*len(pos_ang), 5))
# for i, pos in enumerate(pos_ang):
#     axs[i].scatter(pos['RA'], pos['DEC'], s=0.1, c='k', alpha=0.1)
# # plt.savefig('bubbles.png')

In [None]:
# for i, pos in enumerate(pos_ang):
#     pos.to_csv('../DEAR/Data/bubbles/z'+str(testz)+'box'+str(testfn)+'bubble'+str(i)+'.csv', index=False)

These still look like they're cutting through the high-density regions, so I'm going to add an intermediate step to pick the bubble centers on a fine grid evaluated only within the vicinity of the densest of the coarse grid centers.

## Now, as a parallelized pipeline

In [None]:
# Snapshot indices (into z_SLICS) and SLICS file numbers to run the pipeline on.
which_snapshots = [2]  # alternative: range(len(z_SLICS))
which_boxes = [22]  # other candidates: 25, 26, 37, 38, 41, 42
#Note: the box list will change a lot when SLICS files are combined

# Map 'z<i>box<j>' -> (i, j) for every requested combination.
bubble_combos = {}
for i in which_snapshots:
    for j in which_boxes:
        bubble_combos['z'+str(i)+'box'+str(j)] = (i, j)

pathname = '../DEAR/Data/bubbles/'

In [None]:
def isolate_one_bubble(one_key):
    """Cut square patches around the densest coarse cells of one SLICS file.

    The file selected by ``bubble_combos[one_key]`` is histogrammed in x/y on a
    coarse grid; every cell above the 99th percentile of counts becomes a
    patch centre.  All particles within ``rmin`` of a centre in both x and y
    are written to ``<pathname>z<i>/box<j>/bub<k>/particles.csv``.

    Parameters
    ----------
    one_key : str
        Key of ``bubble_combos`` identifying the snapshot and file.

    Returns
    -------
    None
        Output goes to disk.  A box directory that already holds files is
        skipped without loading anything.
    """
    (testz, testfn) = bubble_combos[one_key]
    print('starting z='+str(z_SLICS[testz]))
    zpath = pathname+'z'+str(testz)
    boxpath = zpath+'/box'+str(testfn)
    # makedirs creates missing parent directories and tolerates a directory
    # that already exists (e.g. made by another worker in the meantime)
    os.makedirs(boxpath, exist_ok=True)
    if os.listdir(boxpath):
        print('not-rerunning z='+str(z_SLICS[testz])+' box='+str(testfn))
        return None
    #could use this as a place to not have to reload SLICS data file, instead save bubble before doing finedensity
    print('starting box='+str(testfn)+', loading SLICS data (the slow step)')
    loc_data = help_read(which_z=testz, fn_index=testfn)
    print('loaded SLICS data, identifying bubble centers')
    (coarsedensity, xedges, yedges) = np.histogram2d(loc_data['x'], loc_data['y'], bins=int(resolution))
    extreme = np.quantile(coarsedensity.flatten(), 0.99)
    indcenters = np.argwhere(coarsedensity > extreme)
    #finedensity step would go here
    # centre of each selected cell: midpoint of its two bounding edges
    xcenters = (xedges[indcenters.T[0]] + xedges[indcenters.T[0]+1]) / 2
    ycenters = (yedges[indcenters.T[1]] + yedges[indcenters.T[1]+1]) / 2
    print('identified bubble centers, going through each bubble')
    for i, (xcen, ycen) in enumerate(zip(xcenters, ycenters)):
        bubpath = boxpath+'/bub'+str(i)
        os.makedirs(bubpath, exist_ok=True)
        in_bubble = ((loc_data['x'] > xcen - rmin) & (loc_data['x'] < xcen + rmin)
                     & (loc_data['y'] > ycen - rmin) & (loc_data['y'] < ycen + rmin))
        bubble = loc_data.loc[in_bubble, :]
        # `not` gives the logical negation for both Python and numpy booleans
        if not bubble.duplicated().any():
            print('no duplicate coordinates in x, y, z, vx, vy, vz')
        bubble.to_csv(bubpath+'/particles.csv', index=False)
        print('saved bubble particles to not have to load whole SLICS file again, next transform data')
    return
    

In [None]:
# leave one core free, but never ask for fewer than one worker
# (mp.Pool refuses a process count of zero, which a single-core machine would give)
nps = max(1, mp.cpu_count() - 1)
# the with-block shuts the worker processes down once the blocking map has returned
with mp.Pool(nps) as pool:
    pool.map(isolate_one_bubble, bubble_combos.keys())

In [None]:
def project_one_bubble(one_key):
    """Project every saved patch of one SLICS file onto angular coordinates.

    For each ``bub*`` directory under ``<pathname>z<i>/box<j>`` the saved
    ``particles.csv`` is shifted to the shared coordinate frame, rescaled with
    ``help_convert`` and divided by ``d_ang`` of the snapshot to give RA/DEC
    in degrees, which are written to ``projection.csv`` next to the input.

    Parameters
    ----------
    one_key : str
        Key of ``bubble_combos`` identifying the snapshot and file.

    Returns
    -------
    None
        Output goes to disk.
    """
    (testz, testfn) = bubble_combos[one_key]
    true_node_coords = help_find_coords(fn_index=testfn)
    boxpath = pathname+'z'+str(testz)+'/box'+str(testfn)
    # NOTE(review): d_ang carries units of pc where it is built from d_comov,
    # while help_convert is commented as producing Mpc/h — confirm the units agree
    d_ang_here = d_ang.value[testz]
    for bubdir in os.listdir(boxpath):
        fullpath = boxpath+'/'+bubdir+'/'
        bubble = pd.read_csv(fullpath+'particles.csv')
        glob_data = help_shift(true_node_coords, bubble)
        phys_data = help_convert(glob_data)
        ang_data = pd.DataFrame()
        ang_data['RA'] = phys_data['x'] / d_ang_here * 180. / np.pi
        ang_data['DEC'] = phys_data['y'] / d_ang_here * 180. / np.pi
        # look at the projected frame: it is the one in which z, vx, vy, vz
        # have been dropped, so only there can new duplicates appear
        if ang_data.duplicated().any():
            print('duplicate particles introduced by dropping z, vx, vy, vz')
        ang_data.to_csv(fullpath+'projection.csv', index=False)
        print('shifted from machine, converted to physical, projected to angular coordinates, and saved '+str(len(ang_data)))
    return

In [None]:
# leave one core free, but never ask for fewer than one worker
# (mp.Pool refuses a process count of zero, which a single-core machine would give)
nps = max(1, mp.cpu_count() - 1)
# the with-block shuts the worker processes down once the blocking map has returned
with mp.Pool(nps) as pool:
    pool.map(project_one_bubble, bubble_combos.keys())

TODO: start RDEAR process with jittering duplicates if any introduced by dropping z, vx, vy, vz

# attic below here!

## How much data do we need?

### How much depth?

Obtain necessary depth from ~~[Ned Wright's cosmology calculator](http://www.astro.ucla.edu/~wright/CosmoCalc.html)~~ `astropy`.
The SLICS cosmology has $\Omega_{m} = 0.2905$, $\Omega_{\Lambda} = 0.7095$, $\Omega_{b} = 0.0473$, $h = 0.6898$, $\sigma_{8} = 0.826$, and $n_{s} = 0.969$.
Let's assume the naive Cartesian-to-angular coordinates and flatten along the `z` direction.
We need to flatten a depth corresponding to the bounds of each redshift bin.

In [None]:
h = 0.6898
cosmo = FlatLambdaCDM(H0=100.*h, Om0=0.2905, Ob0=0.0473)

# comoving distance to every redshift-bin edge, divided by h (plain numbers, no units attached)
d_comov = np.array([cosmo.comoving_distance(float(z)).value / h for z in z_bins])
# thickness of each redshift bin along the line of sight
depths = np.diff(d_comov)

# the same distance evaluated at the snapshot redshifts themselves, kept as a list
avg_d_comov = [cosmo.comoving_distance(float(z)).value / h for z in z_SLICS]

print(depths)
print(avg_d_comov)

In [None]:
# how many lengths of phys_scale are needed to span each redshift bin's depth, rounded up
n_z = np.ceil(depths / phys_scale)
print(n_z)

Sadly, `depths` < `phys_scale` $Mpc/h$ only in the first three redshift bins, meaning the depths of the next three GAMA redshift bins may require opening two files, and the last of these bins cannot be turned into a mock catalog based on SLICS.
I think the way they're arranged means that (21, 37), (22, 38), (25, 41), and (26, 42) are pairs adjacent in `z`.

_This is as good a time as any to note that our mock catalog will have a bit of a degeneracy if we use the same file numbers for all redshifts because each file corresponds to the same physical volume across cosmic time, whereas in a real survey, our redshift bins contain different volumes/galaxies.
We have a choice to make about discontinuities or non-physical repetition._

### How much area?

Obtain the angular diameter distance $d_{a}$, which converts lengths to angles via $\theta = x / d_{a}$, with $d_{a} = d_{c} / (1 + z)$, where $d_{c}$ is the comoving distance and $x$ is the distance in the SLICS data.
Compare with the GAMA footprint of $286^{\circ^{2}} * (\pi / 180^{\circ})^{2} \approx 0.087 sr$.

In [None]:
# angular diameter distance at each snapshot: comoving distance over (1 + z)
# NOTE: avg_d_comov is a list of plain numbers, so d_ang is rebound here without units
d_ang = avg_d_comov / (1 + z_SLICS)
# angle in degrees subtended by a length of phys_scale at that distance
theta_box = phys_scale / d_ang * 180. / np.pi
# square of that angle, i.e. the area of one box face in square degrees
footprint = theta_box**2
print(footprint)

The scaling behavior is as expected;
`phys_scale` subtends a larger angle at low redshifts and a smaller angle at high redshifts.
One file's worth of SLICS data subtends an angular area larger than the GAMA footprint in the first five GAMA redshift bins, but the next three GAMA redshift bins would definitely require more than one file's worth of data.
We need to pick an angular area for our mock galaxy catalog.
Let's go with twice that for now.
~~_Do we think twice the GAMA area is sufficiently compelling?_~~

In [None]:
# area of the GAMA footprint in square degrees
theta_gama = 286.
# side of a square of that area at each snapshot's distance, in the units of d_ang
GAMA_phys_scale = np.sqrt(theta_gama) * (np.pi / 180.) * d_ang
# fraction of one box face that square covers; uses the shared box-size constant
# rather than repeating its value
print(GAMA_phys_scale**2 / phys_scale**2)

In [None]:
# target area of the mock survey: twice the GAMA area
theta_mock = 2. * theta_gama

Count how many files are needed to fill out the RA/DEC space for a mock survey of twice the GAMA area.

In [None]:
# number of box faces needed to cover the requested area at each snapshot, rounded up
# NOTE(review): the surrounding text and the commented-out loop below refer to theta_mock
# (twice the GAMA area), but this line divides theta_gama — confirm which is intended
n_xy = np.ceil(theta_gama / footprint)
# n_xy = np.empty(len(z_SLICS))
# one_xy = np.where(footprint > theta_mock)[0]
# n_xy[one_xy] = 1
# i = 1
# while i <= 64:
#     which_xy = np.where((i * footprint < theta_mock) & ((i+1) * footprint > theta_mock))[0]
#     n_xy[which_xy] = i+1
#     i += 1
print(n_xy)

If we go with twice the GAMA footprint, then the first four redshift bins need only one file but the next two need 2, and the two after that need 3 and 4.
I think (21, 22), (25, 26), (37, 38), and (41, 42) are adjacent in `x`/`RA` and (21, 25), (22, 26), (37, 41), and (38, 42) are adjacent in `y`/`DEC`.
However, we'll have to open the files that approach the edges for the last two redshifts.

Now we'll define the limiting size in RA/DEC, and to skip a time-consuming conversion of the data, will convert that into x/y at each redshift.

In [None]:
# limiting area: twice the GAMA area (same value as theta_mock)
lim_omega = 2 * theta_gama
# that area converted from square degrees to a physical area at each snapshot's d_ang
lim_a = lim_omega * (np.pi / 180.* d_ang)**2

# side of a square with the limiting area, in degrees
lim_theta = np.sqrt(lim_omega)
# the same side as a physical length at each snapshot; one value per entry of d_ang
lim_xy =  lim_theta * np.pi / 180.* d_ang

# total limiting area over all snapshots, in units of one box face
print(sum(lim_a / phys_scale**2))

We can use squares with a shared origin for now, and if at some point we really want non-overlapping footprints, we know we'll need the area of 11 out of 16 files.

## What it looks like for one file

### Read in data

Download one of the 64 nodes $\times$ 20 redshifts files at each redshift from Joachim Harnois-Deraps to start.

Read in from binary float(4) format.

In [None]:
# One record is six 4-byte floats: x, y, z followed by vx, vy, vz.
dt_each = 'f' + str(4)
dt = np.dtype([(field, dt_each) for field in ('x', 'y', 'z', 'vx', 'vy', 'vz')])

Throw out the first 12 values (the first 2 six-float records) as unwanted header information.

In [None]:
def help_read(which_z, fn_index):
    """Load one SLICS particle file into a DataFrame.

    ``which_z`` indexes ``z_SLICS`` and ``fn_index`` is the file number; the
    first two records of the file are discarded before the frame is built.
    """
    data_dir = 'particle_data/cuillin.roe.ac.uk/~jharno/SLICS/SLICS_HR/LOS1'
    # file name: zero-padded redshift, then 'xv', the file number and the extension
    fn = ''.join(['{:<05}'.format(str(z_SLICS[which_z])), 'xv', str(fn_index), '.dat'])
    full_path = os.path.join(data_dir, fn)
    with open(full_path, 'rb') as f1:
        records = np.fromfile(f1, dtype=dt)
    particles = records[2:]
    return pd.DataFrame(data=particles, columns=['x', 'y', 'z', 'vx', 'vy', 'vz'])

Slow to read in

In [None]:
# load file number 21 at snapshot index 2 of z_SLICS
loc_data = help_read(which_z=2, fn_index=21)

In [None]:
# plt.hist2d(loc_data['x'], loc_data['y'], bins=(200, 200), norm=mpl.colors.LogNorm(), cmap='Spectral_r')

In [None]:
# len(loc_data)

In [None]:
# loc_to_plot = loc_data.sample(50000)
# plt.scatter(loc_to_plot['x'], loc_to_plot['y'], marker='.', s=1, alpha=0.5)

### Convert to physical units

The particle data starts out in simulation units relative to the per-node subvolume and needs to be converted to physical units in the space of all subvolumes before the whole volume can be considered.

Note that the conversion below makes sense for `x`, `y`, and `z` but not for `vx`, `vy`, and `vz`.
Because of how the data is distributed across the files, I think 21, 22, 25, 26, 37, 38, 41, 42 are "adjacent" and free of edge effects.
_Note_: We can just have this be an automated check, knowing that files are adjacent when their `node_coords` are the same aside from being off by one in one of their dimensions.

In [None]:
def help_find_coords(fn_index):
    """Decode a file index into its position on the subvolume grid.

    The index is read as ``x + y * nodes_dim + z * nodes_dim ** 2``.  Returns a
    dict with the keys ``'x'``, ``'y'`` and ``'z'``, or ``None`` if the index
    corresponds to no subvolume.
    """
    if fn_index not in range(nodes_dim ** 3):
        return None
    flat = int(fn_index)
    return {'x': flat % nodes_dim,
            'y': (flat // nodes_dim) % nodes_dim,
            'z': flat // nodes_dim ** 2}
# print(all_nodes_coords)

In [None]:
# grid position of file number 21, stored at module level
true_node_coords = help_find_coords(fn_index=21)

To get coherent coordinates across all files, we need to shift them accordingly.
The next cell is unexpectedly slow.

In [None]:
def help_shift(loc_data):
    """Shift file-local positions into the coordinate frame shared by all files.

    Uses the module-level ``true_node_coords`` for the subvolume's grid
    position.

    Parameters
    ----------
    loc_data : pandas.DataFrame
        Particle data with at least the columns ``x``, ``y`` and ``z``.

    Returns
    -------
    pandas.DataFrame
        A new frame with shifted ``x``, ``y``, ``z``; ``loc_data`` is left
        untouched, so re-running the calling cell does not shift twice.
    """
    # shift data
    # copy first: assigning through an alias would modify the caller's frame
    glob_data = loc_data.copy()
    for col in ['x', 'y', 'z']:
        shifted = np.remainder(loc_data[col] + true_node_coords[col] * ncc, rnc)
        # compare the largest shifted value with the volume size; an empty
        # selection has no maximum and is trivially in range
        assert len(shifted) == 0 or shifted.max() <= rnc
        glob_data[col] = shifted
    return(glob_data)

In [None]:
# move the file-local positions into the coordinate frame shared by all files
glob_data = help_shift(loc_data)

In [None]:
def help_convert(glob_data):
    """Rescale values by ``phys_scale / rnc`` to express them in Mpc/h.

    The factor is applied to everything in ``glob_data``; nothing is selected
    by column here.
    """
    # convert to Mpc/h
    scaled = glob_data * phys_scale
    return scaled / rnc

In [None]:
# rescale the shifted data to Mpc/h
phys_data = help_convert(glob_data)

In [None]:
# for dim in ['x', 'y', 'z']:
#     plt.hist(phys_data[dim], density=True, alpha=0.5)
# plt.xlabel('distance (Mpc/h)')

In [None]:
# plt.hist2d(phys_data['x'], phys_data['y'], bins=(200,200), norm=mpl.colors.LogNorm(), cmap='Spectral_r')

In [None]:
# phys_to_plot = phys_data.sample(50000)
# plt.scatter(phys_to_plot['x'], phys_to_plot['y'], marker='.', s=1, alpha=0.5)

## Now let's try automating it

Note: We are not attempting to enforce the anisotropy of the sky, i.e. the particles in file number X are by and large the same particles at each redshift because they're in comoving coordinates.
In contrast, when we observe the sky, we don't see the same galaxies evolved to different redshifts but instead see different, coherent portions of the large-scale structure at different redshifts.
This is something we can address later, but I'm trying to keep it simple-ish for now.

Chop up or combine data files as needed
This is slow!

In [None]:
def right_depth(which_z, filenos):
    """Select the particles of ``phys_data`` that lie within one redshift bin's depth.

    Parameters
    ----------
    which_z : int
        Index into ``depths`` selecting the redshift bin.
    filenos :
        Not used by the current implementation.

    Returns
    -------
    The rows of the module-level ``phys_data`` whose ``z`` offset from the
    smallest ``z``, wrapped at ``phys_scale``, is below ``depths[which_z]``.
    """
    # the previous loop never advanced its counter and only recomputed an unused
    # value, so the function could not return; compute the offset once instead
    z_offset = np.mod(phys_data['z'] - phys_data['z'].min(), phys_scale)
    return(phys_data[z_offset < depths[which_z]])

In [None]:
# NOTE(review): right_depth is declared as right_depth(which_z, filenos) but is called here
# with (phys_data, which_z), and no top-level assignment of which_z is visible in this
# notebook — confirm the intended arguments before running
ang_data = right_depth(phys_data, which_z)#phys_data[np.mod(phys_data['z'] - min(phys_data['z']), phys_scale) < depths[which_z]]
# positions divided by the snapshot's d_ang and converted from radians to degrees
ang_data['RA'] = ang_data['x'] / d_ang[which_z] * 180. / np.pi
ang_data['DEC'] = ang_data['y'] / d_ang[which_z] * 180. / np.pi

We'd change this for the area of our mock survey when we decide on it.
_There is an edge effect going on right now.
I need to switch to one of the internal files to avoid roll-over that's breaking min/max checks._

In [None]:
# keep the particles within lim_xy of the smallest x and of the smallest y
# NOTE(review): lim_xy is computed from d_ang and so appears to hold one value per
# snapshot; presumably a single entry (e.g. lim_xy[which_z]) is meant here — confirm
cut_data = phys_data[(phys_data['x'] < lim_xy + min(phys_data['x'])) & (phys_data['y'] < lim_xy + min(phys_data['y']))]
#ang_data[(ang_data['RA'] < lim_theta + min(ang_data['RA'])) & (ang_data['DEC'] < lim_theta + min(ang_data['DEC']))]

# plt.hist(cut_data['RA'])
# plt.hist(cut_data['DEC'])

# NOTE(review): cut_data is selected from phys_data, while the RA/DEC columns requested
# here are added to ang_data in this notebook; z_str is only visible as a local variable
# of help_read — confirm both before running
cut_data.to_csv(z_str+'cut.csv', header=True, index=False, sep=',', columns=['RA', 'DEC'])

In [None]:
# plt.hist2d(cut_data['RA'], cut_data['DEC'], bins=(200,200), norm=mpl.colors.LogNorm(), cmap='Spectral_r')
# plt.xlabel('RA (deg)')
# plt.ylabel('DEC (deg)')

In [None]:
# cut_to_plot = cut_data.sample(50000)
# plt.scatter(cut_to_plot['x'], cut_to_plot['y'], marker='.', s=1, alpha=0.5)

# scratch after here

## Spatially subsample data

Turns out 1/64th of the total data was still way more than we could reasonably use at once to compute correlation functions!
This should really be sliced by size of box.
First, just break it up into smaller boxes.
Let's say we want $10^{5}$ particles per box, so we'll cut it in 16 in each dimension.

In [None]:
# for i in range(4):
#     j = i+1
#     subset = phys_data[(phys_data['x'] <= 10.*j) & (phys_data['y'] <= 10.*j) & (phys_data['z'] <= 10.*j)]
#     subset.to_csv('spat'+str(j)+'0Mpc.csv', header=False, index=False, sep=' ', columns=['x', 'y', 'z'])
#     angular = subset / 313.5 * 69.6 / 100. * float(j) * 180 / np.pi
#     print((min(angular['x']), max(angular['x'])))
#     print((min(angular['y']), max(angular['y'])))  
#     angular.to_csv('ang'+str(j)+'deg.csv', header=False, index=False, sep=',', columns=['x', 'y'])

In [None]:
# # distances = np.sqrt(phys_data['x']**2 + phys_data['y']**2 + phys_data['z']**2)
# splitpoints = {}
# for dim in ['RA', 'DEC']:
#     splitpoints[dim] = np.linspace(min(ang_data[dim]), max(ang_data[dim]), 17)
#     print(splitpoints[dim])

In [None]:
# for i in range(16):
#     for j in range(16):
#         subsample = ang_data.loc[(ang_data['RA'] >= splitpoints['RA'][i]) & (ang_data['RA'] <= splitpoints['RA'][i+1])\
#                                  & (ang_data['DEC'] >= splitpoints['DEC'][j]) & (ang_data['DEC'] <= splitpoints['DEC'][j+1])]
#         subsample.to_csv(z_str+'slice_'+str(i)+'_'+str(j)+'.csv', header=True, index=False, sep=',', columns=['RA', 'DEC'])

## Randomly subsample data

In [None]:
# print(angular)

In [None]:
# to_plot = angular.sample(5000)

In [None]:
# plt.hist2d(to_plot['x'], to_plot['y'], bins=100, norm=mpl.colors.LogNorm(), cmap='Spectral_r')
# plt.savefig('mock_gal_pos.png', dpi=250)

In [None]:
# try_distances = np.flip(np.geomspace(0.01, 1.0, 10), axis=0)

In [None]:
# import environment as galenv

# def calc_env(ind):
#     res = []
#     friends = data
#     for dist in try_distances:
#         friends = galenv.nn_finder(friends, data[ind], dist)
#         res.append(len(friends))
#     return res

In [None]:
# data = [to_plot['x'].values, to_plot['y'].values]
# print(data)

In [None]:
# data = np.array([to_plot['x'].values, to_plot['y'].values]).T
# nps = mp.cpu_count()
# pool = mp.Pool(nps - 1)
# envs = pool.map(calc_env, range(len(data)))
# pool.close()
# # envs_arr = np.array(all_envs)
# # envs_df = pd.DataFrame(data=envs_arr, index = envs_arr[:, 0], columns = ['CATAID']+[str(i) for i in try_distances])

# # df = pd.merge(envs_df, zdf, on='CATAID')
# # df.to_csv('enviros.csv')

no clue what to plot here. . .

## Examine the precomputed 2PCF

Download the 2PCF at several redshifts [here](https://drive.google.com/drive/folders/1eGlAO_wl9h0xiXiTMKV_m7h9YCRhDHP_?usp=sharing).

Note that the data is $\Delta^{2}(k)$, not the more familiar (to me) $\mathcal{P}(k)$.  (A reminder of the relationship between them can be found [here](http://universe-review.ca/R05-04-powerspectrum.htm), particularly in [this figure](http://universe-review.ca/I02-20-correlate1b.png).)

In [None]:
# pk = np.genfromtxt('NptFns/0.042ngpps_new.dat_LOS1').T

In [None]:
# plt.plot(pk[0], pk[1])
# plt.semilogx()
# plt.semilogy()
# plt.xlabel(r'$k$ [Mpc/h]')
# plt.ylabel(r'$\Delta^2(k)$')

In [None]:
# rmin = 2 * np.pi / max(pk[0])
# rmax = 2 * np.pi / min(pk[0])
# print((rmin, rmax))

# Next steps

Ultimately, we will need to calculate the 2 and 3+ point correlation functions of the particle data.
Because the data is split into 64 files per redshift, we also need a way to combine the positional information from each file to get coherent correlation functions.
We may be able to more easily accomplish both goals if we first smooth the data using a Fourier-space basis like wavelets.

## combine particle data from multiple files

## calculate the N-point correlation functions