# Exploring the __SLICS-HR__ particle data
notebook by _Alex Malz (GCCL@RUB)_, (add your name here)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline

## Read in data

To start, download one of the per-node particle files (64 nodes $\times$ 20 redshifts in total, i.e. 64 files at each redshift) from Joachim Harnois-Deraps.
I chose file 0 at $z=0.080$ for this example.
Read the file in as binary 4-byte floats and throw out the first 12 entries (two records of 6 floats each), which are unwanted header information.

In [None]:
# Pieces of the particle-file name: redshift string, base name, node index, extension.
fn_index = 0
z_str, fn_base, fn_ext = '0.080', 'xv', '.dat'
fn = f'{z_str}{fn_base}{fn_index}{fn_ext}'

In [None]:
# One record is six 4-byte floats, exposed as the fields x, y, z, vx, vy, vz.
dt_each = 'f4'
dt = np.dtype([(field, dt_each) for field in ('x', 'y', 'z', 'vx', 'vy', 'vz')])

In [None]:
# Read the whole binary file as an array of six-float records;
# np.fromfile opens and closes the file itself when handed a path.
raw_data = np.fromfile(fn, dtype=dt)

In [None]:
# Peek at the first ten raw records.
raw_data[0:10]

In [None]:
# The first two six-float records (12 values) are header, so skip them
# and keep the rest as a table of node-local positions and velocities.
n_header_records = 2
column_names = ['x', 'y', 'z', 'vx', 'vy', 'vz']
loc_data = pd.DataFrame(data=raw_data[n_header_records:], columns=column_names)

In [None]:
# Show the first ten particles in node-local coordinates.
loc_data.head(10)

## Convert to physical units

The particle data starts out in simulation units relative to the per-node subvolume and needs to be converted to physical units in the space of all subvolumes before the whole volume can be considered.
Note that the conversion below makes sense for `x`, `y`, and `z` but not for `vx`, `vy`, and `vz`.

In [None]:
# number of MPI tasks per dimension
nodes_dim = 4

# subvolume size (one node's side length in simulation units)
ncc = 768

# volume size: nodes_dim subvolumes side by side, kept as a float
rnc = float(nodes_dim * ncc)

In [None]:
# Locate this file's node in the nodes_dim**3 grid.  Files are numbered with
# x varying fastest: fn_index = ix + iy * nodes_dim + iz * nodes_dim**2,
# so the zero-based coordinates follow directly from two divmod steps.
if not 0 <= fn_index < nodes_dim ** 3:
    # Fail loudly instead of leaving node_coords undefined (or stale from an
    # earlier run of this cell).
    raise ValueError('fn_index ' + str(fn_index) + ' is outside the '
                     + str(nodes_dim) + '^3 node grid')
k0, remainder_in_slab = divmod(fn_index, nodes_dim ** 2)
j0, i0 = divmod(remainder_in_slab, nodes_dim)
# Report the one-based grid position, then store the zero-based offsets.
print('found index '+str(fn_index)+' at '+str((i0 + 1, j0 + 1, k0 + 1)))
node_coords = {'x': i0, 'y': j0, 'z': k0}
print(node_coords)

In [None]:
# shift data
# Shift positions from node-local to global simulation coordinates.
# Work on a copy: a plain assignment would alias loc_data, so the shift would
# overwrite the node-local values and be applied again on every re-run.
glob_data = loc_data.copy()
for col in ['x', 'y', 'z']:
    # offset by this node's position in the grid, wrapping periodically at rnc
    glob_data[col] = np.remainder(loc_data[col] + node_coords[col] * ncc, rnc)
    # every coordinate (not just at least one) must lie inside the full volume
    assert (glob_data[col] <= rnc).all()

In [None]:
# Show the first ten particles after the shift to global coordinates.
glob_data.head(10)

In [None]:
# convert to Mpc/h
# convert positions to Mpc/h (505 Mpc/h per 3072 simulation length units)
# Only x, y, z are lengths; the velocity columns are copied through unchanged
# (still in simulation units) because a length rescaling is meaningless for them.
position_cols = ['x', 'y', 'z']
phys_data = glob_data.copy()
phys_data[position_cols] = glob_data[position_cols] * 505. / 3072.

In [None]:
# Number of particles (rows) in this node's table.
phys_data.shape[0]

In [None]:
# Show the first ten particles after the unit conversion.
phys_data.head(10)

In [None]:
# Overlay the normalised histograms of the three position components;
# label each one so the semi-transparent curves can be told apart.
for col in ['x', 'y', 'z']:
    plt.hist(phys_data[col], alpha=0.25, density=True, label=col)
plt.xlabel('position [Mpc/h]')
plt.ylabel('probability density')
plt.legend()

## Spatially subsample data

Turns out 1/64th of the total data was still way more than we could reasonably use at once to compute correlation functions!  

In [None]:
# slice the box by scale not number of galaxies

In [None]:
# Euclidean distance of every particle from the coordinate origin.
distances = np.sqrt(
    phys_data['x'].pow(2) + phys_data['y'].pow(2) + phys_data['z'].pow(2)
)

In [None]:
# Positions that sort the particles by increasing distance from the origin,
# so order[:N] picks out the N particles nearest to it.
order = np.argsort(distances)

In [None]:
# Write the x, y, z positions of the 100 nearest-to-origin particles as
# space-separated text with no header row and no index column.
phys_data.iloc[order[:100]].to_csv(
    'pos100.csv',
    columns=['x', 'y', 'z'],
    sep=' ',
    header=False,
    index=False,
)

In [None]:
# Write the x, y, z positions of the 1000 nearest-to-origin particles as
# space-separated text with no header row and no index column.
phys_data.iloc[order[:1000]].to_csv(
    'pos1000.csv',
    columns=['x', 'y', 'z'],
    sep=' ',
    header=False,
    index=False,
)

In [None]:
# Write the x, y, z positions of the 10000 nearest-to-origin particles as
# space-separated text with no header row and no index column.
phys_data.iloc[order[:10000]].to_csv(
    'pos10000.csv',
    columns=['x', 'y', 'z'],
    sep=' ',
    header=False,
    index=False,
)

In [None]:
# Write the x, y, z positions of the 100000 nearest-to-origin particles as
# space-separated text with no header row and no index column.
phys_data.iloc[order[:100000]].to_csv(
    'pos100000.csv',
    columns=['x', 'y', 'z'],
    sep=' ',
    header=False,
    index=False,
)

In [None]:
# Write the x, y, z positions of the 1000000 nearest-to-origin particles as
# space-separated text with no header row and no index column.
phys_data.iloc[order[:1000000]].to_csv(
    'pos1000000.csv',
    columns=['x', 'y', 'z'],
    sep=' ',
    header=False,
    index=False,
)

## Examine the precomputed 2PCF

Download the 2PCF at several redshifts [here](https://drive.google.com/drive/folders/1eGlAO_wl9h0xiXiTMKV_m7h9YCRhDHP_?usp=sharing).

Note that the data is the dimensionless power spectrum $\Delta^{2}(k)$, not the more familiar (to me) $\mathcal{P}(k)$; the two are related by $\Delta^{2}(k) = k^{3}\,\mathcal{P}(k) / (2\pi^{2})$.  (A reminder of the relationship between them can be found [here](http://universe-review.ca/R05-04-powerspectrum.htm), particularly in [this figure](http://universe-review.ca/I02-20-correlate1b.png).)

In [None]:
# Load the tabulated spectrum and transpose it so that pk[i] is the i-th
# column of the text file.
pk = np.transpose(np.genfromtxt('NptFns/0.042ngpps_new.dat_LOS1'))

In [None]:
# Plot the second column against the first on log-log axes.
plt.plot(pk[0], pk[1])
plt.xscale('log')
plt.yscale('log')
# k is a wavenumber, i.e. an inverse length, so its unit is h/Mpc (not Mpc/h)
plt.xlabel(r'$k$ [h/Mpc]')
plt.ylabel(r'$\Delta^2(k)$')

In [None]:
# Length scales spanned by the tabulated wavenumbers, via r = 2*pi / k:
# the largest k gives the smallest scale and the smallest k the largest.
two_pi = 2 * np.pi
k_modes = pk[0]
rmin = two_pi / max(k_modes)
rmax = two_pi / min(k_modes)
print((rmin, rmax))

# Next steps

Ultimately, we will need to calculate the 2- and 3+-point correlation functions of the particle data.
Because the data is split into 64 files per redshift, we also need a way to combine the positional information from each file to get coherent correlation functions.
We may be able to more easily accomplish both goals if we first smooth the data using a Fourier-space basis like wavelets.

## combine particle data from multiple files

## calculate the N-point correlation functions