# Create random sample (to give to Michael)
Here's how we created the random sample (i.e. various halos in LJ (Last Journey); some are FGs, most are not). It is built from three narrow mass bins (the same as the default bins in `make_masks()`), and each mass bin contains as many randomly chosen halos as there are fossil groups in that bin (as counted from the `fg_forest`).

## Set Up

In [2]:
import haccytrees.mergertrees
import h5py
import numpy as np
import numba
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
from matplotlib.lines import Line2D
import matplotlib.colors as mcolors
import matplotlib.patches as mpatches
import pandas as pd
from astropy.cosmology import FlatLambdaCDM
import astropy.units as u
from itertools import groupby
from matplotlib.ticker import ScalarFormatter
%load_ext line_profiler
%reload_ext autoreload
%autoreload 1
%aimport help_func_haccytrees

# Global matplotlib styling applied to every figure in this notebook.
plt.rcParams["text.usetex"] = True  # render all text through LaTeX
plt.rcParams["font.size"] = 13
# Figure sizes tried previously: (6.25, 4.25) and (6.25, 3.75).
plt.rcParams["figure.figsize"] = (5.25, 3.5)
plt.rcParams["patch.linewidth"] = 1

pyfftw not available, using numpy fft


#### What is `data`?
`data` is a dictionary containing only the halo information we are interested in (i.e. just at $z=0$). Use `data.keys()` to see what's inside. You can treat `data` largely like you treated `forest`: i.e. to get a `key` for a specific subset of halos, use `data['key_name'][halo_idx]`.
On the fossil groups side, the equivalent of `data` is `fg_catalog`, which consists only of $z=0$ information for halos we have previously identified as fossil groups. We create `fg_catalog` from `fg_forest.hdf5` below.

In [3]:
%%time
with h5py.File("/data/a/cpac/mbuehlmann/LastJourney/forest/z0_catalog.hdf5", "r") as f:
    data = {
        k: d[:] for k, d in f.items()
    }
s = np.argsort(data['tree_node_index'])
data = {k: data[k][s] for k in data.keys()} # all halos at z=0 above a certain mass threshold

CPU times: user 31.1 s, sys: 19.7 s, total: 50.8 s
Wall time: 50.7 s


In [6]:
# Show which per-halo columns the z=0 catalog provides.
data.keys()

dict_keys(['delta+1_rsoft0.0', 'delta+1_rsoft1.0', 'delta+1_rsoft10.0', 'delta+1_rsoft2.0', 'delta+1_rsoft4.0', 'filenum', 'index', 'sig_rsoft_0.0', 'sig_rsoft_1.0', 'sig_rsoft_10.0', 'sig_rsoft_2.0', 'sig_rsoft_4.0', 'sod_halo_cdelta', 'sod_halo_cdelta_accum', 'sod_halo_cdelta_peak', 'tree_node_index', 'tree_node_mass', 'x', 'xoff_com', 'xoff_fof', 'xoff_sod', 'y', 'z'])

### Now create `fg_catalog` (similar to `data`)

In [5]:
%%time
# Load the fossil-group merger forest. `read_forest` returns two objects: the
# forest itself (indexed by column name further down in this notebook) and a
# progenitor array that is not used in the cells shown here.
# NOTE(review): `mass_threshold=5e11` presumably discards halos below that mass
# (and is what the `_lm5` suffix refers to) -- confirm against the haccytrees docs.
fg_forest_lm5, fg_progenitor_array_lm5 = haccytrees.mergertrees.read_forest(
    "/data/a/cpac/mbuehlmann/LastJourney/forest/fg_forest.hdf5",
    'LastJourney',
    mass_threshold=5e11
)

CPU times: user 25.4 s, sys: 33.5 s, total: 58.8 s
Wall time: 58.8 s


In [7]:
%%time
mask_z0 = fg_forest_lm5['snapnum'] == 100
fg_catalog_lm5 = {k: fg_forest_lm5[k][mask_z0] for k in fg_forest_lm5.keys()} # forest data at z=0 # Not sure why this different from `assign_fgs()`?
s = np.argsort(fg_catalog_lm5['tree_node_index']) # How is this being sorted exactly?
fg_catalog_lm5 = {k: fg_catalog_lm5[k][s] for k in fg_catalog_lm5.keys()}

CPU times: user 2.36 s, sys: 112 ms, total: 2.47 s
Wall time: 2.47 s


In [9]:
# Show which per-halo columns the fossil-group z=0 catalog provides.
print(fg_catalog_lm5.keys())

dict_keys(['branch_size', 'desc_node_index', 'fof_halo_center_x', 'fof_halo_center_y', 'fof_halo_center_z', 'fof_halo_count', 'fof_halo_mass', 'fof_halo_tag', 'snapnum', 'sod_halo_cdelta', 'sod_halo_cdelta_accum', 'sod_halo_cdelta_error', 'sod_halo_cdelta_peak', 'sod_halo_count', 'sod_halo_mass', 'sod_halo_radius', 'tree_node_index', 'tree_node_mass', 'xoff_com', 'xoff_fof', 'xoff_sod', 'scale_factor', 'descendant_idx', 'progenitor_count', 'progenitor_offset', 'halo_index'])


In [10]:
# To create an index of fossil groups
@numba.jit(nopython=True)
def assign_fgs(full_tn_index, fg_tn_index):
    """Find the position of each fossil-group halo in the full catalog.

    Both inputs must be sorted in ascending order. The two arrays are walked
    in lockstep (a sorted merge), so each loop iteration advances exactly one
    of the two cursors.

    Parameters
    ----------
    full_tn_index : 1-D integer array
        `tree_node_index` of every halo in the full catalog, ascending.
    fg_tn_index : 1-D integer array
        `tree_node_index` of the fossil-group halos, ascending.

    Returns
    -------
    fg_index : np.ndarray of int64, same length as `fg_tn_index`
        `fg_index[i]` is the position of `fg_tn_index[i]` in
        `full_tn_index`, or -1 if that value does not occur there.
    """
    n_full = len(full_tn_index)
    n_fg = len(fg_tn_index)
    c_full_idx = 0
    c_fg_idx = 0
    fg_index = np.empty(n_fg, dtype=np.int64)
    fg_index[:] = -1  # -1 marks "no match found"
    while c_full_idx < n_full and c_fg_idx < n_fg:
        full_value = full_tn_index[c_full_idx]
        fg_value = fg_tn_index[c_fg_idx]
        if full_value == fg_value:
            fg_index[c_fg_idx] = c_full_idx
            # Only the FG cursor moves, so a repeated FG value would still
            # match the same full-catalog row.
            c_fg_idx += 1
        elif full_value < fg_value:
            c_full_idx += 1
        else:
            # The full catalog is already past this value, so it is absent.
            # Leave -1 and skip just this entry; otherwise the full cursor
            # would run to the end and every later FG would stay unmatched.
            c_fg_idx += 1
    return fg_index
# Both index arrays were sorted ascending when `data` and `fg_catalog_lm5`
# were built, as `assign_fgs` requires.
fg_idx = assign_fgs(data['tree_node_index'], fg_catalog_lm5['tree_node_index'])

### Create random sample

In [13]:
# Fixed seed so that the random sample can be regenerated exactly.
RANDOM_SAMPLE_SEED = 0
sample_rng = np.random.default_rng(RANDOM_SAMPLE_SEED)

# One array per mass bin is appended to each list; they are concatenated
# only when the sample is written to disk.
tn_idx = [] # tree node index
mass = []
idx = []
file = []
# Three narrow mass bins given as [lower, upper]; both edges are exclusive.
for this_bin in [[1e13, 10**13.05], [10**13.3, 10**13.35], [10**13.6, 10**13.65]]:
    # Row positions in `data` of all halos whose mass falls inside this bin.
    bin_mask = (data['tree_node_mass'] > this_bin[0]) & (data['tree_node_mass'] < this_bin[1])
    bin_mask_idx, = np.nonzero(bin_mask)

    # Number of fossil groups in the same bin = number of halos to draw.
    fg_bin_mask = (fg_catalog_lm5['tree_node_mass'] > this_bin[0]) & (fg_catalog_lm5['tree_node_mass'] < this_bin[1])
    n_fgs = int(np.count_nonzero(fg_bin_mask))
    if n_fgs > len(bin_mask_idx):
        # Drawing without replacement would fail anyway; say which bin is short.
        raise ValueError(
            f"Mass bin {this_bin} holds {len(bin_mask_idx)} halos but "
            f"{n_fgs} are needed to match the number of fossil groups"
        )

    # Row positions (not a boolean mask) of the halos drawn without replacement.
    random_sample_idx = sample_rng.choice(bin_mask_idx, size=n_fgs, replace=False)

    tn_idx.append(data['tree_node_index'][random_sample_idx])
    mass.append(data['tree_node_mass'][random_sample_idx])
    idx.append(data['index'][random_sample_idx])
    file.append(data['filenum'][random_sample_idx])

# Safety switch: set to True to (over)write `random_sample.h5`; mode 'w'
# replaces any existing file of that name.
create_file = False
if create_file:
    with h5py.File('random_sample.h5', 'w') as f:
        f.create_dataset('tree_node_index', data=np.concatenate(tn_idx))
        f.create_dataset('tree_node_mass', data=np.concatenate(mass))
        f.create_dataset('index', data=np.concatenate(idx))
        f.create_dataset('file', data=np.concatenate(file))

[array([2247382487065960618, 2247442453399346018, 2247372887814052310, ...,
       2247418367222766344, 2247301406673370823, 2247517276024623655]), array([2247530736452122641, 2247559078941299220, 2247344961936722907, ...,
       2247461042017825579, 2247530178106368471, 2247523727065495203]), array([2247311203493760669, 2247422851168633196, 2247567067580466021, ...,
       2247442002427798725, 2247475060791066866, 2247448238720296175])]
[array([1.0062518e+13, 1.0187519e+13, 1.0630454e+13, ..., 1.0239149e+13,
       1.1005454e+13, 1.1051650e+13], dtype=float32), array([2.0652212e+13, 2.0548950e+13, 2.2380475e+13, ..., 2.0008189e+13,
       2.2029932e+13, 2.1972866e+13], dtype=float32), array([4.2149534e+13, 4.4440299e+13, 4.0089745e+13, ..., 4.0182137e+13,
       4.2462035e+13, 4.2054424e+13], dtype=float32)]
[array([156976971,  39208088,  58070978, ..., 125788208,  57588077,
        78619593]), array([214009278, 118449372, 143200851, ..., 229440714, 222748617,
       143481611]), arra

In [51]:
# Sanity check on the written sample: open it read-only, then show the stored
# dataset names and the total number of sampled halos.
with h5py.File('random_sample.h5', 'r') as sample_file:
    dataset_names = sample_file.keys()
    print(dataset_names)
    n_sampled = len(sample_file['tree_node_index'])
    print(n_sampled)

<KeysViewHDF5 ['file', 'index', 'tree_node_index', 'tree_node_mass']>
307993
