# The `recidivator` pipeline

## preparing the SLICS data

In [None]:
import astropy as ap
from astropy.cosmology import FlatLambdaCDM
# import dask
# import dask.dataframe as dd
import matplotlib as mpl
import matplotlib.pyplot as plt
import multiprocessing as mp
import numpy as np
import os
import pandas as pd
import pickle as pkl
from scipy.stats import gaussian_kde
import scipy.optimize as spo

%matplotlib inline

### constants

In [None]:
# redshifts of the SLICS snapshots processed by this notebook
z_SLICS = np.array([0.042, 0.080, 0.130, 0.221, 0.317, 0.418, 0.525])
#, 0.640, 0.764, 0.897, 1.041, 1.199, 1.372, 1.562, 1.772, 2.007, 2.269, 2.565, 2.899])

# dimensionless Hubble parameter and the flat LambdaCDM cosmology built from it
h = 0.6898
cosmo = FlatLambdaCDM(H0=100.*h, Om0=0.2905, Ob0=0.0473)

# total number of particles and the mass assigned to each one
# NOTE(review): if 2.88e9 is quoted in Msun/h, converting to Msun would divide by h
# rather than multiply -- confirm the intended convention
N_part = 1536**3
M_part = 2.88e9 * ap.units.M_sun * h

In [None]:
# every particle record consists of six 4-byte floats named x, y, z, vx, vy, vz
dt_each = 'f4'
dt = np.dtype([(field, dt_each) for field in ('x', 'y', 'z', 'vx', 'vy', 'vz')])

# number of MPI tasks per dimension
nodes_dim = 4

# volume size
rnc = 3072.

# subvolume size
ncc = rnc / nodes_dim

# physical scale in Mpc/h
phys_scale = 505.

In [None]:
#NOTE: specific to using one SLICS file at a time rather than combining!
N_file = N_part / (nodes_dim**3)
N_slice = N_file

# whole slics comoving size
slics_comov = phys_scale * h * ap.units.Mpc
fileside_comov = slics_comov / float(nodes_dim)
# whole slics proper size at each redshift
slics_prop = slics_comov / (1 + z_SLICS)
fileside_prop = slics_prop / float(nodes_dim)

# box depth in comoving coordinates
boxdepth = fileside_comov#phys_scale / float(nodes_dim) * h * 1e6 * ap.units.pc#comoving distance in pc
# box side length in physical coordinates as a function of redshift
boxside = fileside_prop#proper distance in pc

In [None]:
# comoving distance to SLICS snapshot
d_comov = []
for z in z_SLICS:
    dc = cosmo.comoving_distance(float(z))
    d_comov.append(dc.value)
d_comov = np.array(d_comov) * ap.units.Mpc

# redshift bin ends around SLICS snapshot
for j in range(nodes_dim):
    i = j + 1
    d_comov_mins = d_comov - i * boxdepth / 2.
    d_comov_maxs = d_comov + i * boxdepth / 2.
    min_zs = np.array([ap.cosmology.z_at_value(cosmo.comoving_distance, d_comov_min) for d_comov_min in d_comov_mins])
    max_zs = np.array([ap.cosmology.z_at_value(cosmo.comoving_distance, d_comov_max) for d_comov_max in d_comov_maxs])

    with open('DEAR/Data/'+str(i)+'file-z-ends.txt', 'wb') as zbinfile:
        zbinfile.write(b'# z_SLICS min_z max_z\n')
        np.savetxt(zbinfile, np.vstack((z_SLICS, min_zs, max_zs)).T)

In [None]:
# target angular size of a bubble, in degrees
crit_ang = 2.5#5.

# angular diameter distance to SLICS snapshot
# (the bin edges are symmetric about d_comov, so their mean is d_comov itself)
d_ang = (d_comov_maxs + d_comov_mins) / 2. / (1 + z_SLICS)
# angular diameter on the sky of one SLICS file from each redshift snapshot
angbox = (boxside / d_ang) * (180. / np.pi * ap.units.degree)
# fraction of a SLICS file's length (in any linear coordinates) that will subtend 2.5 degrees on sky
side_frac = crit_ang * ap.units.degree / angbox

#NOTE: change this assertion once there's infrastructure for joining SLICS files in the plane
# a fraction above 1 would mean a single file is too small to cover crit_ang
assert(np.all(side_frac <= 1.))


# particle count per unit angular area of the whole box (angbox * nodes_dim on a side)
avg_dens_SLICS = N_part / ((angbox * nodes_dim) ** 2)

In [None]:
# map a label such as 'z0box41' to its (redshift index, SLICS file index) pair
bubble_combos = {}
for i in range(len(z_SLICS)):
    for j in [41]:#21, 22, 25, 26, 37, 38, 41, 42]:
        #NOTE: j loop will change a lot when SLICS files are combined
        bubble_combos['z'+str(i)+'box'+str(j)] = (i, j)

# one output directory per redshift index; os.makedirs creates missing parents and is
# a no-op for directories that already exist (the shell 'mkdir' used before failed
# silently when a parent directory was missing)
pathname = 'DEAR/Data/const_ang'
os.makedirs(pathname, exist_ok=True)
for testz in range(len(z_SLICS)):
    zpath = pathname+'/z'+str(testz)
    os.makedirs(zpath, exist_ok=True)

# number of worker processes handed to multiprocessing.Pool
nps = 20#mp.cpu_count() - 1

### pipeline functions: make manageable bubbles

TODO: combine z coordinates over multiple files, using bash scripts.
Because of how the data is distributed across the files, I think 21, 22, 25, 26, 37, 38, 41, 42 are "adjacent" and free of edge effects.
_Note_: This can be an automated check: two files are adjacent when their `node_coords` are identical except for differing by one in exactly one dimension.

In [None]:
# TODO: try dask for speed!

def help_read(which_z, fn_index):
    """Load one SLICS particle file into a DataFrame without duplicate rows.

    Parameters
    ----------
    which_z : int
        Index into ``z_SLICS`` selecting the snapshot redshift.
    fn_index : int
        Index of the file within the snapshot; it is part of the file name.

    Returns
    -------
    pandas.DataFrame
        Columns ``x, y, z, vx, vy, vz``, one row per record read from the file.
    """
    # file names start with the redshift zero-padded on the right to five characters
    z_str = '{:<05}'.format(str(z_SLICS[which_z]))
    fn = z_str + 'xv' + str(fn_index) + '.dat'
    data_dir = 'SLICS/particle_data/cuillin.roe.ac.uk/~jharno/SLICS/SLICS_HR/LOS1'
    with open(os.path.join(data_dir, fn), 'rb') as f1:
        raw_data = np.fromfile(f1, dtype=dt)
    # the first two records are skipped -- presumably a file header; TODO confirm
    loc_data = pd.DataFrame(data=raw_data[2:], columns=['x', 'y', 'z', 'vx', 'vy', 'vz'])
    if loc_data.duplicated().any():
        print('duplicates found in z='+str(z_SLICS[which_z])+' box='+str(fn_index)+'!')
        # drop_duplicates returns a new frame; the result was previously discarded,
        # so the assertion below failed whenever a duplicate was present
        loc_data = loc_data.drop_duplicates()
    assert not loc_data.duplicated().any()
    return loc_data

TODO: add a `finedensity` step (a finer density estimate) to pick sane bubble centers.

In [None]:
def isolate_one_bubble(one_key):
    """Cut the central bubble out of one SLICS file and save it as ``particles.csv``.

    The bubble is the rectangle centred on the x/y extent of the file whose sides are
    ``side_frac[testz]`` times that extent. Nothing is done if the output directory
    already contains files.

    Parameters
    ----------
    one_key : str
        Key of ``bubble_combos`` giving the (redshift index, file index) pair.

    Returns
    -------
    None
    """
    (testz, testfn) = bubble_combos[one_key]
    print('starting z='+str(z_SLICS[testz]))
    zpath = pathname+'/z'+str(testz)
    boxpath = zpath+'/box'+str(testfn)
    if not os.path.isdir(boxpath):
        # creates missing parents too, unlike the shell 'mkdir' used previously
        os.makedirs(boxpath, exist_ok=True)
    elif os.listdir(boxpath) != []:
        # a non-empty directory is taken to mean this box was already processed
        print('not-rerunning z='+str(z_SLICS[testz])+' box='+str(testfn))
        return None
    print('starting box='+str(testfn)+', loading SLICS data (the slow step)')
    loc_data = help_read(which_z=testz, fn_index=testfn)
    print('loaded SLICS data, identifying bubble centers, moving on')
    extremex = np.array([loc_data['x'].min(), loc_data['x'].max()])
    extremey = np.array([loc_data['y'].min(), loc_data['y'].max()])
    # midpoint and half-width of the selection rectangle in each direction
    center = np.array([extremex[0] + extremex[1], extremey[0] + extremey[1]]) / 2.
    radius = np.array([extremex[1] - extremex[0], extremey[1] - extremey[0]]) * side_frac[testz] / 2.
    bubble = loc_data.loc[lambda df: (df['x'] > center[0] - radius[0]) & (df['x'] < center[0] + radius[0])
                          & (df['y'] > center[1] - radius[1]) & (df['y'] < center[1] + radius[1]), :]
    # NOTE: a multi-bubble variant (one bubble per cell of a coarse 2D histogram above
    # its 99th percentile) used to live here as commented-out code; what_is_bubble in
    # the scratch section contains the same selection logic.
    bubble.to_csv(boxpath+'/particles.csv', index=False)
    return

In [None]:
# pool = mp.Pool(nps)
# pool.map(isolate_one_bubble, bubble_combos.keys())

### pipeline functions: transform data

In [None]:
def help_find_coords(fn_index, n_dim=None):
    """Convert a flat file index into its (x, y, z) node coordinates.

    Files are numbered ``x + y * n_dim + z * n_dim**2`` with each coordinate running
    from 0 to ``n_dim - 1``; this inverts that numbering directly instead of scanning
    every combination.

    Parameters
    ----------
    fn_index : int
        Flat index of the file.
    n_dim : int, optional
        Number of nodes per dimension. Defaults to the notebook-level ``nodes_dim``.

    Returns
    -------
    dict or None
        ``{'x': ..., 'y': ..., 'z': ...}``, or ``None`` when ``fn_index`` is not an
        integer value inside ``[0, n_dim**3)``.
    """
    if n_dim is None:
        n_dim = nodes_dim
    try:
        index = int(fn_index)
    except (TypeError, ValueError, OverflowError):
        return None
    # reject non-integral values and anything outside the grid
    if index != fn_index or not 0 <= index < n_dim ** 3:
        return None
    rest, x_coord = divmod(index, n_dim)
    z_coord, y_coord = divmod(rest, n_dim)
    return {'x': x_coord, 'y': y_coord, 'z': z_coord}

In [None]:
def help_shift(true_node_coords, loc_data, subvolume_size=None, volume_size=None):
    """Shift file-local x/y coordinates into the coordinates of the whole volume.

    Each of the ``x`` and ``y`` columns is offset by the node coordinate times the
    subvolume size and wrapped periodically into ``[0, volume_size)``. Other columns
    are returned unchanged, and the input frame is not modified.

    Parameters
    ----------
    true_node_coords : dict
        Node coordinates keyed by ``'x'`` and ``'y'`` (as returned by
        ``help_find_coords``).
    loc_data : pandas.DataFrame
        Particle data with at least ``x`` and ``y`` columns.
    subvolume_size : float, optional
        Side length of one file's subvolume. Defaults to the notebook-level ``ncc``.
    volume_size : float, optional
        Side length of the whole volume. Defaults to the notebook-level ``rnc``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``loc_data`` with shifted ``x`` and ``y``.
    """
    if subvolume_size is None:
        subvolume_size = ncc
    if volume_size is None:
        volume_size = rnc
    # work on a copy so the caller's frame keeps its local coordinates
    glob_data = loc_data.copy()
    for col in ['x', 'y']:#, 'z']:
        glob_data[col] = np.remainder(loc_data[col] + true_node_coords[col] * subvolume_size, volume_size)
        # every value must lie inside the volume (max() of the mask only tested "any")
        assert (glob_data[col] <= volume_size).all()
    return glob_data

In [None]:
def help_convert(glob_data):
    """Rescale grid coordinates to physical ones: divide by ``rnc``, multiply by ``slics_comov``."""
    # fraction of the full volume side, then scaled by the comoving box size
    as_fraction = glob_data / rnc
    phys_data = as_fraction * slics_comov
    return phys_data

In [None]:
def project_one_bubble(one_key):
    """Project one saved bubble onto the sky and write ``projection.csv`` / ``projection.png``.

    The bubble's ``particles.csv`` is shifted to global coordinates, converted with
    ``help_convert``, and its x/y positions are divided by the angular diameter
    distance of the snapshot to give RA/DEC in degrees.

    Parameters
    ----------
    one_key : str
        Key of ``bubble_combos`` giving the (redshift index, file index) pair.

    Returns
    -------
    tuple
        ``(one_key, number of projected particles)``.
    """
    (testz, testfn) = bubble_combos[one_key]
    true_node_coords = help_find_coords(fn_index=testfn)
    zpath = pathname+'/z'+str(testz)
    fullpath = zpath+'/box'+str(testfn)
    bubble = pd.read_csv(os.path.join(fullpath, 'particles.csv'))
    glob_data = help_shift(true_node_coords, bubble)
    phys_data = help_convert(glob_data)
    ang_data = pd.DataFrame()
    # small-angle conversion from transverse distance to degrees
    ang_data['RA'] = phys_data['x'] / d_ang[testz] * 180. / np.pi
    ang_data['DEC'] = phys_data['y'] / d_ang[testz] * 180. / np.pi
    # use a dedicated figure so repeated calls in one process neither draw on top of
    # each other nor accumulate open figures
    fig, ax = plt.subplots()
    ax.hist2d(ang_data['RA'], ang_data['DEC'], bins=(256, 256), norm=mpl.colors.LogNorm(), cmap='Spectral_r')
    fig.savefig(os.path.join(fullpath, 'projection.png'))
    plt.close(fig)
    # the warning concerns the projected positions, so test those rather than the
    # full six-column input
    if ang_data.duplicated().any():
        print('duplicate particles introduced by dropping z, vx, vy, vz')
    ang_data.to_csv(os.path.join(fullpath, 'projection.csv'), index=False)
    print('shifted from machine, converted to physical, projected to angular coordinates, and saved '+str(len(ang_data)))
    return (one_key, len(ang_data))

In [None]:
# pool = mp.Pool(nps)
# npart_all = pool.map(project_one_bubble, bubble_combos.keys())

## model the large-scale structure

In [None]:
import bisect
import csv
import matplotlib.pylab as plt
import multiprocessing as mp
import numpy as np
import os
from os import listdir
from os.path import isfile, join
import pandas as pd
import random
import scipy.interpolate as spi
import time

In [None]:
# import R
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
import rpy2.robjects as ro

# # only run this once ever
# utils = importr('utils')
# utils.install_packages('spatstat')
# utils.install_packages('sparr')
# utils.install_packages('dplyr')
# utils.install_packages('magrittr')
# utils.install_packages('reshape2')

# import R packages

# attach each R package used by the KDE step, in the original order
for _r_pkg in ('spatstat', 'sparr', 'dplyr', 'magrittr', 'reshape2'):
    ro.r('library(' + _r_pkg + ')')

# prevent truncation of floats from files
ro.r('options(digits=20)')

In [None]:
pathname = 'DEAR/Data/const_ang'

# Rebuild bubble_combos for the modelling stage: keys are now the per-box output
# directories (already prefixed with pathname); values are (redshift index, file index).
# NOTE(review): the preparation stage above builds box 41 while this cell expects
# box 38 -- confirm which file index is intended.
bubble_combos = dict()
for i in range(z_SLICS.shape[0]):
    #Note: j loop will change a lot when SLICS files are combined
    # candidate file indices: 21, 22, 25, 26, 37, 38, 41, 42
    for j in (38,):
        boxpath = '{}/z{}/box{}'.format(pathname, i, j)
        bubble_combos[boxpath] = (i, j)

In [None]:
# number of KDE grid cells per side
res = 256# should be 1024

# initial KDE bandwidth per redshift: 2.5 divided by the bubble side length in Mpc
# and by log2(res).
# side_frac * boxside is an astropy Quantity; to_value() turns it into a plain float
# array, so str(bw_ival[i]) is a bare number when it is pasted into R code (the
# previous '/ 1.e6 / ap.units.pc' form left a 'pc / Mpc' unit attached).
bw_ival = 2.5 / (side_frac * boxside).to_value(ap.units.Mpc) / np.log2(res)

# R accessor on the density object -> base name of the CSV file it is written to
to_save = {'$z$v': 'kdegrid',
           '$him$v': 'bwgrid',
           '$z$xcol': 'xgrid',
           '$z$yrow': 'ygrid',
           '$h': 'bwpart'
          }

### fit a KDE

In [None]:
def one_kde(one_key):
    """Fit an adaptive bivariate KDE (R ``sparr``) to one projected bubble.

    Reads ``projection.csv`` from the box directory, runs ``bivariate.density`` in R,
    writes every entry of ``to_save`` as a CSV into the same directory, and appends a
    timing line to ``progress.txt`` under ``pathname``.

    Parameters
    ----------
    one_key : str
        Key of ``bubble_combos``; it is the box directory itself (it already starts
        with ``pathname``).

    Returns
    -------
    None
    """
    (i, j) = bubble_combos[one_key]
    # the key already contains pathname, so it must not be prefixed a second time
    ro.r('dataRAW <- read.csv("'+one_key+'/projection.csv'+'")')

    # get single columns from data frame
    ro.r('x <- dataRAW$RA')
    ro.r('y <- dataRAW$DEC')

    # put in the correct format
    ro.r('myPointData <- ppp(x, y, xrange=range(x), yrange=range(y))')

    # read variables into R section
    # float() guarantees a bare number in the R source even if bw_ival still carries
    # a (dimensionless) astropy unit
    ro.r('h0 = '+str(float(bw_ival[i])))
    ro.r('resolution = '+str(res))#4096

    ti = time.time()
    # the worker count is split evenly between the redshift bins
    ro.r('ddest <- bivariate.density(myPointData, h0=h0, adapt=TRUE, resolution=resolution, parallelise='+str(int(nps / len(z_SLICS)))+')')
    # consider use.ppp.methods = TRUE
    # ro.r('ddest <- multiscale.density(myPointData, h0=h0)')
    elapsed = time.time() - ti
    message = 'z'+str(i)+'/box'+str(j)+' kde completed in '+str(elapsed)+' seconds\n'
    print(message)

    # write essential output to files
    for key in to_save.keys():
        ro.r('write.csv(ddest'+key+', "'+pathname+'/z'+str(i)+'/box'+str(j)+'/'+to_save[key]+'.csv")')

    with open(os.path.join(pathname, 'progress.txt'), 'a') as stdout:
        stdout.write(message)

    return

In [None]:
# pool = mp.Pool(nps)
# pool.map(one_kde, bubble_combos.keys())

### sample the KDEs

In [None]:
#NOTE: this should be expected value of a galaxy mass function
avg_gal_mass = 1.#.e12 * ap.units.M_sun

npatch = 4

# gama_xlims, gama_ylims = {}, {}
# gama_xlims[0] = [ 30.20075 , 38.79967 ]
# gama_ylims[0] = [ -10.24806 , -3.34789 ]
# gama_xlims[1] = [ 129.00008 , 140.99921 ]
# gama_ylims[1] = [ -1.9999900000000002 , 2.99992 ]
# gama_xlims[2] = [ 173.69011 , 185.99942 ]
# gama_ylims[2] = [ -2.99973 , 2.00243 ]
# gama_xlims[3] = [ 211.49796 , 223.49988 ]
# gama_ylims[3] = [ -1.99995 , 2.9999599999999997 ]

allz_gama_areas, allz_gama_dens, allz_gama_cts = [], [], []
# allz_gama_coords = []
for i in range(len(z_SLICS)):
    gama_areas, gama_dens, gama_cts = np.empty(npatch), np.empty(npatch), np.empty(npatch)
#     gama_coords = np.empty(4)
    gama = pd.read_csv('../environmet_clustering/classes/z_'+'{:<05}'.format(str(z_SLICS[i]))+'_manygroups_oneslice.csv')
    print(gama.columns)
    for l in gama['patch'].unique():
#         gama_coords[l] = gama[gama['patch'] == l][['RA_x', 'DEC_x']]
#         what_is_bubble(gama_coords[l], 'RA_x', 'DEC_x', i)
        gama_xlims = [gama[gama['patch'] == l]['RA_x'].min(), gama[gama['patch'] == l]['RA_x'].max()]
        gama_ylims = [gama[gama['patch'] == l]['DEC_x'].min(), gama[gama['patch'] == l]['DEC_x'].max()]
        gama_areas[l] = (gama_xlims[1] - gama_xlims[0]) * (gama_ylims[1] - gama_ylims[0])
        gama_cts[l] = gama.groupby('patch').count()['CATAID'][l]
        gama_dens[l] = avg_gal_mass * gama_cts[l] / gama_areas[l]
    allz_gama_areas.append(gama_areas)
    allz_gama_dens.append(gama_dens)
    allz_gama_cts.append(gama_cts)
#     allz_gama_coords.append(gama_coords)

In [None]:
# recall definition of galaxy bias b_g = delta_g / delta_m = (rho_g - bar[rho_g]) / (rho_m - bar[rho_m]) * (bar[rho_m] / bar[rho_g])
# rho_g = b_g * (bar[rho_g] / bar[rho_m]) * (rho_m - bar[rho_m]) + bar[rho_g]
# rho_m = KDE
# bar[rho_m] = 1 / atot (assuming bubble has average density overall)
# bar[rho_g] = N_GAMA / A_GAMA

# constant linear bias factor; the sampling code compares density contrasts against
# it and also embeds it in the mock-position file names
b_g = 1.5

# bias_thresh_rel = b_g * (dens_rel * (A_SLICS / 256**2))
# print(bias_thresh_rel)

In [None]:
def sample_kde(key):
    """Draw mock galaxy positions from the saved KDE of one box, once per GAMA patch.

    Grid cells are drawn with probability proportional to the KDE and kept only when
    their density contrast is at least ``b_g - 1``. For each patch ``l`` the number
    of galaxies is ``int(atot * allz_gama_dens[i][l])``; positions are written to
    ``patch<l>mockpos<b_g>.csv`` in the box directory.

    Parameters
    ----------
    key : str
        Key of ``bubble_combos``; also the directory holding the KDE CSV files.

    Returns
    -------
    list of numpy.ndarray
        One array of drawn ``[x, y]`` positions per patch.

    Raises
    ------
    ValueError
        If galaxies are requested but no grid cell reaches the contrast threshold
        (the rejection loop would otherwise never finish).
    """
    basefn = key
    (i, j) = bubble_combos[key]

    # NOTE(review): xran is read from ygrid.csv and yran from xgrid.csv; with kdegrid
    # rows corresponding to yrow and columns to xcol this looks swapped -- confirm
    xran = pd.read_csv(os.path.join(basefn, 'ygrid.csv')).to_numpy()[:, 1]
    yran = pd.read_csv(os.path.join(basefn, 'xgrid.csv')).to_numpy()[:, 1]
    allpos = np.meshgrid(xran, yran)
    xmin, xmax = xran[0], xran[-1]
    ymin, ymax = yran[0], yran[-1]
    atot = (ymax - ymin) * (xmax - xmin)
    da = atot / ((res - 1) * (res - 1))

    # drop the first CSV column (the row labels written by R)
    kde = pd.read_csv(os.path.join(basefn, 'kdegrid.csv')).to_numpy().T[1:].T

    # probability mass per cell and its cumulative distribution over the flattened grid
    kdenorm = kde * da
    kdetot = np.sum(kdenorm)
    assert(np.isclose(kdetot, 1.))
    kdeavg = np.mean(kdenorm)
    linearized = np.cumsum(kdenorm.reshape((1, res * res)))
    kde_contrast = (kdenorm - kdeavg) / kdeavg
    # TODO: maybe write a function to calculate density contrasts?

    plt.hist(kde_contrast.flatten(), density=True)
    plt.vlines(b_g - 1., 0., 50.)
    plt.title(str(key))
    plt.savefig(os.path.join(basefn, 'kde_dens_contrast.png'))
    plt.show()
    plt.close()

    # cells allowed to host a galaxy
    # this should be (kdenorm[galind] - slicsavgdens) / slicsavgdens * gamapatchavgdens >= b_g
    # will have to account for mass of particles vs galaxies!
    eligible = kde_contrast >= b_g - 1.
    last_cell = res * res - 1

    allpatch = []
    for l in range(npatch):
        ngal = int(atot * allz_gama_dens[i][l])
        if ngal > 0 and not eligible.any():
            raise ValueError('no KDE cell in '+str(key)+' reaches density contrast '+str(b_g - 1.))
        galpos = []
        while len(galpos) < ngal:
            rando = random.random()
            # the cumulative sum only reaches ~1, so a draw above its last entry
            # would index one past the grid; clamp it to the last cell
            loc = min(bisect.bisect(linearized, rando), last_cell)
            galind = np.unravel_index(loc, (res, res))
            if eligible[galind]:
                # NOTE: instead, could maybe use position of a particle in the drawn cell
                galpos.append([allpos[0][galind], allpos[1][galind]])

        galpos = np.array(galpos)
        np.savetxt(os.path.join(basefn, 'patch'+str(l)+'mockpos'+str(b_g)+'.csv'), galpos)
        allpatch.append(galpos)
    return allpatch

In [None]:
# run sample_kde for every box in parallel; the context manager shuts the worker
# processes down once map() has returned all results
with mp.Pool(nps) as pool:
    allpatches = pool.map(sample_kde, bubble_combos.keys())

TODO: plot the sampled mock galaxy positions on top of KDE

## scratch

benchmarking test

In [None]:
# smallest subsample size is 10**kmin particles
kmin = 5
# subsample directory -> (redshift index, file index, log10 of the subsample size)
benchmark_combos = {}
for i in range(len(z_SLICS)):
    for j in [38]:#21, 22, 25, 26, 37, 38, 41, 42]:
        boxpath = pathname+'/z'+str(i)+'/box'+str(j)
        one_ang = pd.read_csv(boxpath+'/projection.csv')
        # order of magnitude of the particle count bounds the largest subsample
        npart_oom = int(np.floor(np.log10(len(one_ang))))
        k = kmin
        while k <= npart_oom:
            samppath = boxpath+'/samp'+str(k)
            # no-op when the directory already exists
            os.makedirs(samppath, exist_ok=True)
            benchmark_combos[samppath] = (i, j, k)#+'/patch'+str(k)] = (i, j, l, k)
            subsamp = one_ang.sample(10**k)
            subsamp.to_csv(os.path.join(samppath, 'projection.csv'), index=False)
            k += 1

In [None]:
def benchmark_kde(one_key):
    """Time the adaptive KDE fit on one subsampled projection.

    Reads ``projection.csv`` from the subsample directory, runs
    ``bivariate.density`` in R, writes every entry of ``to_save`` as a CSV into that
    directory, and appends a timing line to ``benchmark.txt`` under ``pathname``.

    Parameters
    ----------
    one_key : str
        Key of ``benchmark_combos``; it is the subsample directory itself.

    Returns
    -------
    None
    """
    (i, j, k) = benchmark_combos[one_key]
    ro.r('dataRAW <- read.csv("'+one_key+'/projection.csv'+'")')

    # get single columns from data frame
    ro.r('x <- dataRAW$RA')
    ro.r('y <- dataRAW$DEC')

    # put in the correct format
    ro.r('myPointData <- ppp(x, y, xrange=range(x), yrange=range(y))')

    # read variables into R section
    # float() guarantees a bare number in the R source even if bw_ival still carries
    # a (dimensionless) astropy unit
    ro.r('h0 = '+str(float(bw_ival[i])))
    ro.r('resolution = '+str(res))#4096
    # NOTE(review): this only assigns an R variable; it is not passed to
    # bivariate.density below, so it presumably has no effect -- confirm
    ro.r('use.ppp.methods = TRUE')

    ti = time.time()
    ro.r('ddest <- bivariate.density(myPointData, h0=h0, adapt=TRUE, resolution=resolution)')
    # ro.r('ddest <- multiscale.density(myPointData, h0=h0)')
    elapsed = time.time() - ti
    message = one_key+' kde at '+str(res)+'x'+str(res)+' completed in '+str(elapsed)+' seconds\n'
    print(message)

    # write essential output to files
    for key in to_save.keys():
        ro.r('write.csv(ddest'+key+', "'+one_key+'/'+to_save[key]+'.csv")')

    with open(os.path.join(pathname, 'benchmark.txt'), 'a') as stdout:
        stdout.write(message)

    return

In [None]:
# pool = mp.Pool(nps)
# pool.map(benchmark_kde, benchmark_combos.keys())

In [None]:
# note resolution needs to be scaled for angular area!
def what_is_bubble(loc_data, xname, yname, zbin, bins=None):
    """Cut square bubbles around the densest cells of a coarse 2D histogram.

    Cells whose count exceeds the 99th percentile of all cells become bubble centres;
    each bubble holds the rows of ``loc_data`` within ``minscale`` of a centre in both
    coordinates.

    Parameters
    ----------
    loc_data : pandas.DataFrame
        Point data.
    xname, yname : str
        Names of the two coordinate columns.
    zbin : int
        Index into ``d_comov`` used to set the bubble half-width.
    bins : int, optional
        Histogram bins per side. Defaults to ``int(resolution)``, looked up when the
        function is called (``resolution`` is defined in a later cell, so evaluating
        it in the signature failed on a fresh kernel).

    Returns
    -------
    list of pandas.DataFrame
        One frame per bubble.
    """
    if bins is None:
        bins = int(resolution)
    (coarsedensity, xedges, yedges) = np.histogram2d(loc_data[xname], loc_data[yname], bins=bins)
    extreme = np.quantile(coarsedensity.flatten(), 0.99)
    indcenters = np.argwhere(coarsedensity > extreme)
    #NOTE: finedensity step would go here
    # centre of each selected cell = midpoint of its bin edges
    xcenters = (xedges[indcenters.T[0]] + xedges[indcenters.T[0]+1]) / 2
    ycenters = (yedges[indcenters.T[1]] + yedges[indcenters.T[1]+1]) / 2
    # NOTE(review): d_comov carries Mpc units, so this is a pc / Mpc quantity rather
    # than a plain number of degrees -- confirm the comparison below behaves as intended
    minscale = 10.e6 * ap.units.pc / d_comov[zbin] * 180 / np.pi
    bubbles = []
    for i, center in enumerate(indcenters):
        bubble = loc_data.loc[lambda df: (df[xname] > xcenters[i] - minscale) & (df[xname] < xcenters[i] + minscale)
                          & (df[yname] > ycenters[i] - minscale) & (df[yname] < ycenters[i] + minscale), :]
        bubbles.append(bubble)
    return bubbles

In [None]:
# ratio of one GAMA patch density to the SLICS density at the same redshift index
# NOTE: dens_SLICS is only defined in the "attic" cell near the end of the notebook,
# so that cell has to be run before this one
allz_gama_dens[1][2] / dens_SLICS[1]

The number of mock galaxy samples is derived empirically by maximizing coverage with the GAMA environment curves: draw a large number of sample positions, then recalculate the environment curves with subsets until they match.

In [None]:
# compare one bubble (z index 0, box 38) against the full SLICS slice
basefn = pathname+'/z'+str(0)+'/box'+str(38)+'/'
# fraction of the slice's particles that fall inside the bubble
rel_part = len(pd.read_csv(basefn+'particles.csv')) / N_slice
# NOTE(review): xran is read from ygrid.csv and yran from xgrid.csv, as in sample_kde -- confirm
xran = pd.read_csv(basefn+'ygrid.csv').to_numpy()[:, 1]
yran = pd.read_csv(basefn+'xgrid.csv').to_numpy()[:, 1]
xmin, xmax = xran[0], xran[-1]
ymin, ymax = yran[0], yran[-1]
# angular area covered by the KDE grid
atot = (ymax - ymin) * (xmax - xmin) * (ap.units.degree)**2
# NOTE: ang_area_SLICS is only defined in the "attic" cell near the end of the notebook
rel_area = atot / ang_area_SLICS[0].value
rel_dens = rel_part / rel_area

In [None]:
# show the relative particle count, area and density computed in the previous cell
print(rel_part)
print(rel_area)
print(rel_dens)

In [None]:
# NOTE: dens_SLICS is only defined in the "attic" cell near the end of the notebook
dens_SLICS

In [None]:
# density contrast of the bubble relative to dens_SLICS[0]:
# (bubble mass per angular area - reference) / reference
dens_contra_slics = (len(pd.read_csv(basefn+'particles.csv')) * M_part / atot - dens_SLICS[0]) / dens_SLICS[0]
print(dens_contra_slics)

In [None]:
def one_patch(one_key):
    """Scratch variant of ``sample_kde`` for the older per-bubble directory layout.

    Draws 1e5 mock positions per patch from the KDE stored under
    ``<pathname>/z<i>/box<j>/bub<k>/`` and writes them, transposed to shape (2, n),
    to ``patch<l>/mockpos<b_g>.csv``.

    NOTE(review): this unpacks three values per key, whereas the ``bubble_combos``
    built earlier in the notebook stores (i, j) pairs, and the ``bub<k>`` directories
    are no longer produced by the pipeline above -- confirm before reuse.

    Parameters
    ----------
    one_key : str
        Key of ``bubble_combos``.

    Returns
    -------
    None
    """
    (i, j, k) = bubble_combos[one_key]
    basefn = pathname+'/z'+str(i)+'/box'+str(j)+'/bub'+str(k)+'/'

    # NOTE(review): xran is read from ygrid.csv and yran from xgrid.csv -- confirm
    xran = pd.read_csv(basefn+'ygrid.csv').to_numpy()[:, 1]
    yran = pd.read_csv(basefn+'xgrid.csv').to_numpy()[:, 1]
    allpos = np.meshgrid(xran, yran)

    xmin, xmax = xran[0], xran[-1]
    ymin, ymax = yran[0], yran[-1]
    atot = (ymax - ymin) * (xmax - xmin)
    da = atot / ((res - 1) * (res - 1))

    # drop the first CSV column (the row labels written by R)
    kde = pd.read_csv(basefn+'kdegrid.csv').to_numpy().T[1:].T
    kdenorm = kde * da
    kdetot = np.sum(kdenorm)
    assert(np.isclose(kdetot, 1.))
    kdeavg = np.mean(kdenorm)

    linearized = np.cumsum(kdenorm.reshape((1, res * res)))
    last_cell = res * res - 1

    for l in range(npatch):
        ngal = 1e5#int(atot * allz_gama_dens[i][l])
        galpos = []
        while len(galpos) < ngal:
            rando = random.random()
            # clamp: a draw above the last cumulative value would index past the grid
            loc = min(bisect.bisect(linearized, rando), last_cell)
            galind = np.unravel_index(loc, (res, res))
            # this should be (kdenorm[galind] - slicsavgdens) / slicsavgdens * gamapatchavgdens >= b_g
            # will have to account for mass of particles vs galaxies!
            # NOTE(review): the threshold here is b_g, while sample_kde uses b_g - 1.
            if (kdenorm[galind] - kdeavg) / kdeavg >= b_g:
                # could maybe use position of a particle in the drawn cell
                galpos.append([allpos[0][galind], allpos[1][galind]])
        galpos = np.array(galpos).T
        # no-op when the directory already exists
        os.makedirs(basefn+'patch'+str(l), exist_ok=True)
        np.savetxt(basefn+'patch'+str(l)+'/mockpos'+str(b_g)+'.csv', galpos)

    return

In [None]:
# run one_patch for every key in parallel; the context manager shuts the worker
# processes down once map() has returned
with mp.Pool(nps) as pool:
    pool.map(one_patch, bubble_combos.keys())

In [None]:
# load mock positions written with b_g = 1.0 and b_g = 1.5 for the same patch
# NOTE(review): these paths live under 'DEAR/Data/bubbles', not under the
# 'DEAR/Data/const_ang' tree used by pathname -- presumably an earlier layout; confirm
nobias = np.loadtxt('DEAR/Data/bubbles/z4/box37/bub2/patch2/mockpos1.0.csv')
bias = np.loadtxt('DEAR/Data/bubbles/z4/box37/bub2/patch2/mockpos1.5.csv')

In [None]:
# shape of the loaded b_g = 1.0 mock positions
np.shape(nobias)

In [None]:
# overlay both samples; rows 0 and 1 hold the two coordinates (one_patch saves the
# positions transposed)
plt.scatter(nobias[0], nobias[1], c='b', alpha=0.005, s=1)
plt.scatter(bias[0], bias[1], c='r', alpha=0.005, s=1)

In [None]:
# bin both samples on a res x res grid; element 0 of each result is the count array
biashist = np.histogram2d(bias[0], bias[1], bins=res)
nobiashist = np.histogram2d(nobias[0], nobias[1], bins=res)

In [None]:
# distribution of per-cell counts for the two samples (integer bin edges 0..49)
bins = range(50)
plt.hist(biashist[0].flatten(), color='r', alpha=0.5, bins=bins)
plt.hist(nobiashist[0].flatten(), color='k', alpha=0.5, bins=bins)

In [None]:
# reload the GAMA catalogue for redshift index 1 (overwrites the gama frame left by the loop above)
gama = pd.read_csv('../environmet_clustering/classes/z_'+'{:<05}'.format(str(z_SLICS[1]))+'_manygroups_oneslice.csv')

In [None]:
# list the catalogue's column names
gama.columns

In [None]:
# NOTE(review): every assignment to allz_gama_coords visible in this notebook is
# commented out, so this presumably raises NameError on a fresh kernel -- confirm
allz_gama_coords

In [None]:
# 2D histogram of one patch's RA/DEC; depends on allz_gama_coords, whose visible
# assignments are all commented out
plt.hist2d(allz_gama_coords[0][1]['RA_x'], allz_gama_coords[0][1]['DEC_x'])

In [None]:
# load one saved mock-position file (path under the 'DEAR/Data/bubbles' tree)
test = np.loadtxt('DEAR/Data/bubbles/z3/box37/bub0/patch1/mockpos.csv')

In [None]:
# display the loaded positions
test

In [None]:
# load a saved KDE grid as a DataFrame (first CSV column included)
kde = pd.read_csv('DEAR/Data/bubbles/z3/box37/bub0/kdegrid.csv')

In [None]:
# quick look at the grid; unlike sample_kde, the first CSV column is not stripped here
plt.imshow(kde)

attic: scratch after here

In [None]:
# NOTE: this cell defines dens_SLICS, ang_area_SLICS, rmin and resolution, which
# cells earlier in the notebook refer to; it has to be run before them.
#NOTE: currently using bubble size based on physical scale of galaxy clusters,
#but does need to be adjusted for angular size so it makes sense for making mock data
dmax = 10.e6 * ap.units.pc
rmax = dmax / h / 2.

# side of the whole box squared (in pc^2), then divided by the squared angular
# diameter distance (with a radian-to-degree factor) to give an angular area
area_SLICS = (phys_scale * h * 1.e6 * ap.units.pc)**2
ang_area_SLICS = area_SLICS / (d_ang / 180. / ap.units.degree * np.pi)**2
# mass of one file's particles per unit angular area
dens_SLICS =  N_slice * M_part / ang_area_SLICS
#NOTE: can use this angular area to define appropriate bubble radius/kde resolution relationship

# rmax expressed in grid units of one file
cubeside = rnc / nodes_dim
dmin = cubeside * rmax / (phys_scale * 1.e6 * ap.units.pc / nodes_dim)
rmin = dmin / 2.

#NOTE: this physical resolution should be replaced with an angular-motivated bubble size resolution (in physical units)
# number of rmax-sized steps across one file's side
resolution = (phys_scale * 1.e6 * ap.units.pc / nodes_dim) / rmax

In [None]:
# #make bubbles of constant 2.5 degrees size
# bub_ang = 2.5 * np.pi / 180.
# def slics_to_ang(dphys, z): 
#     phys = dphys * ap.units.pc
#     dc = cosmo.comoving_distance(float(z))
#     da = dc / (1 + z)
#     ang = (phys / da) * (180. / np.pi)
#     return ang
    
# def ang_to_phys(ang, z):
#     def helper(phys):
#         angres = slics_to_ang(phys, z)
#         dif = np.abs(ang - angres)
#         return(dif)
#     res = spo.minimize(helper, boxside/(1+z))
#     frac_box = res.x * ap.units.pc / boxside
#     return frac_box

# ang_to_phys(2.5, z_SLICS[0])