**Importing libraries**

In [25]:
import numpy as np 
import pandas as pd 
import multiprocessing as multi
from scipy.ndimage import gaussian_filter
import matplotlib.pyplot as plt
import os
from IPython.display import clear_output

**Capturing name of the PC**

In [26]:
%%capture PC_name 
#run the shell `hostname` command; its output is captured into `PC_name` (read later via PC_name.stdout)
!hostname

**Setting notebook parameters**

In [27]:
#setting dots per inch for images
dpi = 90

#setting number of processors for multiprocessing
#strip() removes whatever line ending the shell produced ('\n' or '\r\n'); slicing off a
#fixed two characters would eat a real character of the hostname when only '\n' is present
PC_name_str = PC_name.stdout.strip()

#on 'science12' the worker count is pinned to 42; elsewhere every available core is used
if PC_name_str == 'science12':
    cores = 42
else:
    cores = multi.cpu_count()

print(PC_name_str)
print(cores)

science12
42


**Defining constants**

In [28]:
omega_m = 0.272            #matter density parameter from Komatsu et al. (2011) 
omega_l = 1 - omega_m      #vacuum density parameter assuming flat universe
H_o = 70.4                 #Hubble constant in km s^-1 Mpc^-1 from Komatsu et al. (2011) 
h = 0.704                  #dimensionless Hubble parameter, i.e. H_o / (100 km s^-1 Mpc^-1)
f = omega_m**0.545         #linear velocity growth rate from Tanimura et al. (2020)


**Defining Functions**

In [29]:
def H(z):
    """Return the Hubble parameter at redshift ``z`` in km s^-1 Mpc^-1.

    Uses the module-level constants ``H_o``, ``omega_m`` and ``omega_l``
    (Eq. 4.33 in Peter's book, as cited by the original author).
    """
    density_term = (1+z)**3 * omega_m + omega_l
    return np.sqrt( H_o**2 * density_term )


**Reading galaxies dataset**

In [30]:

#redshift used by the later cells (scale factor and H(z))
z = 0.42372720

#alternative data location kept for reference:
# data_address = '../input/magneticumsnap027z042-massfiltered/'
data_address = 'Data/'

#massive_galaxies.csv contains galaxies with mass greater than 1.8 x 10^11 h^-1 M_sun,
#as done by Tanimura et al. (2020)
galaxies_csv = data_address + 'massive_galaxies.csv'
df_gal = pd.read_csv(galaxies_csv)
df_gal.describe()

Unnamed: 0,x[kpc/h],y[kpc/h],z[kpc/h],m[Msol/h]
count,93097.0,93097.0,93097.0,93097.0
mean,325576.133087,322500.028738,320613.762863,399320600000.0
std,184863.27396,182799.587421,183249.915097,399512900000.0
min,1.864417,10.164207,0.260974,180001000000.0
25%,163866.95,166201.38,162514.56,216148000000.0
50%,332449.62,329260.16,319925.78,278490000000.0
75%,483949.0,475466.12,478698.41,419293000000.0
max,639996.19,639980.94,639993.12,10774600000000.0


**Creating big simulation box (1920 x 1920 x 1920 h$^{-1}$Mpc) for galaxies**

In [31]:
#side length of the original simulation box in h^-1 kpc; the box is tiled 3 x 3 x 3
sim_box_side = 640000

#collect the 27 shifted copies first and concatenate once: DataFrame.append inside a loop
#re-copies the growing frame on every call and no longer exists in pandas >= 2.0
gal_tiles = []

for k in range(3):
    for i in range(3):
        for j in range(3):

            #assign() returns a fresh frame, so every tile owns its own shifted coordinates
            df_gal_temp = df_gal.assign(**{
                'x[kpc/h]': df_gal['x[kpc/h]'] + (sim_box_side * i),
                'y[kpc/h]': df_gal['y[kpc/h]'] + (sim_box_side * j),
                'z[kpc/h]': df_gal['z[kpc/h]'] + (sim_box_side * k),
            })

            gal_tiles.append(df_gal_temp)

#the original row index is kept (repeated once per tile), exactly as append() did
df_gal_big = pd.concat(gal_tiles)

df_gal_big.describe()

Unnamed: 0,x[kpc/h],y[kpc/h],z[kpc/h],m[Msol/h]
count,2513619.0,2513619.0,2513619.0,2513619.0
mean,965576.1,962500.0,960613.8,399320600000.0
std,554293.1,553608.3,553757.1,399510900000.0
min,1.864417,10.16421,0.2609741,180001000000.0
25%,483949.0,475466.1,478698.4,216148000000.0
50%,972449.6,969260.2,959925.8,278490000000.0
75%,1443867.0,1446201.0,1442515.0,419293000000.0
max,1919996.0,1919981.0,1919993.0,10774600000000.0


**Reading clusters dataset**

In [32]:
#massive_clusters.csv contains clusters with M_500c greater than 10^13.5 h^-1 M_sun,
#as done by Tanimura et al. (2020); only position, mass and velocity columns are loaded
cluster_columns = ['x[kpc/h]', 'y[kpc/h]', 'z[kpc/h]', 'm500c[Msol/h]', 'vx[km/s]', 'vy[km/s]', 'vz[km/s]']
clusters_csv = data_address + 'massive_clusters.csv'

df_clusters_orig = pd.read_csv(clusters_csv, usecols = cluster_columns)

df_clusters_orig.describe()

Unnamed: 0,x[kpc/h],y[kpc/h],z[kpc/h],m500c[Msol/h],vx[km/s],vy[km/s],vz[km/s]
count,6080.0,6080.0,6080.0,6080.0,6080.0,6080.0,6080.0
mean,326416.401838,322055.402286,321308.240244,63562450000000.0,2.109102,-1.647297,-4.296055
std,186021.550694,182650.899538,183665.85431,48516980000000.0,316.064539,306.226537,286.454388
min,19.163288,152.65388,87.370949,31623400000000.0,-1179.53,-1163.52,-1104.79
25%,160978.575,164793.43,161863.925,37950350000000.0,-200.229,-205.13925,-190.806
50%,338884.565,333274.345,322110.075,48016300000000.0,-3.61945,1.96945,-8.055475
75%,486227.7475,473556.765,478893.61,69455020000000.0,206.81675,207.75575,181.532
max,639887.56,639933.38,639848.38,743820000000000.0,1197.64,1164.96,1201.26


**Creating big simulation box (1920 x 1920 x 1920 h$^{-1}$Mpc) for clusters**

In [33]:
#side length of the original simulation box in h^-1 kpc; the box is tiled 3 x 3 x 3
sim_box_side = 640000

#collect the 27 shifted copies first and concatenate once: DataFrame.append inside a loop
#re-copies the growing frame on every call and no longer exists in pandas >= 2.0
cluster_tiles = []

for k in range(3):
    for i in range(3):
        for j in range(3):

            #assign() returns a fresh frame, so every tile owns its own shifted coordinates
            df_clusters_temp = df_clusters_orig.assign(**{
                'x[kpc/h]': df_clusters_orig['x[kpc/h]'] + (sim_box_side * i),
                'y[kpc/h]': df_clusters_orig['y[kpc/h]'] + (sim_box_side * j),
                'z[kpc/h]': df_clusters_orig['z[kpc/h]'] + (sim_box_side * k),
            })

            cluster_tiles.append(df_clusters_temp)

#the original row index is kept (repeated once per tile), exactly as append() did
df_clusters_big = pd.concat(cluster_tiles)

df_clusters_big.describe()

Unnamed: 0,x[kpc/h],y[kpc/h],z[kpc/h],m500c[Msol/h],vx[km/s],vy[km/s],vz[km/s]
count,164160.0,164160.0,164160.0,164160.0,164160.0,164160.0,164160.0
mean,966416.4,962055.4,961308.2,63562450000000.0,2.109102,-1.647297,-4.296055
std,554677.3,553556.1,553891.8,48513140000000.0,316.039508,306.202286,286.431703
min,19.16329,152.6539,87.37095,31623400000000.0,-1179.53,-1163.52,-1104.79
25%,486336.6,473606.2,478927.8,37950350000000.0,-200.229,-205.13925,-190.806
50%,978884.6,973274.3,962110.1,48016300000000.0,-3.61945,1.96945,-8.055475
75%,1440918.0,1444771.0,1441655.0,69455020000000.0,206.81675,207.75575,181.532
max,1919888.0,1919933.0,1919848.0,743820000000000.0,1197.64,1164.96,1201.26


**Extracting clusters present in central region from 640 h$^{-1}$Mpc to 1280 h$^{-1}$Mpc**

In [34]:
#bounds (h^-1 kpc) of the central tile of the 3 x 3 x 3 box
low_bound = 640000
upp_bound = 640000 * 2  #1280000

#a cluster is kept only when all three coordinates lie strictly between the bounds
in_center = np.ones(len(df_clusters_big), dtype=bool)
for axis_col in ('x[kpc/h]', 'y[kpc/h]', 'z[kpc/h]'):
    coord = df_clusters_big[axis_col]
    in_center &= ((coord > low_bound) & (coord < upp_bound)).to_numpy()

df_clusters_center = df_clusters_big[in_center]

df_clusters_center.describe()

Unnamed: 0,x[kpc/h],y[kpc/h],z[kpc/h],m500c[Msol/h],vx[km/s],vy[km/s],vz[km/s]
count,6080.0,6080.0,6080.0,6080.0,6080.0,6080.0,6080.0
mean,966416.4,962055.4,961308.2,63562450000000.0,2.109102,-1.647297,-4.296055
std,186021.6,182650.9,183665.9,48516980000000.0,316.064539,306.226537,286.454388
min,640019.2,640152.7,640087.4,31623400000000.0,-1179.53,-1163.52,-1104.79
25%,800978.6,804793.4,801863.9,37950350000000.0,-200.229,-205.13925,-190.806
50%,978884.6,973274.3,962110.1,48016300000000.0,-3.61945,1.96945,-8.055475
75%,1126228.0,1113557.0,1118894.0,69455020000000.0,206.81675,207.75575,181.532
max,1279888.0,1279933.0,1279848.0,743820000000000.0,1197.64,1164.96,1201.26


**Adding galaxy pads at the edges of central cluster region**

In [35]:

def edge_pads_adder(clus_cube_size):
    """Return the galaxies of ``df_gal_big`` in the central tile plus a pad around it.

    The pad is half of ``clus_cube_size`` (integer division) on every side of the
    640000..1280000 h^-1 kpc range, and the bounds are inclusive on both ends.
    """
    pad = clus_cube_size//2
    lower = 640000 - pad
    upper = (640000*2) + pad

    #start from "keep everything" and narrow the selection one axis at a time
    keep = np.ones(len(df_gal_big), dtype=bool)
    for axis_col in ('x[kpc/h]', 'y[kpc/h]', 'z[kpc/h]'):
        coord = df_gal_big[axis_col]
        keep &= ((coord >= lower) & (coord <= upper)).to_numpy()

    return df_gal_big[keep]
    

**Specifying prefactors for Eq. 1 of Tanimura et al. (2020)**

In [36]:
#scale factor corresponding to redshift z
a = 1/(1+z)

#evaluate the Hubble parameter once, show it, and reuse it in the prefactor
hubble_at_z = H(z)
print(hubble_at_z)

#f a H / (4 pi), in km s^-1 Mpc^-1
pre_fac = (f * a * hubble_at_z / (4 * np.pi))
pre_fac

86.5938062370014


2.380620866668027

**Calculating mean density of the simulation box for Eq. 1**

In [37]:
def delta_gal_mean_func(cell_size, df_gal=None, sim_box_size=640000 * 3):
    """Return the mean number of galaxies per cell of the gridded simulation box.

    The box is divided into ``(sim_box_size // cell_size)**3`` cells. The mean of the
    per-cell counts equals the galaxy count divided by the number of cells, so it is
    computed directly. This avoids allocating the full count grid (960**3 float64
    values for a cell size of 2000) and looping over every galaxy in Python.

    Parameters
    ----------
    cell_size : int
        Cell side length, in the same units as the coordinate columns (h^-1 kpc).
    df_gal : pandas.DataFrame, optional
        Frame with 'x[kpc/h]', 'y[kpc/h]', 'z[kpc/h]' columns. Defaults to the
        module-level ``df_gal_big``.
    sim_box_size : int, optional
        Side length of the box. Defaults to the 3 x 640000 h^-1 kpc tiled box.

    Returns
    -------
    numpy.float64
        Mean number of galaxies per cell.

    Raises
    ------
    IndexError
        If a galaxy's cell index falls outside the grid. The previous implementation
        raised the same exception type when it indexed the count array.
    """
    if df_gal is None:
        df_gal = df_gal_big

    cells_per_side = sim_box_size // cell_size

    #cell index of each galaxy: coordinate / cell_size, truncated toward zero
    coords = df_gal[['x[kpc/h]', 'y[kpc/h]', 'z[kpc/h]']].to_numpy(dtype=float)
    cell_idx = (coords / cell_size).astype(int)

    #mirror numpy indexing rules: valid indices are -cells_per_side .. cells_per_side - 1
    if ((cell_idx >= cells_per_side) | (cell_idx < -cells_per_side)).any():
        raise IndexError(
            f'galaxy cell index out of bounds for a grid with {cells_per_side} cells per side')

    delta_gal_mean = np.float64(len(df_gal)) / cells_per_side**3

    return delta_gal_mean

**Calculating Overdensity field for Eq. 1**

In [38]:
b = 2                                #bias for LOWZ & CMASS galaxies as taken by Tanimura et al. 2020

def overdensity_field_calc(clus_x, clus_y, clus_z, delta_gal_mean, cell_size, sigma_in_pix, clus_cube_size, df_gal_padded):
    """Return the smoothed matter overdensity grid in a cube centred on one cluster.

    Parameters
    ----------
    clus_x, clus_y, clus_z : float or str
        Cluster position; converted with ``float()``.
    delta_gal_mean : float
        Mean number of galaxies per cell, used to normalise the counts.
    cell_size : int
        Side length of one grid cell (same units as the coordinates).
    sigma_in_pix : float
        Standard deviation of the Gaussian smoothing kernel, in cells.
    clus_cube_size : int
        Side length of the cube placed around the cluster.
    df_gal_padded : pandas.DataFrame
        Galaxies with 'x[kpc/h]', 'y[kpc/h]', 'z[kpc/h]' columns.

    Returns
    -------
    numpy.ndarray
        Array of shape (n, n, n) with n = clus_cube_size // cell_size, holding
        the smoothed galaxy overdensity divided by the bias ``b``.
    """
    #the coordinates may arrive as strings, so force them to floats
    clus_x = float(clus_x); clus_y = float(clus_y); clus_z = float(clus_z)

    half_cube = clus_cube_size//2
    cells_per_side = clus_cube_size//cell_size

    #opposite corners of the cube around the cluster
    lower_corner = np.array([clus_x - half_cube, clus_y - half_cube, clus_z - half_cube])
    upper_corner = np.array([clus_x + half_cube, clus_y + half_cube, clus_z + half_cube])

    #keep the galaxies lying strictly inside the cube (bounds are exclusive)
    coords = df_gal_padded[['x[kpc/h]', 'y[kpc/h]', 'z[kpc/h]']].to_numpy(dtype=float)
    inside = ((coords > lower_corner) & (coords < upper_corner)).all(axis=1)

    #shift the cube to start at the origin, then divide by the cell size and truncate
    #to get integer cell coordinates
    cell_idx = ((coords[inside] - lower_corner) / cell_size).astype(int)

    #counting number of galaxies in the cells; np.add.at accumulates repeated indices,
    #which a plain fancy-indexed "+=" would silently count only once
    gals_in_cell = np.zeros((cells_per_side, cells_per_side, cells_per_side))
    np.add.at(gals_in_cell, (cell_idx[:, 0], cell_idx[:, 1], cell_idx[:, 2]), 1)

    #determining the overdensity of galaxies
    delta_gal = (gals_in_cell/delta_gal_mean) - 1

    #smoothing the overdensity of galaxies
    delta_gal_smooth = gaussian_filter(delta_gal, sigma = sigma_in_pix)

    #obtaining matter overdensity from galaxies overdensity
    delta_matter = delta_gal_smooth / b

    return delta_matter


**Calculating differential, numerator & denominator for Eq. 1**

In [39]:
def vel_terms_calc(cell_size, clus_cube_size):
    """Build the geometry terms of Eq. 1 that are the same for every cluster.

    The cluster sits at the centre of a cube of side ``clus_cube_size`` that is
    divided into cells of side ``cell_size``.

    Returns
    -------
    tuple
        ``(dy_cubed, numer_x, numer_y, numer_z, denom)``: the cell volume, the
        x/y/z offsets of every cell centre from the cluster as (n, n, n) arrays,
        and the cube of the cell-to-cluster distance.
    """
    cells_per_side = clus_cube_size//cell_size
    half_cube = clus_cube_size//2

    #the differential in Eq. 1 is the volume of one cell
    dy_cubed = cell_size**3

    #offset of each cell centre from the cluster along a single axis
    offsets = np.zeros(cells_per_side)
    for idx, centre in enumerate(range(cell_size//2, clus_cube_size, cell_size)):
        offsets[idx] = centre - half_cube

    #expand the 1-D offsets into three full grids (direction terms of the numerator)
    numer_x, numer_y, numer_z = np.meshgrid(offsets, offsets, offsets, indexing='ij')

    #denominator of Eq. 1 of Tanimura et al. 2020: distance cubed
    denom = np.sqrt(numer_x**2 + numer_y**2 + numer_z**2)**(3)

    return (dy_cubed, numer_x, numer_y, numer_z, denom)


**Calculating velocity of clusters according to Eq. 1**

In [40]:
def clus_velocity_calc(clus_x, clus_y, clus_z, delta_gal_mean, cell_size, vel_terms, sigma_in_pix, 
                       clus_cube_size, df_gal_padded):
    """Estimate the velocity components of one cluster from the density field (Eq. 1).

    ``vel_terms`` is the tuple returned by ``vel_terms_calc``; the remaining
    arguments are passed on to ``overdensity_field_calc``. The module-level
    ``h`` and ``pre_fac`` are used for the unit conversion and the prefactor.

    Returns
    -------
    tuple
        ``(vx_est, vy_est, vz_est)``.
    """
    delta_matter = overdensity_field_calc(clus_x, clus_y, clus_z, delta_gal_mean, cell_size, sigma_in_pix, 
                                          clus_cube_size, df_gal_padded)

    dy_cubed, numer_x, numer_y, numer_z, denom = vel_terms

    #dividing an h^-1 kpc offset by (h*1e3) leaves the integrand in units of Mpc
    kpc_h_to_mpc = h*1e3

    #the part of the integrand shared by all three directions
    weighted_density = dy_cubed * delta_matter

    #same sum for x, y and z; only the direction term changes
    vx_est, vy_est, vz_est = (
        pre_fac * np.sum(weighted_density * (numer/kpc_h_to_mpc) / denom)
        for numer in (numer_x, numer_y, numer_z)
    )

    return(vx_est, vy_est, vz_est)


**Calculating sigma for smoothing**

In [41]:
def sigma_calc(cell_size):
    """Return the Gaussian smoothing sigma, in cells, for the given cell size.

    The kernel FWHM is fixed at 2000 h^-1 kpc (taken by Tanimura et al. 2020);
    it is converted to cells and then divided by 2.35482 to obtain sigma.
    """
    fwhm_kpc = 2000           #h^-1 kpc
    fwhm_per_sigma = 2.35482  #FWHM / sigma factor used by the original code

    return (fwhm_kpc/cell_size)/fwhm_per_sigma

**Plotting the scatter plots & histograms for assessment of velocity estimates**

In [42]:
def plotting_func(df_clusters_est_err, cell_size, clus_cube_size_actual):
    """Save a 3 x 2 figure comparing true and estimated cluster velocities.

    Each row handles one velocity component (x, y, z): the left panel is a
    scatter plot of true versus estimated velocity, the right panel a histogram
    of the error (true - estimated) annotated with cell size, cube size, mean,
    SD and Pearson's r.

    Parameters
    ----------
    df_clusters_est_err : pandas.DataFrame
        Needs the columns 'v?[km/s]' and 'v?_est[km/s]' for ? in x, y, z.
    cell_size : int
        Cell size shown in the annotation and used in the file name.
    clus_cube_size_actual : int
        Cube size shown in the annotation and used as the output sub-directory.
        (The annotation previously read a global named ``clus_cube_size``; it now
        uses this argument so the function no longer depends on notebook state.)

    The figure is written to
    'Plots/big-sim-box/v_scatter_hist/<clus_cube_size_actual>/v_scatter_hist_<cell_size>.png'
    and then closed. The module-level ``dpi`` sets the resolution.
    """
    fig, axes = plt.subplots(3, 2, dpi = dpi, figsize = (10,12), facecolor=(1, 1, 1))

    for row, comp in enumerate(('x', 'y', 'z')):

        v_true = df_clusters_est_err[f'v{comp}[km/s]']
        v_est = df_clusters_est_err[f'v{comp}_est[km/s]']
        error = v_true - v_est

        #left panel: true vs estimated velocity
        ax_scatter = axes[row, 0]
        ax_scatter.scatter(v_true, v_est, s = 8)
        #raw strings keep '\mathrm' intact without relying on an invalid '\m' escape
        ax_scatter.set_xlabel(r'V$_\mathrm{' + comp + r', true}$ (km/s)')
        ax_scatter.set_ylabel(r'V$_\mathrm{' + comp + r', estimated}$ (km/s)')
        ax_scatter.set_xticks(range(-2000, 2001, 1000))
        ax_scatter.set_yticks(range(-2000, 2001, 1000))
        ax_scatter.set_ylim(-2000,2000)
        ax_scatter.set_aspect('equal', adjustable='box')
        ax_scatter.set_title(f'V{comp} - scatter plot')
        ax_scatter.text(0.6, 0.15, f'Clusters: {len(df_clusters_est_err)}', transform=ax_scatter.transAxes)

        #right panel: histogram of the error
        ax_hist = axes[row, 1]
        ax_hist.hist(error, bins = 100)
        ax_hist.set_xlabel(r'Error in V$_{\mathrm{' + comp + r'}}$ (km/s)')
        ax_hist.set_ylabel('Number of clusters')

        #centre the histogram on zero; taking the larger of the two automatic limits
        #guarantees that neither tail is cut off
        x_low, x_high = ax_hist.get_xlim()
        half_width = max(abs(x_low), abs(x_high))
        ax_hist.set_xlim(-half_width, half_width)

        ax_hist.text(0.57, 0.90, rf'Cell size: {cell_size} h$^{{-1}}$kpc', transform=ax_hist.transAxes)
        ax_hist.text(0.57, 0.83, rf'Cube size: {clus_cube_size_actual} h$^{{-1}}$kpc', transform=ax_hist.transAxes)
        ax_hist.text(0.6, 0.76, f'Mean: {round(np.mean(error), 1)} km/s', transform=ax_hist.transAxes)
        ax_hist.text(0.6, 0.69, f'SD: {round(np.std(error), 1)} km/s', transform=ax_hist.transAxes)
        r_val = np.corrcoef(v_true, v_est)[1,0]
        ax_hist.text(0.6, 0.62, f'Pearson\'s r: {round(r_val, 2)}', transform=ax_hist.transAxes)

        ax_hist.set_title(f'V{comp} error - histogram')

    #makedirs creates missing parent folders and does not complain if the folder exists
    out_dir = f'Plots/big-sim-box/v_scatter_hist/{clus_cube_size_actual}'
    os.makedirs(out_dir, exist_ok=True)

    fig.subplots_adjust(top = 0.9, hspace = 0.4, wspace = 0.3)
    fig.tight_layout()
    fig.savefig(f'{out_dir}/v_scatter_hist_{cell_size}.png')
    plt.close(fig)

**Examining the effects of cluster cube & cell sizes variation on velocity estimates**

In [43]:
%%time

cube_cell_size_assess = pd.DataFrame(columns=['Cell Size', 'Act Cube Size', 'Cube Size Set', 'Clusters', 'Mean - Vx', 'Mean - Vy', 
                                              'Mean - Vz', 'SD - Vx', 'SD - Vy', 'SD - Vz', 'r - Vx', 'r - Vy','r - Vz'])

cube_cell_size_assess_no_round = pd.DataFrame(columns=['Cell Size', 'Act Cube Size', 'Cube Size Set', 'Clusters', 'Mean - Vx', 'Mean - Vy', 
                                              'Mean - Vz', 'SD - Vx', 'SD - Vy', 'SD - Vz', 'r - Vx', 'r - Vy','r - Vz'])

for clus_cube_size in [160000, 200000, 240000, 280000, 320000]:
    for cell_size in [2000, 4000, 5000, 8000, 10000, 20000]:

# for clus_cube_size in [280000]:
#     for cell_size in [10000]:                       #h^-1 kpc, size of pixel or cell
        
        print(cell_size, clus_cube_size)
                
        no_of_cells = clus_cube_size//cell_size
        
        clus_cube_size_actual = clus_cube_size
        
        if no_of_cells % 2 != 0:
            clus_cube_size = clus_cube_size + cell_size
        
        sigma_in_pix = sigma_calc(cell_size)

        df_gal_padded = edge_pads_adder(clus_cube_size)

        delta_gal_mean = delta_gal_mean_func(cell_size)

        vel_terms = vel_terms_calc(cell_size, clus_cube_size)
        
        no_of_clus = len(df_clusters_center)

        clus_param = list(zip(df_clusters_center['x[kpc/h]'], df_clusters_center['y[kpc/h]'], df_clusters_center['z[kpc/h]'], 
                              [delta_gal_mean]*no_of_clus, [cell_size]*no_of_clus, 
                              [vel_terms]*no_of_clus, [sigma_in_pix]*no_of_clus,
                              [clus_cube_size]*no_of_clus, [df_gal_padded]*no_of_clus))

        pool = multi.Pool(processes = cores)
        v_est = pool.starmap(clus_velocity_calc, clus_param)

        df_clusters_est_err = df_clusters_center.copy()

        df_clusters_est_err['vx_est[km/s]'] = [i[0] for i in v_est]
        df_clusters_est_err['vy_est[km/s]'] = [i[1] for i in v_est]
        df_clusters_est_err['vz_est[km/s]'] = [i[2] for i in v_est]

        df_clusters_est_err['vx_err[km/s]'] = df_clusters_est_err['vx[km/s]'] - df_clusters_est_err['vx_est[km/s]']
        df_clusters_est_err['vy_err[km/s]'] = df_clusters_est_err['vy[km/s]'] - df_clusters_est_err['vy_est[km/s]']
        df_clusters_est_err['vz_err[km/s]'] = df_clusters_est_err['vz[km/s]'] - df_clusters_est_err['vz_est[km/s]']
        
        plotting_func(df_clusters_est_err, cell_size, clus_cube_size_actual)
        
        clear_output(wait=True)

        cube_cell_size_assess = cube_cell_size_assess.append({'Cell Size': cell_size,
                'Act Cube Size': clus_cube_size_actual,
                'Cube Size Set': clus_cube_size,
                'Clusters': no_of_clus,
                'Mean - Vx': round(np.mean(df_clusters_est_err['vx_err[km/s]']),1), 
                'Mean - Vy': round(np.mean(df_clusters_est_err['vy_err[km/s]']),1), 
                'Mean - Vz': round(np.mean(df_clusters_est_err['vz_err[km/s]']),1), 
                'SD - Vx': round(np.std(df_clusters_est_err['vx_err[km/s]']),1), 
                'SD - Vy': round(np.std(df_clusters_est_err['vy_err[km/s]']),1),
                'SD - Vz': round(np.std(df_clusters_est_err['vz_err[km/s]']),1), 
                'r - Vx': round(np.corrcoef(df_clusters_center['vx[km/s]'], df_clusters_est_err['vx_est[km/s]'])[1,0],2),
                'r - Vy': round(np.corrcoef(df_clusters_center['vy[km/s]'], df_clusters_est_err['vy_est[km/s]'])[1,0],2),
                'r - Vz': round(np.corrcoef(df_clusters_center['vz[km/s]'], df_clusters_est_err['vz_est[km/s]'])[1,0],2)}, 
                ignore_index=True)
        
        cube_cell_size_assess_no_round = cube_cell_size_assess_no_round.append({'Cell Size': cell_size,
                'Act Cube Size': clus_cube_size_actual,
                'Cube Size Set': clus_cube_size,
                'Clusters': no_of_clus,
                'Mean - Vx': np.mean(df_clusters_est_err['vx_err[km/s]']), 
                'Mean - Vy': np.mean(df_clusters_est_err['vy_err[km/s]']), 
                'Mean - Vz': np.mean(df_clusters_est_err['vz_err[km/s]']), 
                'SD - Vx': np.std(df_clusters_est_err['vx_err[km/s]']),
                'SD - Vy': np.std(df_clusters_est_err['vy_err[km/s]']),
                'SD - Vz': np.std(df_clusters_est_err['vz_err[km/s]']), 
                'r - Vx': np.corrcoef(df_clusters_center['vx[km/s]'], df_clusters_est_err['vx_est[km/s]'])[1,0],
                'r - Vy': np.corrcoef(df_clusters_center['vy[km/s]'], df_clusters_est_err['vy_est[km/s]'])[1,0],
                'r - Vz': np.corrcoef(df_clusters_center['vz[km/s]'], df_clusters_est_err['vz_est[km/s]'])[1,0]}, 
                ignore_index=True)
        
        clus_cube_size = clus_cube_size_actual
        

CPU times: user 5min 20s, sys: 2min 54s, total: 8min 14s
Wall time: 12min 20s


**Writing & viewing the assessment table**

In [46]:
#to_csv does not create folders, so make sure the target directory exists first
tables_dir = 'Tables/big-sim-box'
os.makedirs(tables_dir, exist_ok=True)

#full-precision table and the rounded table, both tab-separated
cube_cell_size_assess_no_round.to_csv(f'{tables_dir}/cube_cell_size_assess_big-sim-box_unrounded.tsv', index = False, sep = '\t')
cube_cell_size_assess.round(2).to_csv(f'{tables_dir}/cube_cell_size_assess_big-sim-box_rounded.tsv', index = False, sep = '\t')
cube_cell_size_assess

Unnamed: 0,Cell Size,Act Cube Size,Cube Size Set,Clusters,Mean - Vx,Mean - Vy,Mean - Vz,SD - Vx,SD - Vy,SD - Vz,r - Vx,r - Vy,r - Vz
0,2000.0,160000.0,160000.0,6080.0,-732.8,-716.6,-722.4,910.2,902.7,906.6,0.35,0.34,0.39
1,4000.0,160000.0,160000.0,6080.0,-237.4,-230.0,-231.0,416.0,416.9,405.2,0.58,0.57,0.63
2,5000.0,160000.0,160000.0,6080.0,-151.1,-147.7,-148.1,319.3,323.3,304.7,0.65,0.64,0.7
3,8000.0,160000.0,160000.0,6080.0,-59.8,-59.1,-59.8,218.8,223.0,189.5,0.75,0.72,0.8
4,10000.0,160000.0,160000.0,6080.0,-38.5,-39.3,-39.8,200.3,203.7,165.7,0.78,0.75,0.82
5,20000.0,160000.0,160000.0,6080.0,-7.3,-11.3,-13.3,206.3,204.8,173.6,0.78,0.77,0.81
6,2000.0,200000.0,200000.0,6080.0,-732.7,-716.1,-722.0,908.5,900.7,904.8,0.36,0.35,0.4
7,4000.0,200000.0,200000.0,6080.0,-237.3,-229.5,-230.7,412.0,412.2,402.2,0.59,0.58,0.64
8,5000.0,200000.0,200000.0,6080.0,-151.0,-147.2,-147.8,313.9,317.2,300.9,0.67,0.66,0.72
9,8000.0,200000.0,208000.0,6080.0,-59.6,-58.6,-59.4,209.4,212.2,183.0,0.77,0.75,0.81


**Reshaping the Pearson's r for Vz column into an easier-to-read form**

In [21]:
#one column per cell size, one row per actual cube size, values are Pearson's r for Vz
r_Vz_table = None

for cell, rows in cube_cell_size_assess.groupby("Cell Size"):
    #single-column frame for this cell size, indexed by cube size and named after the cell size
    column = rows.set_index("Act Cube Size")[["r - Vz"]].rename(columns={"r - Vz":cell})
    r_Vz_table = column if r_Vz_table is None else r_Vz_table.join(column)

#no groups at all: fall back to an empty frame
if r_Vz_table is None:
    r_Vz_table = pd.DataFrame()

r_Vz_table

Unnamed: 0_level_0,2000.0,4000.0,5000.0,8000.0,10000.0,20000.0
Act Cube Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
160000.0,0.39,0.63,0.7,0.8,0.82,0.81
200000.0,0.4,0.64,0.72,0.81,0.84,0.82
240000.0,0.4,0.65,0.72,0.82,0.85,0.83
280000.0,0.41,0.65,0.73,0.82,0.85,0.83
320000.0,0.41,0.65,0.73,0.82,0.85,0.83


**Plotting the mean, SD & r response due to variation of cell sizes for a given cluster cube size**

In [22]:
#savefig does not create folders, so make sure the target directory exists first
cell_size_exam_dir = 'Plots/big-sim-box/cell_size_exam'
os.makedirs(cell_size_exam_dir, exist_ok=True)

#(column prefix, y-axis label, title prefix) for the three stacked panels
cell_exam_panels = [('Mean', 'Mean of error (km/s)', 'Mean of velocity error'),
                    ('SD', 'SD of error (km/s)', 'SD of velocity error'),
                    ('r', 'Pearson\'s r', "Pearson's r")]

for clus_cube_size in [160000, 200000, 240000, 280000, 320000]:

    #select the rows of this cube size once instead of re-filtering for every line
    cube_rows = cube_cell_size_assess[cube_cell_size_assess['Act Cube Size'] == clus_cube_size]
    if cube_rows.empty:
        continue    #nothing was computed for this cube size

    #positions and tick labels come from the data, so a table with more or fewer
    #cell sizes still plots instead of failing on a length mismatch
    x_axis = np.arange(len(cube_rows))
    cell_size_labels = cube_rows['Cell Size'].astype(int).tolist()
    no_of_clus = int(cube_rows['Clusters'].iloc[0])

    fig, axes = plt.subplots(3, 1, dpi = dpi, figsize = (5,9), facecolor=(1, 1, 1))

    for ax, (stat, y_label, title) in zip(axes, cell_exam_panels):
        for comp in ('x', 'y', 'z'):
            ax.plot(x_axis, cube_rows[f'{stat} - V{comp}'], label = f'V{comp}')
        ax.legend()
        ax.set_xlabel(r'Cell size (h$^{-1}$kpc)')
        ax.set_ylabel(y_label)
        ax.set_xticks(x_axis)
        ax.set_xticklabels(cell_size_labels)
        ax.set_title(rf'{title} | Cube: {int(clus_cube_size/1e3)} h$^{{-1}}$Mpc | Clusters: {no_of_clus}', fontsize = 10)

    fig.subplots_adjust(hspace = 0.6)
    fig.tight_layout()
    fig.savefig(f'{cell_size_exam_dir}/clus_box_{clus_cube_size}.png')
    plt.close(fig)

**Plotting the mean, SD & r response due to variation of cluster cube sizes for a given cell size**

In [23]:

#savefig does not create folders, so make sure the target directory exists first
cube_size_exam_dir = 'Plots/big-sim-box/clus_bos_size_exam'
os.makedirs(cube_size_exam_dir, exist_ok=True)

#(column prefix, y-axis label, title prefix, fixed y-limits) for the three stacked panels;
#the fixed limits keep the figures of different cell sizes directly comparable
#(remove the set_ylim call below to let matplotlib choose free limits instead)
cube_exam_panels = [('Mean', 'Mean of error (km/s)', 'Mean of velocity error', (-750, 50)),
                    ('SD', 'SD of error (km/s)', 'SD of velocity error', (100, 950)),
                    ('r', 'Pearson\'s r', "Pearson's r", (0.30, 0.90))]

for cell_size in [2000, 4000, 5000, 8000, 10000, 20000]:

    #select the rows of this cell size once instead of re-filtering for every line
    cell_rows = cube_cell_size_assess[cube_cell_size_assess['Cell Size'] == cell_size]
    if cell_rows.empty:
        continue    #nothing was computed for this cell size

    #positions and tick labels come from the data, so a table with more or fewer
    #cube sizes still plots instead of failing on a length mismatch
    x_axis = np.arange(len(cell_rows))
    cube_size_labels = cell_rows['Act Cube Size'].astype(int).tolist()
    no_of_clus = len(df_clusters_center)

    fig, axes = plt.subplots(3, 1, dpi = dpi, figsize = (5,10), facecolor=(1, 1, 1))

    for ax, (stat, y_label, title, y_limits) in zip(axes, cube_exam_panels):
        for comp in ('x', 'y', 'z'):
            ax.plot(x_axis, cell_rows[f'{stat} - V{comp}'], label = f'V{comp}')
        ax.legend()
        ax.set_xlabel(r'Cube size (h$^{-1}$kpc)')
        ax.set_ylabel(y_label)
        ax.set_xticks(x_axis)
        ax.set_xticklabels(cube_size_labels)
        ax.set_ylim(*y_limits)
        ax.set_title(rf'{title} | Cell: {int(cell_size/1e3)} h$^{{-1}}$Mpc | Clusters: {no_of_clus}', fontsize = 10)

    fig.subplots_adjust(hspace = 0.7)
    fig.tight_layout()
    fig.savefig(f'{cube_size_exam_dir}/cell_size_{cell_size}.png')
    plt.close(fig)

**Finding the minimum value of mean & SD, and maximum value of r**

In [24]:
#smallest absolute mean error and smallest SD, one line per velocity component
for column in ('Mean - Vx', 'Mean - Vy', 'Mean - Vz', 'SD - Vx', 'SD - Vy', 'SD - Vz'):
    print(cube_cell_size_assess[column].abs().min())

#largest Pearson's r, one line per velocity component
for column in ('r - Vx', 'r - Vy', 'r - Vz'):
    print(cube_cell_size_assess[column].max())

6.7
10.4
12.7
173.9
173.7
153.9
0.84
0.84
0.85


**Conclusions**  
Similar to original simulation box, 
1. All three velocity components behave similarly, so any one of them can be taken from now onwards as the line-of-sight velocity. Pearson's r is highest (0.85) for V$_\mathrm{z}$, so V$_\mathrm{z}$ may be preferred.
2. The code to estimate velocity seems to work fine since:  
(a) mean error in V$_\mathrm{x}$ goes to 6.7 km/s for cell size of 20,000 h$^\mathrm{-1}$kpc & cluster box size of 320,000 h$^\mathrm{-1}$kpc (SD is 179 km/s & r is 0.84 for this case)   
(b) standard deviation of error in V$_\mathrm{z}$ goes to 154 km/s for cell size of 10,000 h$^\mathrm{-1}$kpc & cluster box size of 320,000 h$^\mathrm{-1}$kpc (Mean is -38 km/s & r is 0.85 for this case)  
(c) Pearson's r goes to 0.85 for V$_\mathrm{z}$ estimates coming from cell size of 10,000 h$^\mathrm{-1}$kpc & cluster box size of 320,000 h$^\mathrm{-1}$kpc (Mean is -38 km/s & SD is 154 km/s for this case)
3. A cell size of 10,000 h$^\mathrm{-1}$kpc should be chosen since it gives the least SD & the maximum r.
4. Cluster box size does not make much difference, but higher values do give a slightly better mean, SD and r. A value of 320,000 h$^\mathrm{-1}$kpc may be chosen since it gives an r of 0.85.