We check the data structure; write a function to load it as a dataframe;
and make cut and compare to the real ones.

**1. Data structure**

In [2]:
import h5py

def h5ls(f):
    if type(f) == h5py._hl.dataset.Dataset:
        print(f, f[:10])
    else:
        print(f.keys())
        for key in f.keys():
            h5ls(f[key])

with h5py.File('../data/mock_lightcone/halo_lightcone_catalogue/halo_properties_in_lightcones.hdf5', 'r') as f:
    h5ls(f)

<KeysViewHDF5 ['lightcone0000']>
<KeysViewHDF5 ['GalaxyID', 'GasMass', 'LX0InRestframeWithoutRecentAGNHeating', 'LX0InRestframeWithoutRecentAGNHeatingCoreExcision', 'LX0WithoutRecentAGNHeating', 'LX0WithoutRecentAGNHeatingCoreExcision', 'M500', 'M_fof_lc', 'MfofSOAP', 'SOAPID', 'SpectroscopicLikeTemperatureWithoutRecentAGNHeatingCoreExcision', 'TopLeafID', 'Vx', 'Vy', 'Vz', 'Y5R500WithoutRecentAGNHeating', 'phi_on_lc', 'redshift', 'snap_num', 'theta_on_lc', 'x_lc', 'y_lc', 'z_lc']>
<HDF5 dataset "GalaxyID": shape (134286,), type "<i8"> [925930949 496435074  79992837 852524320 817871995 504508668 837145724
 519087878 352882225 718970137]
<HDF5 dataset "GasMass": shape (134286,), type "<f4"> [1.5762530e+12 2.1125870e+11 3.9621073e+11 2.4615531e+11 1.5784005e+11
 3.7924561e+12 6.9041599e+11 9.6636764e+10 1.8253611e+11 2.4642375e+11]
<HDF5 dataset "LX0InRestframeWithoutRecentAGNHeating": shape (134286,), type "<f8"> [3.74582830e+42 5.15868068e+41 4.65506278e+41 4.97492820e+41
 1.94981796e+

In [9]:
# revert the file to before _d_comebine_lightcone
with h5py.File('../data/halo_properties_in_lightcones.hdf5', 'a') as f:
    for name in f['lightcone0000'].keys():
        if name[:4] != 'Snap':
            del f[f'lightcone0000/{name}']
   

**2. Load as a dataframe**

In [13]:
import numpy as np
import pandas as pd

def load_lightcone(filename, lightcone_num):
    """
    Return the observers x, y, z coord in cMpc, and a dataset containing
    the list of halo properties.

    Note
    --
    The coordinates here ranges from 0 to L instead of -L/2 to L/2, different to 
    """
    dict = {}
    with h5py.File(filename, 'r') as f:
        lc = f'lightcone{lightcone_num:04d}'
        for qty_key, qty_dataset in f[lc].items():
            if qty_key in dict.keys():
                dict[qty_key] = np.concatenate((dict[qty_key], qty_dataset[:]))
            else:
                dict[qty_key] = qty_dataset[:]

        Xobs = f[lc].attrs['Xobs']
        Yobs = f[lc].attrs['Yobs']
        Zobs = f[lc].attrs['Zobs']

    # for output
    XYZobs = np.array([Xobs, Yobs, Zobs])
    catalogue = pd.DataFrame(dict)
    return XYZobs, catalogue

obs_coord, cat = load_lightcone(filename='../data/halo_properties_in_lightcones.hdf5', lightcone_num=0)

In [14]:
obs_coord

array([750., 750., 750.])

In [15]:
cat 

Unnamed: 0,GalaxyID,GasMass,LX0InRestframeWithoutRecentAGNHeating,LX0InRestframeWithoutRecentAGNHeatingCoreExcision,LX0WithoutRecentAGNHeating,LX0WithoutRecentAGNHeatingCoreExcision,M500,M_fof_lc,MfofSOAP,SOAPID,...,Vy,Vz,Y5R500WithoutRecentAGNHeating,phi_on_lc,redshift,snap_num,theta_on_lc,x_lc,y_lc,z_lc
0,925930949,1.576253e+12,3.745828e+42,3.484491e+42,3.751273e+42,3.500825e+42,3.322587e+13,5.464225e+13,5.463198e+13,107540,...,-99.607788,-103.407776,7.872773e+42,35.079752,0.202633,73,46.005025,482.730114,499.969404,488.073594
1,496435074,2.112587e+11,5.158681e+41,4.287558e+41,4.947706e+41,4.110611e+41,6.854768e+12,1.448629e+13,1.448263e+13,108177,...,-112.380066,-77.380066,5.893691e+41,34.494315,0.204441,73,45.063492,498.558204,499.664384,485.014424
2,79992837,3.962107e+11,4.655063e+41,4.655063e+41,4.478116e+41,4.478116e+41,1.233515e+13,2.396008e+13,2.394874e+13,547901,...,-135.039307,-25.539307,1.510854e+42,35.126579,0.200162,73,-44.904141,486.268407,-484.644013,482.984137
3,852524320,2.461553e+11,4.974928e+41,3.062541e+41,4.852427e+41,2.912817e+41,1.017048e+13,1.634095e+13,1.633806e+13,548236,...,-276.651001,163.649048,9.119567e+41,35.031421,0.201908,73,-45.735446,483.717182,-496.297288,485.832832
4,817871995,1.578400e+11,1.949818e+41,1.949818e+41,1.844330e+41,1.844330e+41,6.923487e+12,1.178049e+13,1.178539e+13,548611,...,-116.151001,-83.850952,4.777564e+41,35.055800,0.200741,73,-45.266860,484.938657,-489.477113,483.460157
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134281,740659177,8.576513e+10,6.048519e+40,4.372628e+40,5.988970e+40,4.325840e+40,5.944235e+12,1.028671e+13,1.029074e+13,13331286,...,-59.374634,-71.474609,2.626980e+41,72.372207,0.030784,76,85.665792,3.080619,40.646339,128.285039
134282,472785622,1.288490e+11,5.912406e+40,3.287978e+40,5.844350e+40,3.198654e+40,6.622840e+12,1.059169e+13,1.059998e+13,13331331,...,-139.774597,-204.874634,2.188016e+41,62.347162,0.045958,76,16.013698,89.312552,25.633072,177.337552
134283,287556330,8.979166e+10,3.385810e+40,3.385810e+40,3.334767e+40,3.334767e+40,6.614250e+12,1.031581e+13,1.030792e+13,13331328,...,-183.374634,-34.674561,2.075722e+41,60.006044,0.038558,76,35.523963,68.434924,48.857354,145.675934
134284,69803936,7.865159e+10,3.772881e+40,2.543611e+40,3.696317e+40,2.488315e+40,5.729486e+12,1.025946e+13,1.025638e+13,13219519,...,-83.838013,101.361938,1.776274e+41,66.947945,0.031261,76,83.622434,5.940499,53.148599,125.672369


**3. Compare to our sample**

In [16]:
import pandas as pd
df = pd.read_hdf('../data/halo_properties_in_lightcone0.hdf5', key='lightcone', mode='r')
df

Unnamed: 0,lc_id,redshift,theta_on_lc,phi_on_lc,M_fof_lc,x_lc,y_lc,z_lc,snap_num,MfofSOAP,...,LX0WithoutRecentAGNHeating,LX0InRestframeWithoutRecentAGNHeating,LX0WithoutRecentAGNHeatingCoreExcision,LX0InRestframeWithoutRecentAGNHeatingCoreExcision,GasTemperatureWithoutRecentAGNHeatingCoreExcision,SpectroscopicLikeTemperatureWithoutRecentAGNHeatingCoreExcision,Y5R500WithoutRecentAGNHeating,Vx,Vy,Vz
0,13494138,0.289412,2.434740,46.845344,5.089802e+12,810.524940,864.491405,50.387261,71,5.093831e+12,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0.0,0.000000e+00,-13.873657,136.326300,-286.77368
1,13492844,0.288434,2.159398,47.195837,1.007841e+13,802.852393,866.875374,44.551823,71,1.008458e+13,...,5.257363e+40,7.749931e+40,4.423671e+40,6.142097e+40,2473984.0,2224128.0,2.715453e+41,-189.534910,68.665040,-146.43494
2,13493475,0.287379,2.838580,47.159344,6.340263e+12,800.310973,863.028035,58.359069,71,6.339372e+12,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0.0,0.000000e+00,-194.329100,32.970947,-198.32910
3,13492385,0.285515,2.281295,47.353490,1.101831e+13,792.932866,860.904376,46.626492,71,1.101230e+13,...,3.375601e+41,3.688661e+41,3.375601e+41,3.688661e+41,2629632.0,5652480.0,3.586576e+41,-226.034910,166.265140,-223.23492
4,13493858,0.279886,3.055411,45.777673,5.561836e+12,800.853042,822.893445,61.291777,71,5.557688e+12,...,2.930682e+40,3.555951e+40,2.922175e+40,3.547444e+40,1148928.0,3170304.0,6.031505e+40,-238.429080,-55.829100,-228.42908
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12194113,7422479,0.127782,-2.290613,-46.849291,5.109128e+12,372.898101,-397.781361,-21.809497,74,5.111011e+12,...,6.388801e+40,6.984296e+40,6.380294e+40,6.975789e+40,1020928.0,2842624.0,1.175676e+41,157.193000,-96.707030,507.99304
12194114,7421622,0.127696,-2.282832,-46.711938,7.941985e+12,373.292331,-396.293601,-21.702877,74,7.945690e+12,...,2.113153e+41,2.191418e+41,9.017483e+40,9.323737e+40,3502080.0,3702784.0,2.688231e+41,50.399048,-243.600950,405.79895
12194115,7420256,0.127743,-2.912506,-44.372803,2.831814e+13,389.670885,-381.231748,-27.735059,74,2.831242e+13,...,2.136973e+42,2.139696e+42,1.246795e+42,1.253600e+42,7872512.0,7806976.0,1.973638e+42,-160.425900,-243.125920,335.87415
12194116,7421463,0.128083,-2.978288,-43.864296,8.746282e+12,393.683956,-378.378307,-28.409139,74,8.744553e+12,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0.0,0.000000e+00,-207.100950,-355.601000,286.29895


In [28]:
# Extract the overlaps
true_soapid = df['SOAPID'].values
true_snapnum = df['snap_num'].values

mock_soapid = cat['SOAPID'].values
mock_snapnum = cat['snap_num'].values

# the halos from mock lightcones that are in the halo lightcones
mock_overlap_mask = np.isin(mock_soapid, true_soapid) & np.isin(mock_snapnum, true_snapnum)
mock_overlap = cat[mock_overlap_mask]

# the halos from halo_lightcones that are in our mock samples
true_overlap_mask = np.isin(true_soapid, mock_soapid) & np.isin(true_snapnum, mock_snapnum)
true_overlap = df[true_overlap_mask]

print(len(mock_overlap), len(true_overlap))

mock_overlap.sort_values(by='redshift', inplace=True)
mock_overlap = mock_overlap.drop_duplicates(subset=['SOAPID', 'snap_num'], keep='first') 

true_overlap.sort_values(by='redshift', inplace=True)
true_overlap = true_overlap.drop_duplicates(subset=['SOAPID', 'snap_num'], keep='first')

print(len(mock_overlap), len(true_overlap))

131831 198259
131831 138563


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mock_overlap.sort_values(by='redshift', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  true_overlap.sort_values(by='redshift', inplace=True)


In [30]:
mock_overlap

Unnamed: 0,GalaxyID,GasMass,LX0InRestframeWithoutRecentAGNHeating,LX0InRestframeWithoutRecentAGNHeatingCoreExcision,LX0WithoutRecentAGNHeating,LX0WithoutRecentAGNHeatingCoreExcision,M500,M_fof_lc,MfofSOAP,SOAPID,...,Vy,Vz,Y5R500WithoutRecentAGNHeating,phi_on_lc,redshift,snap_num,theta_on_lc,x_lc,y_lc,z_lc
130392,135974422,1.069715e+11,1.301580e+41,5.699730e+40,1.301580e+41,5.699730e+40,6.837588e+12,1.297204e+13,1.297080e+13,12092781,...,236.147705,-193.052368,2.912817e+41,-44.200430,0.005554,77,101.516290,-3.494822,17.152638,-17.023172
132347,685132241,1.777043e+11,8.847342e+40,6.516407e+40,8.847342e+40,6.516407e+40,9.277129e+12,1.418184e+13,1.417339e+13,12588009,...,81.430237,94.230286,3.416435e+41,15.659146,0.006138,77,-13.677499,25.242591,-6.142979,7.282471
131131,698818846,2.450816e+11,1.259045e+41,1.138245e+41,1.259045e+41,1.138245e+41,1.068588e+13,1.628107e+13,1.628652e+13,12307272,...,-1.998352,-109.498352,6.771619e+41,-29.048466,0.006246,77,91.303693,-0.546152,23.998618,-13.332652
130071,633280161,1.290638e+12,1.529910e+42,1.294434e+42,1.529910e+42,1.294434e+42,2.497953e+13,3.717857e+13,3.717724e+13,12092139,...,281.847656,-145.452271,3.892830e+42,-52.096904,0.007459,77,125.106145,-11.580822,16.474088,-25.864622
130349,760442232,1.758252e+11,9.851175e+40,9.851175e+40,9.851175e+40,9.851175e+40,7.138236e+12,1.425507e+13,1.425929e+13,12092675,...,249.047729,-80.252319,4.641451e+41,-42.737751,0.007554,77,136.712910,-17.747387,16.716743,-22.527637
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6,837145724,6.904160e+11,2.460922e+42,8.997066e+41,2.303031e+42,8.738451e+41,1.810758e+13,3.082907e+13,3.082069e+13,767214,...,-160.327271,-39.427307,2.229530e+42,34.786014,0.202810,73,-134.588731,-490.040002,-497.126002,484.904558
9,718970137,2.464237e+11,6.880509e+41,3.804357e+41,6.730785e+41,3.702272e+41,1.027356e+13,1.946145e+13,1.944761e+13,861582,...,-351.529846,-247.929871,1.456409e+42,-35.476700,0.202849,73,45.927398,481.529464,497.375554,-493.374636
14,383816326,5.626407e+11,1.480909e+42,1.155599e+42,1.486353e+42,1.162405e+42,1.745475e+13,3.190104e+13,3.188584e+13,1546718,...,-214.804810,-148.304810,3.223155e+42,-35.751800,0.203772,73,-133.845141,-479.966310,-499.715550,-498.835590
7,519087878,9.663676e+10,7.222493e+40,4.432178e+40,6.933253e+40,4.308825e+40,7.378754e+12,1.117926e+13,1.118409e+13,767907,...,-188.727295,-76.827271,4.205890e+41,35.717822,0.204423,73,-134.500123,-487.325412,-495.903812,499.933428


In [31]:
true_overlap

Unnamed: 0,lc_id,redshift,theta_on_lc,phi_on_lc,M_fof_lc,x_lc,y_lc,z_lc,snap_num,MfofSOAP,...,LX0WithoutRecentAGNHeating,LX0InRestframeWithoutRecentAGNHeating,LX0WithoutRecentAGNHeatingCoreExcision,LX0InRestframeWithoutRecentAGNHeatingCoreExcision,GasTemperatureWithoutRecentAGNHeatingCoreExcision,SpectroscopicLikeTemperatureWithoutRecentAGNHeatingCoreExcision,Y5R500WithoutRecentAGNHeating,Vx,Vy,Vz
1590680,12092782,0.005547,-44.204207,101.430080,1.297204e+13,-3.464560,17.135870,-17.003580,77,1.297080e+13,...,1.301580e+41,1.301580e+41,5.699730e+40,5.699730e+40,3330048.0,3166208.0,2.912817e+41,-373.552370,236.147700,-193.052370
1589977,12588010,0.006131,15.646501,-13.688982,1.418184e+13,25.220365,-6.142925,7.270235,77,1.417339e+13,...,8.847342e+40,8.847342e+40,6.516407e+40,6.516407e+40,4587520.0,4562944.0,3.416435e+41,254.930240,81.430240,94.230286
1589998,12307273,0.006290,-29.078361,91.299223,1.628107e+13,-0.543224,23.952076,-13.323134,77,1.628652e+13,...,1.259045e+41,1.259045e+41,1.138245e+41,1.138245e+41,6070272.0,6103040.0,6.771619e+41,-194.998350,-1.998352,-109.498350
1589811,12589065,0.006897,71.891733,-24.967590,7.090135e+12,8.608027,-4.008063,29.037037,77,7.086696e+12,...,1.184608e+40,1.184608e+40,1.097411e+40,1.097411e+40,1732608.0,2523136.0,4.959615e+40,108.957340,43.157288,315.957340
1590581,12092140,0.007605,-52.183898,125.038375,3.717857e+13,-11.539276,16.456324,-25.896306,77,3.717724e+13,...,1.529910e+42,1.529910e+42,1.294434e+42,1.294434e+42,9764864.0,10240000.0,3.892830e+42,-424.152340,281.847660,-145.452270
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1300387,10640579,0.224880,-3.880162,60.509737,6.256694e+12,460.418854,814.110346,-63.436115,73,6.253472e+12,...,6.031505e+40,9.357765e+40,6.031505e+40,9.357765e+40,2293760.0,2281472.0,9.425822e+40,72.302490,-18.297607,-9.497559
1296683,10640215,0.224885,-7.514917,65.982099,8.673324e+12,378.087345,848.484564,-122.539525,73,8.675834e+12,...,5.572124e+40,5.742265e+40,2.679724e+40,2.892400e+40,2777088.0,3178496.0,1.114425e+41,-36.097534,-216.697570,-18.897583
1280606,12497979,0.224926,-14.921669,89.754234,6.837042e+12,3.884969,905.701690,-241.357629,73,6.837588e+12,...,6.550436e+40,9.766104e+40,6.550436e+40,9.766104e+40,1478656.0,2854912.0,1.485333e+41,121.345580,-334.154420,463.945560
1303845,11358510,0.224970,6.678292,63.028547,1.172043e+13,422.401682,830.032085,109.048476,73,1.171667e+13,...,2.058708e+41,2.102945e+41,1.711620e+41,1.759260e+41,5734400.0,5423104.0,1.813024e+42,-151.119260,-562.319300,-414.319270
