In [1]:
import gzip
import os

import numpy as np
from astropy.table import Table

In [2]:
# File holding the per-column metadata table for the halo catalog.
# NOTE(review): absolute, user-specific path -- it only resolves on the original author's machine.
column_info_fname = "/Users/aphearin/work/sims/bolplanck/halo_catalogs/halotools_v0p4/column_info.dat"
# Parsed with astropy's 'ascii.commented_header' reader; get_column_info (defined in a later
# cell) reads the 'colname' and 'coltype' columns of this table.
column_info_table = Table.read(column_info_fname, format='ascii.commented_header')

In [10]:
# Gzipped ASCII halo catalog to process.
# NOTE(review): absolute, user-specific path -- adjust before running on another machine.
halocat_fname = "/Users/aphearin/work/sims/bolplanck/halo_catalogs/halotools_v0p4/hlist_0.08037.list.gz"

output_base_dirname = os.path.dirname(halocat_fname)
halocat_basename = os.path.basename(halocat_fname)

# The basename must look like ``hlist_<positive number>.list.gz``; the number is whatever
# sits between the fixed prefix and the fixed suffix.
first_idx, last_idx = len('hlist_'), -len('.list.gz')

# sanity check on string formatting (explicit checks rather than ``assert`` + bare
# ``except``, so the validation survives ``python -O`` and does not mask unrelated errors)
a = None
if halocat_basename.startswith('hlist_') and halocat_basename.endswith('.list.gz'):
    try:
        a = float(halocat_basename[first_idx:last_idx])
    except ValueError:
        pass  # non-numeric middle part; reported by the check just below
if a is None or not a > 0.:
    raise ValueError("halocat_fname basename {0} not formatted as expected".format(halocat_basename))

# Output sub-directory is named after the number in the file name, e.g. ``a_0.08037``.
a_substring = 'a_' + halocat_basename[first_idx:last_idx]
output_binaries_dirname = os.path.join(output_base_dirname, a_substring)
print("passively creating the output dirname = \n    {0}".format(output_binaries_dirname))
try:
    os.makedirs(output_binaries_dirname)
except OSError:
    # "Already exists" is the only benign failure; anything else (permissions,
    # a regular file in the way, ...) must surface instead of being swallowed.
    if not os.path.isdir(output_binaries_dirname):
        raise

passively creating the output dirname = 
    /Users/aphearin/work/sims/bolplanck/halo_catalogs/halotools_v0p4/a_0.08037


In [1]:
# Calls ``create_output_dir`` from the local ``halocat_binary_reduction`` module with no arguments.
# NOTE(review): that module is not visible here; the recorded output suggests the call
# returns the ``a_0.08037`` output directory path -- confirm against the module.
from halocat_binary_reduction import create_output_dir

create_output_dir()

'/Users/aphearin/work/sims/bolplanck/halo_catalogs/halotools_v0p4/a_0.08037'

In [1]:
def load_halocat(fname, *colnames):
    """Placeholder that is not implemented yet.

    Accepts a file name followed by any number of column names, performs no
    work, and returns ``None``.
    """
    return None

In [28]:
def get_column_info(colname, column_info_table):
    """Return the position and type string of one column.

    Parameters
    ----------
    colname : str
        Name to search for in ``column_info_table['colname']``.
    column_info_table : table-like
        Object whose ``'colname'`` and ``'coltype'`` entries are equal-length
        arrays; the row number of a name is used as that column's index.

    Returns
    -------
    idx : int
        Row of ``column_info_table`` at which ``colname`` was found.
    dt : str
        The ``'coltype'`` entry on that row.

    Raises
    ------
    IndexError
        If ``colname`` does not appear in the table.
    AssertionError
        If ``colname`` appears more than once.
    """
    idx = np.where(column_info_table['colname'] == colname)[0]
    # Explicit guards instead of ``assert``: assertions are stripped under
    # ``python -O``, which would silently accept ambiguous column names.
    if len(idx) == 0:
        raise IndexError("column name ``{0}`` not available".format(colname))
    if len(idx) > 1:
        raise AssertionError("detected multiple columns names ``{0}``".format(colname))
    return idx[0], column_info_table['coltype'][idx[0]]

In [29]:
# Example lookup: position and type string recorded for the column named 'x'.
get_column_info('x', column_info_table)

(17, 'f4')

In [42]:
def build_composite_dt(column_info_table, *colnames):
    """Build a structured ``numpy.dtype`` covering the requested columns.

    The fields of the returned dtype are ordered by each column's index in
    ``column_info_table`` (as reported by ``get_column_info``), *not* by the
    order in which ``colnames`` were given.

    Parameters
    ----------
    column_info_table : table-like
        Passed through to ``get_column_info`` for every name.
    *colnames : str
        Column names to include; each may appear only once.

    Returns
    -------
    numpy.dtype
        One ``(colname, coltype)`` field per requested column.

    Raises
    ------
    AssertionError
        If ``colnames`` contains repeated elements.
    """
    # Explicit raise rather than ``assert`` so the check is not stripped by ``python -O``.
    if len(colnames) != len(set(colnames)):
        raise AssertionError("Input ``colnames`` sequence contains repeated elements")

    indexed_fields = []
    for colname in colnames:
        idx, dt = get_column_info(colname, column_info_table)
        indexed_fields.append((idx, (colname, dt)))

    # Sort the (name, type) pairs by column index directly, instead of
    # round-tripping them through a NumPy string array.
    indexed_fields.sort(key=lambda pair: pair[0])
    return np.dtype([field for _, field in indexed_fields])

In [43]:
# Example: the fields come back ordered by column index, not by the argument order used here.
build_composite_dt(column_info_table, 'vmax', 'mvir', 'halo_id')

dtype([('halo_id', '<i8'), ('mvir', '<f4'), ('vmax', '<f4')])

In [34]:
def get_index_list(dt, column_info_table=None):
    """Return the column index of every field of ``dt``, in field order.

    Parameters
    ----------
    dt : numpy.dtype
        Structured dtype whose ``names`` are looked up one by one.
    column_info_table : table-like, optional
        Table handed to ``get_column_info``. When omitted, the module-level
        ``column_info_table`` is used, which is what this function always did
        before the parameter existed.

    Returns
    -------
    list
        One index per entry of ``dt.names``.

    Raises
    ------
    NameError
        If no table is passed and no module-level ``column_info_table`` exists.
    """
    if column_info_table is None:
        # Backward-compatible fallback to the notebook-level table.
        try:
            column_info_table = globals()['column_info_table']
        except KeyError:
            raise NameError("name 'column_info_table' is not defined")
    # Only the index is needed here; the type string is discarded (the original
    # loop rebound ``dt`` to it, shadowing the parameter).
    return [get_column_info(colname, column_info_table)[0] for colname in dt.names]

In [46]:
# Build the dtype for three columns, then list the index of each of its fields.
# The dtype's fields are sorted by column index, so the printed indices are ascending.
dt = build_composite_dt(column_info_table, 'vmax', 'halo_id', 'mvir')
list_of_indices_to_use = list(get_index_list(dt))
print(list_of_indices_to_use)

[1, 10, 16]


In [47]:
def row_generator(fname, column_info_table, *colnames):
    """Yield the requested columns of every data row in a gzipped ASCII file.

    Parameters
    ----------
    fname : str
        Path to a gzip-compressed, whitespace-delimited text file. Lines whose
        first non-blank character is ``#`` and blank lines are skipped.
    column_info_table : table-like
        Table used to translate each name in ``colnames`` into a column index.
    *colnames : str
        Names of the columns to extract; each may appear only once.

    Yields
    ------
    tuple of str
        The selected tokens of one row, in the order they appear in the file
        (i.e. by ascending column index, not in the order of ``colnames``).
    """
    # Building the dtype validates ``colnames`` (unknown or repeated names raise).
    build_composite_dt(column_info_table, *colnames)
    # Resolve positions against the table that was passed in, not a notebook
    # global; a frozenset keeps the per-token membership test constant-time.
    wanted = frozenset(
        get_column_info(name, column_info_table)[0] for name in colnames)

    with gzip.open(fname, 'rb') as f:
        for raw_line in f:
            # Python 3 yields bytes here; decode so that the '#' test and the
            # yielded tokens are text on both Python 2 and Python 3.
            if not isinstance(raw_line, str):
                raw_line = raw_line.decode('utf-8')
            tokens = raw_line.split()
            if not tokens or tokens[0].startswith('#'):
                continue
            yield tuple(s for i, s in enumerate(tokens) if i in wanted)

In [48]:
# Materialise every selected row in memory as a list of string tuples.
d = list(row_generator(halocat_fname, column_info_table, 'vmax', 'mvir', 'halo_id'))

In [49]:
# Inspect the first row: values are still unparsed strings at this point.
d[0]

('1465370', '7.74900e+08', '43.750000')

In [55]:
# Convert the string tuples into a structured array. This lines up because row_generator
# emits tokens in file-column order and build_composite_dt sorts its fields by column index.
data_strarr = np.array(d, dtype=build_composite_dt(column_info_table, 'vmax', 'mvir', 'halo_id'))

In [57]:
# Preview the first ten rows as an astropy Table.
Table(data_strarr)[0:10]

halo_id,mvir,vmax
int64,float32,float32
1465370,774900000.0,43.75
1465372,9918000000.0,106.9
1465373,1550000000.0,59.42
1465377,1550000000.0,55.1
1465381,930000000.0,45.87
1465383,1937000000.0,67.02
1465385,2324000000.0,63.36
1465389,2324000000.0,61.76
1465395,309900000.0,32.81
1465397,3874000000.0,76.08
