In [4]:
import os
import json
import gzip
import time
import math
import hilbert
import pyarrow
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq


## Prepare the signal data

In [5]:
chromosomes_to_exclude = ['chrY']

In [6]:
# setup the data directory if it doesn't already exist
data_dir = 'hilbert_genome_cooler_data'
if not os.path.exists(data_dir): 
    os.makedirs(data_dir)

In [7]:
# setup the data directory if it doesn't already exist
bytes_data_dir = 'hilbert_genome_cooler_data/bytes_data'
if not os.path.exists(bytes_data_dir): 
    os.makedirs(bytes_data_dir)

In [8]:
signal_categories = 18
input_signal_NA_category_value = signal_categories - 1

In [9]:
# Get the data file and store it locally
#
# ref. /net/seq/data/projects/Epilogos
#          /multivec-for-browser-2022-redo/epilogos_tracks/single/human
#          /Boix_et_al_833_sample/hg19/18/All_833_biosamples/S1/scores.txt.filledGap.versionSorted.txt.gz
#
signal_remote_URI = 'https://resources.altius.org/~areynolds/public/Boix_et_al_833_sample.hg19.18.All_833_biosamples.S1.scores.txt.gz'
signal_local_fn = os.path.join(data_dir, 'Boix_et_al_833_sample.hg19.18.All_833_biosamples.S1.scores.txt.gz')

In [10]:
if not os.path.exists(signal_local_fn):
    try:
        r = requests.get(signal_remote_URI)
        with open(signal_local_fn, "wb") as ofh:
            b = io.BytesIO(r.content)
            ofh.write(b.getbuffer())
    except requests.exceptions.RequestException as e:
        raise SystemExit(e)

In [11]:
input_signal_dir = os.path.split(os.path.abspath(signal_local_fn))[0]

In [12]:
def generate_categorical_data(ifn):
    signals = {}
    signals['__all'] = []
    with gzip.open(ifn, 'rb') as ifh:
        for line in ifh:
            elems = line.decode().rstrip().split('\t')
            chromosome = elems[0]
            signal = np.array([float(x) for x in elems[3:]])
            if chromosome not in signals:
                signals[chromosome] = []
            if chromosome not in chromosomes_to_exclude:
                signals[chromosome].append({'argmax': np.argmax(signal), 'sum': np.sum(signal)})
                signals['__all'].append({'argmax': np.argmax(signal), 'sum': np.sum(signal)})
            else:
                signals[chromosome].append({'argmax': input_signal_NA_category_value, 'sum': 0})
                signals['__all'].append({'argmax': input_signal_NA_category_value, 'sum': 0})
    return signals

In [13]:
def generate_categorical_data_file(ofn, categorical_data):
    with gzip.open(ofn, 'wb') as ofh:
        categorical_data_json = json.dumps(categorical_data, cls=NumpyEncoder)
        ofh.write(categorical_data_json.encode())

In [14]:
class NumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.bool_):
            return bool(obj)
        return super(NumpyEncoder, self).default(obj)

In [15]:
categorical_data_fn = os.path.join(data_dir,'{}.categorical_data.json.gz'.format(os.path.basename(signal_local_fn)))

In [16]:
if not os.path.exists(categorical_data_fn):
    categorical_data = generate_categorical_data(signal_local_fn)
    generate_categorical_data_file(categorical_data_fn, categorical_data)
else:
    with gzip.open(categorical_data_fn, 'rb') as ifh:
        categorical_data = json.load(ifh)

## Hilbert Genome

In [17]:
import hilbertgenome.chromsizes

In [18]:
genome = hilbertgenome.chromsizes["hg38"]

In [19]:
genome.total_size

3088269832

In [20]:
maxOrder = 16 # 16 is the order that can contain the full genome

In [21]:
pow(4, maxOrder) 

4294967296

In [22]:
# convert a hilbert position from one order to another
def hilbert_pos_to_order(pos, fro, to):
    return math.floor(pos / pow(4, fro - to))

In [23]:
# spread our genome positions evenly across the 16th order hilbert curve
def spread_scale(gpos):
    i_min = 0
    i_max = genome.total_size
    o_min = 0
    o_max = pow(4, maxOrder)
    return round(o_min + (gpos - i_min) * ((o_max - o_min) / (i_max - i_min)))

In [24]:
# get the genome position for a given hilbert position (may return duplicates)
def spread_scale_invert(pos):
    i_min = 0
    i_max = genome.total_size
    o_min = 0
    o_max = pow(4, maxOrder)
    return round((pos - o_min) / ((o_max - o_min) / (i_max - i_min)) + i_min)

In [25]:
# get the global genome position from something like "chr2:12345"
def get_genome_from_base(start):
    chr = start.split(":")[0]
    offset = int(start.split(":")[1])
    return genome.sizes_acc[chr] + offset

In [26]:
def get_hilbert_from_genome(gpos):
    return spread_scale(gpos)

In [27]:
def get_genome_from_hilbert(pos):
  gpos = spread_scale_invert(pos)
  if pos != spread_scale(gpos): 
    return None
  return gpos;

In [28]:
# get the first base at the maximum order resolution 
# that comes after the starting pos and before the next pos at the current order
def scan_spread(pos, order):
  if(order == maxOrder):
    return get_genome_from_hilbert(pos)
  start = hilbert_pos_to_order(pos, order, maxOrder)
  found = get_genome_from_hilbert(start)
  # need to scan everything at the highest order
  # but only until we find something which should usually happen in a few loops
  for l in range(pow(4, maxOrder - order)):
    if found is not None:
        return found
    found = get_genome_from_hilbert(start+l)

In [29]:
# get the last base at the maximum order resolution 
# that comes after the starting pos and before the next pos at the current order
def scan_spread_reverse(pos, order):
  start = hilbert_pos_to_order(pos+1, order, maxOrder) - 1
  found = get_genome_from_hilbert(start)
  # need to scan everything at the highest order
  # but only until we find something which should usually happen in a few loops
  for l in range(pow(4, maxOrder - order)):
    if found is not None:
        return found
    found = get_genome_from_hilbert(start-l)

In [292]:
def get_hilbert_genome_resolution(order):
    hpos = hilbert_pos_to_order(1, order, maxOrder)
    gpos = scan_spread_reverse(1, order)

In [311]:
scan_spread(0, 15)

0

In [312]:
scan_spread_reverse(1, 15)

5

In [331]:
# order 0 doesn't really make sense
hilbert_genome_resolutions = [scan_spread_reverse(0, o) + 1 for o in range(0, 17)]

In [332]:
hilbert_genome_resolutions

[3088269832,
 772067458,
 193016865,
 48254216,
 12063554,
 3015889,
 753972,
 188493,
 47123,
 11781,
 2945,
 736,
 184,
 46,
 12,
 3,
 1]

In [336]:
hilbert_genome_resolutions[8]

47123

## Aggregation

In [360]:
#find_maxvecsum_label = lambda x: x[np.argmax(np.array([i['sum'] for i in x]))]['argmax']
def aggregate(data):
    return data[np.argmax(np.array([i['sum'] for i in data]))]['argmax']
#     return sum([d['argmax'] for d in data])

In [82]:
def accessor(d):
    return d["argmax"]

In [343]:
signal_resolution = 200
# signal_resolution = 25000

In [344]:
def aggregate_range(chromosome, order, hstart, hstop, gstart, individual, dtype="int8"):
    print(chromosome, order, hstart, hstop, gstart, individual, dtype)
    #print("start", hstart, hstop)
    #start_time = time.time()
    samples = categorical_data[chromosome]
    nsamples = len(samples)
    #hp = Hilbert(order, hstart, hstop)
    hp = np.arange(hstart, hstop, dtype=np.int32)
    v = np.arange(hstart, hstop, dtype=np.dtype(dtype))
    print("len v", len(v))
    print("samples", nsamples)
    # loop over the hilbert points for this order
    # can get bounds in genome position
    for i, pos in enumerate(hp):
        # global genome pos
        gpos = scan_spread(pos, order)          
        # at order 16 we may get None values for individual hilbert coordinates
        if(gpos is None):
            v[i] = input_signal_NA_category_value
            continue
        
        # local position
        lpos = gpos - gstart                    
        if(lpos < 0):
            lpos = 0
        
        # the starting index in our sample
        datum_idx = round(lpos/signal_resolution)              
        
        if individual:
            # we have a single point, this will happen for higher orders
            # print("single data",  pos, lpos, slpos, datum_idx)
            if datum_idx >= nsamples:
                v[i] = input_signal_NA_category_value
                #print(i,pos,datum_idx)
            else:
                v[i] = accessor(samples[datum_idx])
        else:
            # we know multiple samples will fit in our hilbert cell
            # so we get the last genome pos in the hilbert cell
            sgpos = scan_spread_reverse(pos, order) # global genome pos           
            slpos = sgpos - gstart                  # local position
            sdatum_idx = round(slpos/signal_resolution)            #the ending index in our samples
            # we need to aggregate at lower orders
            # we grab all the samples from the starting index to the ending index
            data = samples[datum_idx:sdatum_idx+1]  # include the last sample
            # if there are no samples in our list, we save NA
            if(len(data) == 0):
                #print("aggregate data", pos, gpos, lpos, slpos, datum_idx, sdatum_idx, len(data))
                v[i] = input_signal_NA_category_value
            else:
                # we save the output of the user supplied aggregate function
                v[i] = aggregate(data)

    # can convert genome position bounds to sample index range (multiple of 200)
    # collect the datapoints that have the same point
    #print("done in: %s" % (time.time() - start_time))
    fn = aggregate_filename(chromosome, order, hstart, dtype)
    with open(fn, 'wb') as f:
        f.write(v.tobytes())
    #return [hp,v]

In [403]:
bed_data = {
    "chr1": [
        { 'start': 0, 'stop':25000, 'argmax': 1 },
        { 'start': 25001, 'stop':50000, 'argmax': 2 },
        { 'start': 50001, 'stop':75000, 'argmax': 3 },
        { 'start': 75001, 'stop':100000, 'argmax': 4 },
        { 'start': 125001, 'stop':150000, 'argmax': 5 },
        
        { 'start': 225001, 'stop':250000, 'argmax': 6 },
        { 'start': 250001, 'stop':275000, 'argmax': 7 },
    ]
}

In [404]:
bed_data["chr1"][2:]

[{'start': 50001, 'stop': 75000, 'argmax': 3},
 {'start': 75001, 'stop': 100000, 'argmax': 4},
 {'start': 125001, 'stop': 150000, 'argmax': 5},
 {'start': 225001, 'stop': 250000, 'argmax': 6},
 {'start': 250001, 'stop': 275000, 'argmax': 7}]

In [405]:
# Want an alternative "sampling" aggregate_range where we look up if the hilbert coordinate is within a BED file range
# could filter down the "samples" that overlap with the hilbert region before iterating
# again we will have the case where there are many samples in one hilbert index
# or that we are a tiny hilbert index hitting the same sample over and over
# the above code assumes coverage of whole chromosome by samples, but we want to search regions
# this does assume that regions dont overlap and are sorted
def aggregate_range_region(chromosome, order, hstart, hstop, gstart, individual, dtype="int8"):
    print(chromosome, order, hstart, hstop, gstart, individual, dtype)

    # each sample will have a start, stop and some value
    samples = bed_data[chromosome]
    # nsamples = len(samples)
    #hp = Hilbert(order, hstart, hstop)
    hp = np.arange(hstart, hstop, dtype=np.int32)
    v = np.arange(hstart, hstop, dtype=np.dtype(dtype))
    #print("len v", len(v))
    #print("samples", nsamples)
    # loop over the hilbert points for this order
    # can get bounds in genome position
    
    last_idx = 0
    for i, pos in enumerate(hp):
        # global genome pos
        gpos = scan_spread(pos, order)          
        # at order 16 we may get None values for individual hilbert coordinates
        if(gpos is None):
            v[i] = input_signal_NA_category_value
            continue
        
        # local position
        lpos = gpos - gstart                    
        if(lpos < 0):
            lpos = 0
        
        if individual:
            # our hilbert pos is smaller than the signal resolution
            # loop thru the samples
            found = False
            for idx,s in enumerate(samples[last_idx:]):
                if(lpos > s['start']):
                    # we are inside a sample
                    v[i] = accessor(samples[idx])
                    last_idx = idx
                    found = True
                    break
            if not found:
                v[i] = input_signal_NA_category_value
                    
        else:
            # we know multiple samples will fit in our hilbert cell
            # so we get the last genome pos in the hilbert cell
            sgpos = scan_spread_reverse(pos, order) # global genome pos           
            slpos = sgpos - gstart                  # local position stop
            
            # get the samples that fall within the hilbert range
            data = []
            print(lpos, slpos)
            for idx,s in enumerate(samples[last_idx:]):
                if(lpos <= s['start'] and slpos >= s['start']):
                    # our sample is inside the hilbert cell
                    data.append(s)
                    last_idx += 1
                if(lpos > s['stop']):
                    break
                    
            #print("data", data, pos, last_idx)
            # if there are no samples in our list, we save NA
            if(len(data) == 0):
                #print("aggregate data", pos, gpos, lpos, slpos, datum_idx, sdatum_idx, len(data))
                v[i] = input_signal_NA_category_value
            else:
                # we save the output of the user supplied aggregate function
                v[i] = aggregate(data)

    # can convert genome position bounds to sample index range (multiple of 200)
    # collect the datapoints that have the same point
    #print("done in: %s" % (time.time() - start_time))
    
#     fn = aggregate_filename(chromosome, order, hstart, dtype)
#     with open(fn, 'wb') as f:
#         f.write(v.tobytes())
        
    return [hp,v]

In [406]:
files8 = files_for_order(8)

samples 1246254
chromosome chr1 order 8
genome 0 248956422
hilbert 0 5283
length 5283
individual False
samples 1215997
chromosome chr2 order 8
genome 248956422 491149951
hilbert 5283 10422
length 5139
individual False
samples 990113
chromosome chr3 order 8
genome 491149951 689445510
hilbert 10422 14630
length 4208
individual False
samples 955772
chromosome chr4 order 8
genome 689445510 879660065
hilbert 14630 18667
length 4037
individual False
samples 904577
chromosome chr5 order 8
genome 879660065 1061198324
hilbert 18667 22519
length 3852
individual False
samples 855576
chromosome chr6 order 8
genome 1061198324 1232004303
hilbert 22519 26144
length 3625
individual False
samples 795694
chromosome chr7 order 8
genome 1232004303 1391350276
hilbert 26144 29525
length 3381
individual False
samples 731821
chromosome chr8 order 8
genome 1391350276 1536488912
hilbert 29525 32605
length 3080
individual False
samples 706068
chromosome chr9 order 8
genome 1536488912 1674883629
hilbert 32605 355

In [407]:
files8

[{'chromosome': 'chr1',
  'order': 8,
  'hstart': 0,
  'hstop': 5283,
  'gstart': 0,
  'individual': False,
  'dtype': 'int8'},
 {'chromosome': 'chr2',
  'order': 8,
  'hstart': 5283,
  'hstop': 10422,
  'gstart': 248956422,
  'individual': False,
  'dtype': 'int8'},
 {'chromosome': 'chr3',
  'order': 8,
  'hstart': 10422,
  'hstop': 14630,
  'gstart': 491149951,
  'individual': False,
  'dtype': 'int8'},
 {'chromosome': 'chr4',
  'order': 8,
  'hstart': 14630,
  'hstop': 18667,
  'gstart': 689445510,
  'individual': False,
  'dtype': 'int8'},
 {'chromosome': 'chr5',
  'order': 8,
  'hstart': 18667,
  'hstop': 22519,
  'gstart': 879660065,
  'individual': False,
  'dtype': 'int8'},
 {'chromosome': 'chr6',
  'order': 8,
  'hstart': 22519,
  'hstop': 26144,
  'gstart': 1061198324,
  'individual': False,
  'dtype': 'int8'},
 {'chromosome': 'chr7',
  'order': 8,
  'hstart': 26144,
  'hstop': 29525,
  'gstart': 1232004303,
  'individual': False,
  'dtype': 'int8'},
 {'chromosome': 'chr8',
 

In [408]:
agg = aggregate_range_region(**files8[0])

chr1 8 0 5283 0 False int8
0 47122
data [{'start': 0, 'stop': 25000, 'argmax': 1}, {'start': 25001, 'stop': 50000, 'argmax': 2}] 0 2
47123 94246
data [{'start': 50001, 'stop': 75000, 'argmax': 3}, {'start': 75001, 'stop': 100000, 'argmax': 4}] 1 4
94247 141369
data [{'start': 125001, 'stop': 150000, 'argmax': 5}] 2 5
141370 188492
data [] 3 5
188493 235615
data [{'start': 225001, 'stop': 250000, 'argmax': 6}] 4 6
235616 282739
data [{'start': 250001, 'stop': 275000, 'argmax': 7}] 5 7
282740 329862
data [] 6 7
329863 376985
data [] 7 7
376986 424108
data [] 8 7
424109 471232
data [] 9 7
471233 518355
data [] 10 7
518356 565478
data [] 11 7
565479 612601
data [] 12 7
612602 659725
data [] 13 7
659726 706848
data [] 14 7
706849 753971
data [] 15 7
753972 801095
data [] 16 7
801096 848218
data [] 17 7
848219 895341
data [] 18 7
895342 942464
data [] 19 7
942465 989588
data [] 20 7
989589 1036711
data [] 21 7
1036712 1083834
data [] 22 7
1083835 1130957
data [] 23 7
1130958 1178081
data [] 

In [409]:
agg[1][0:20]

array([ 3,  7,  5, 17,  6,  7, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
       17, 17, 17], dtype=int8)

In [189]:
def aggregate_parallel(params):
    start_time = time.time()
    aggregate_range(**params)
    print(params['chromosome'], params['order'], params['hstart'], "done in: %s" % (time.time() - start_time))

In [274]:
def aggregate_chromosome(chromosome, order, chunk_size=4**11, dtype="int8"):
    
    samples = categorical_data[chromosome]
    nsamples = len(samples)
    print("samples", nsamples)
    
    gstart = genome.sizes_acc[chromosome]
    gstop = genome.sizes_acc[chromosome] + genome.sizes[chromosome]
    hbstart = get_hilbert_from_genome(gstart)
    hbstop = get_hilbert_from_genome(gstop)
    
    hstart = hilbert_pos_to_order(hbstart, 16, order)
    hstop = hilbert_pos_to_order(hbstop, 16, order)
    hlen = hstop - hstart
    
    print("chromosome", chromosome, "order", order)
    print("genome", gstart, gstop)
    print("hilbert", hstart, hstop)
    print("length", hlen)

    # if there are more hilbert indices than samples we dont need to aggregate
    # really it should be sample resolution vs hilbert genome resolution...
    # individual = (hlen) / nsamples > 1
    individual = hilbert_genome_resolutions[order] < signal_resolution
    print("individual", individual)
    
    params = []
    if hlen > chunk_size:
        # break up our hlen into chunks
        nchunks = math.ceil(hlen/chunk_size)
        print("chunks", nchunks)
        for i in range(0, nchunks):
            cstart = hstart + chunk_size * i
            cstop = cstart + chunk_size
            if cstop > hstop:
                cstop = hstop
            params.append({
                'chromosome': chromosome,
                'order': order,
                'hstart': cstart,
                'hstop': cstop,
                'gstart': gstart, # this stays the same
                'individual': individual,
                'dtype': dtype
            })
            
    else:
        # TODO: chunk up the hilbert and run the aggregate_range in parallel
        params.append({
            'chromosome': chromosome,
            'order': order,
            'hstart': hstart,
            'hstop': hstop,
            'gstart': gstart,
            'individual': individual,
            'dtype': dtype
        })
    return params
    #return aggregate_range(chromosome, order, hstart, hstop, gstart, individual, dtype)
    # the parallel version can write files to disk
    # then when its done combine them

In [191]:
def aggregate_filename(chromosome, order, hstart, dtype):
    return os.path.join(bytes_data_dir, chromosome + "_" + str(order) + "_" + str(hstart) + "." + dtype)

## Generate byte files

In [192]:
# def write_bytes(chromosome, order):
#     agg = aggregate_chromosome(chromosome, order)
#     bfn = os.path.join(bytes_data_dir, chromosome + "_" + str(order) + ".int8")
#     with open(bfn, 'wb') as f:
#         f.write(agg[1].tobytes())

In [193]:
# write_bytes("chr1", 13)

In [194]:
def files_for_order(order):
    files = []
    for c in genome.chromosomes:
        files += aggregate_chromosome(c, order)
    return files

In [322]:
files = files_for_order(12)

samples 1246254
chromosome chr1 order 12
genome 0 248956422
hilbert 0 1352471
length 1352471
individual True
samples 1215997
chromosome chr2 order 12
genome 248956422 491149951
hilbert 1352471 2668202
length 1315731
individual True
samples 990113
chromosome chr3 order 12
genome 491149951 689445510
hilbert 2668202 3745455
length 1077253
individual True
samples 955772
chromosome chr4 order 12
genome 689445510 879660065
hilbert 3745455 4778807
length 1033352
individual True
samples 904577
chromosome chr5 order 12
genome 879660065 1061198324
hilbert 4778807 5765025
length 986218
individual True
samples 855576
chromosome chr6 order 12
genome 1061198324 1232004303
hilbert 5765025 6692939
length 927914
individual True
samples 795694
chromosome chr7 order 12
genome 1232004303 1391350276
hilbert 6692939 7558596
length 865657
individual True
samples 731821
chromosome chr8 order 12
genome 1391350276 1536488912
hilbert 7558596 8347070
length 788474
individual True
samples 706068
chromosome chr9 or

In [323]:
files

[{'chromosome': 'chr1',
  'order': 12,
  'hstart': 0,
  'hstop': 1352471,
  'gstart': 0,
  'individual': True,
  'dtype': 'int8'},
 {'chromosome': 'chr2',
  'order': 12,
  'hstart': 1352471,
  'hstop': 2668202,
  'gstart': 248956422,
  'individual': True,
  'dtype': 'int8'},
 {'chromosome': 'chr3',
  'order': 12,
  'hstart': 2668202,
  'hstop': 3745455,
  'gstart': 491149951,
  'individual': True,
  'dtype': 'int8'},
 {'chromosome': 'chr4',
  'order': 12,
  'hstart': 3745455,
  'hstop': 4778807,
  'gstart': 689445510,
  'individual': True,
  'dtype': 'int8'},
 {'chromosome': 'chr5',
  'order': 12,
  'hstart': 4778807,
  'hstop': 5765025,
  'gstart': 879660065,
  'individual': True,
  'dtype': 'int8'},
 {'chromosome': 'chr6',
  'order': 12,
  'hstart': 5765025,
  'hstop': 6692939,
  'gstart': 1061198324,
  'individual': True,
  'dtype': 'int8'},
 {'chromosome': 'chr7',
  'order': 12,
  'hstart': 6692939,
  'hstop': 7558596,
  'gstart': 1232004303,
  'individual': True,
  'dtype': 'int8'

In [324]:
len(files)

24

In [250]:
s = 0
for f in files:
    s += f['hstop'] - f['hstart']
print(s)

67108864


In [251]:
4**13

67108864

In [209]:
files[0]

{'chromosome': 'chr1',
 'order': 7,
 'hstart': 0,
 'hstop': 1320,
 'gstart': 0,
 'individual': False,
 'dtype': 'int8'}

In [221]:
# aggregate_parallel(files[0])

chr1 7 0 1320 0 False int8
len v 1320
samples 1246254
chr1 7 0 done in: 0.12947702407836914


In [183]:
# import multiprocessing as mp
import multiprocess as mp

In [224]:
def generate_order(order):
    start_time = time.time()
    files = files_for_order(order)
    workers = mp.cpu_count() - 1
    with mp.Pool(workers) as p:
        p.map(aggregate_parallel, files)
    print(len(files), "done in: %s" % (time.time() - start_time))
    print("combining files")
    combine_bytes_files(files, order)
    #return files

In [254]:
def combine_bytes_files(files, order):
    paths = [aggregate_filename(f['chromosome'], f['order'], f['hstart'], f['dtype']) for f in files]
    print(paths)
    out = os.path.join(bytes_data_dir,"order_" + str(order) + "." + files[0]['dtype'])
    with open(out, "wb") as of:
        for p in paths:
            with open(p, "rb") as rf:
                of.write(rf.read())

In [334]:
generate_order(12)

samples 1246254
chromosome chr1 order 12
genome 0 248956422
hilbert 0 1352471
length 1352471
individual True
samples 1215997
chromosome chr2 order 12
genome 248956422 491149951
hilbert 1352471 2668202
length 1315731
individual True
samples 990113
chromosome chr3 order 12
genome 491149951 689445510
hilbert 2668202 3745455
length 1077253
individual True
samples 955772
chromosome chr4 order 12
genome 689445510 879660065
hilbert 3745455 4778807
length 1033352
individual True
samples 904577
chromosome chr5 order 12
genome 879660065 1061198324
hilbert 4778807 5765025
length 986218
individual True
samples 855576
chromosome chr6 order 12
genome 1061198324 1232004303
hilbert 5765025 6692939
length 927914
individual True
samples 795694
chromosome chr7 order 12
genome 1232004303 1391350276
hilbert 6692939 7558596
length 865657
individual True
samples 731821
chromosome chr8 order 12
genome 1391350276 1536488912
hilbert 7558596 8347070
length 788474
individual True
samples 706068
chromosome chr9 or

In [None]:
# order 15 took 400 seconds w/ 9 workers on mbp m1 pro
# order 16 took 2000 seconds (1034 files)