In [92]:
import os
import json
import gzip
import time
import math
import hilbert
import pyarrow
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq


## Prepare the signal data

In [93]:
# Chromosomes listed here still get one record per input row, but every record carries
# the NA category and a sum of 0 instead of the real signal (see generate_categorical_data).
chromosomes_to_exclude = ['chrY']

In [94]:
# Create the data directory. exist_ok=True makes this a no-op when the directory is
# already there and avoids the race between a separate existence check and the creation.
data_dir = 'hilbert_genome_cooler_data'
os.makedirs(data_dir, exist_ok=True)

In [95]:
# Parquet output lives in a subdirectory of the data directory. The path is derived
# from data_dir so the two cannot drift apart; exist_ok=True makes re-runs a no-op.
parquet_data_dir = os.path.join(data_dir, 'parquet_data')
os.makedirs(parquet_data_dir, exist_ok=True)

In [96]:
# Number of categories. The highest index (signal_categories - 1) is also the value
# written for excluded chromosomes and for Hilbert cells that have no data.
# NOTE(review): np.argmax over the score columns can presumably return this same index
# for real data — confirm that sharing the value is intended.
signal_categories = 18
input_signal_NA_category_value = signal_categories - 1

In [97]:
# Remote location of the gzipped signal file and the local path (inside data_dir)
# where it is stored after download.
#
# ref. /net/seq/data/projects/Epilogos
#          /multivec-for-browser-2022-redo/epilogos_tracks/single/human
#          /Boix_et_al_833_sample/hg19/18/All_833_biosamples/S1/scores.txt.filledGap.versionSorted.txt.gz
#
signal_remote_URI = 'https://resources.altius.org/~areynolds/public/Boix_et_al_833_sample.hg19.18.All_833_biosamples.S1.scores.txt.gz'
signal_local_fn = os.path.join(data_dir, 'Boix_et_al_833_sample.hg19.18.All_833_biosamples.S1.scores.txt.gz')

In [98]:
# Download the signal file once; later runs reuse the local copy.
if not os.path.exists(signal_local_fn):
    # Imported locally: the notebook's import cell provides no HTTP client, and the
    # standard library is enough for a plain GET.
    import shutil
    import urllib.error
    import urllib.request

    # Stream into a temporary sibling file and rename it only once the transfer has
    # finished, so an interrupted download is never mistaken for a complete local copy.
    partial_fn = signal_local_fn + '.part'
    try:
        # urlopen raises HTTPError (a URLError subclass) on 4xx/5xx responses, so an
        # error page is never saved as if it were the data file.
        with urllib.request.urlopen(signal_remote_URI) as response, open(partial_fn, 'wb') as ofh:
            shutil.copyfileobj(response, ofh)
        os.replace(partial_fn, signal_local_fn)
    except urllib.error.URLError as e:
        raise SystemExit(e)
    finally:
        # Only present when the download did not complete.
        if os.path.exists(partial_fn):
            os.remove(partial_fn)

In [99]:
# Absolute path of the directory that holds the downloaded signal file.
input_signal_dir = os.path.dirname(os.path.abspath(signal_local_fn))

In [100]:
def generate_categorical_data(ifn):
    """Reduce every row of the gzipped, tab-separated signal file to a category record.

    Each row is split on tabs; the first field is the chromosome name and the fields
    from the fourth onward are parsed as floats. A row becomes
    ``{'argmax': <index of the largest value>, 'sum': <sum of the values>}``.
    Rows on a chromosome listed in ``chromosomes_to_exclude`` instead become
    ``{'argmax': input_signal_NA_category_value, 'sum': 0}``.

    Returns a dict mapping each chromosome name to its list of records in file order,
    plus the key ``'__all'`` holding the records of every row.
    """
    signals = {'__all': []}
    with gzip.open(ifn, 'rb') as ifh:
        for raw_line in ifh:
            fields = raw_line.decode().rstrip().split('\t')
            chrom = fields[0]
            scores = np.array([float(value) for value in fields[3:]])
            per_chrom = signals.setdefault(chrom, [])
            if chrom in chromosomes_to_exclude:
                label, total = input_signal_NA_category_value, 0
            else:
                label, total = np.argmax(scores), np.sum(scores)
            # Two separate dicts, so the per-chromosome and '__all' lists never share objects.
            per_chrom.append({'argmax': label, 'sum': total})
            signals['__all'].append({'argmax': label, 'sum': total})
    return signals

In [101]:
def generate_categorical_data_file(ofn, categorical_data):
    """Write ``categorical_data`` to ``ofn`` as gzip-compressed JSON.

    NumPy scalars and arrays are converted by ``NumpyEncoder``. The data is serialised
    before the output file is opened: if serialisation fails, no empty file is left
    behind for a later existence check to mistake for a valid cache.
    """
    payload = json.dumps(categorical_data, cls=NumpyEncoder).encode()
    with gzip.open(ofn, 'wb') as ofh:
        ofh.write(payload)

In [102]:
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy arrays and scalars to native Python values."""

    # NumPy scalar families and the built-in type each one is converted with.
    _SCALAR_CONVERTERS = (
        (np.integer, int),
        (np.floating, float),
        (np.bool_, bool),
    )

    def default(self, obj):
        """Return a JSON-serialisable equivalent of ``obj``, deferring unknown types to the base class."""
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        for numpy_type, convert in self._SCALAR_CONVERTERS:
            if isinstance(obj, numpy_type):
                return convert(obj)
        # The base implementation raises TypeError for anything it cannot encode.
        return super().default(obj)

In [103]:
# Cache file for the derived category records, named after the signal file.
categorical_data_fn = os.path.join(data_dir, f'{os.path.basename(signal_local_fn)}.categorical_data.json.gz')

In [104]:
# Reuse the cached category records when present; otherwise derive them from the
# signal file and write the cache for the next run.
if os.path.exists(categorical_data_fn):
    with gzip.open(categorical_data_fn, 'rb') as ifh:
        categorical_data = json.load(ifh)
else:
    categorical_data = generate_categorical_data(signal_local_fn)
    generate_categorical_data_file(categorical_data_fn, categorical_data)

## Hilbert Genome

In [28]:
import hilbertgenome.chromsizes

In [29]:
# Chromosome sizes and start offsets used to place genome positions on the Hilbert curve.
# NOTE(review): this selects "hg38", but the signal file name and its source path say
# hg19 — confirm the assembly; a mismatch would shift the position-to-sample lookups.
genome = hilbertgenome.chromsizes["hg38"]

In [30]:
genome.total_size

3088269832

In [33]:
maxOrder = 16 # 16 is the order that can contain the full genome

In [34]:
pow(4, maxOrder) 

4294967296

In [35]:
# convert a hilbert position from one order to another
def hilbert_pos_to_order(pos, fro, to):
    """Rescale Hilbert position ``pos`` from order ``fro`` to order ``to``.

    Each step down in order merges four consecutive positions into one, so the
    position is divided by ``4 ** (fro - to)`` and floored. When ``to`` is the higher
    order the divisor is a fraction and the position is scaled up instead.
    """
    ratio = 4 ** (fro - to)
    return math.floor(pos / ratio)

In [36]:
# spread our genome positions evenly across the 16th order hilbert curve
def spread_scale(gpos):
    """Map global genome position ``gpos`` onto the maximum-order Hilbert curve.

    The genome range ``[0, genome.total_size]`` is stretched linearly over
    ``[0, 4 ** maxOrder]`` and the result is rounded to the nearest integer.
    """
    hilbert_per_base = pow(4, maxOrder) / genome.total_size
    return round(gpos * hilbert_per_base)

In [37]:
# get the genome position for a given hilbert position (may return duplicates)
def spread_scale_invert(pos):
    """Map maximum-order Hilbert position ``pos`` back to a global genome position.

    This is the rounded inverse of the linear stretch applied by ``spread_scale``; the
    returned genome position does not necessarily map forward to ``pos`` again.
    """
    hilbert_per_base = pow(4, maxOrder) / genome.total_size
    return round(pos / hilbert_per_base)

In [38]:
# get the global genome position from something like "chr2:12345"
def get_genome_from_base(start):
    """Convert a ``"<chromosome>:<offset>"`` string into a global genome position.

    The offset is added to the chromosome's entry in ``genome.sizes_acc``.
    """
    fields = start.split(":")
    chrom_name = fields[0]
    local_offset = int(fields[1])
    return genome.sizes_acc[chrom_name] + local_offset

In [39]:
def get_hilbert_from_genome(gpos):
    """Return the maximum-order Hilbert position for global genome position ``gpos``.

    Thin wrapper around ``spread_scale``.
    """
    return spread_scale(gpos)

In [40]:
def get_genome_from_hilbert(pos):
    """Return the genome position that maps exactly onto Hilbert position ``pos``.

    The position is inverted with ``spread_scale_invert`` and mapped forward again with
    ``spread_scale``. If the round trip does not land on ``pos``, no genome position
    maps to it and ``None`` is returned.
    """
    candidate = spread_scale_invert(pos)
    return candidate if pos == spread_scale(candidate) else None

In [43]:
# get the first base at the maximum order resolution 
# that comes after the starting pos and before the next pos at the current order
def scan_spread(pos, order):
    """Return the first genome position found inside Hilbert cell ``pos`` of ``order``.

    The cell is expanded to its maximum-order positions, which are tried in ascending
    order until ``get_genome_from_hilbert`` yields a genome position. Returns ``None``
    when none of them maps to one.

    Every candidate is checked exactly once. The previous loop looked the first
    candidate up twice and computed the last one without ever testing it.
    """
    if order == maxOrder:
        return get_genome_from_hilbert(pos)
    start = hilbert_pos_to_order(pos, order, maxOrder)
    # Scan everything at the highest order, stopping at the first hit.
    for offset in range(pow(4, maxOrder - order)):
        found = get_genome_from_hilbert(start + offset)
        if found is not None:
            return found
    return None

In [44]:
# get the last base at the maximum order resolution 
# that comes after the starting pos and before the next pos at the current order
def scan_spread_reverse(pos, order):
    """Return the last genome position found inside Hilbert cell ``pos`` of ``order``.

    The scan starts at the final maximum-order position of the cell (one before the
    start of the next cell) and walks backwards until ``get_genome_from_hilbert``
    yields a genome position. Returns ``None`` when none of them maps to one.

    Every candidate is checked exactly once. The previous loop looked the first
    candidate up twice and computed the last one without ever testing it.
    """
    start = hilbert_pos_to_order(pos + 1, order, maxOrder) - 1
    # Scan everything at the highest order, stopping at the first hit.
    for offset in range(pow(4, maxOrder - order)):
        found = get_genome_from_hilbert(start - offset)
        if found is not None:
            return found
    return None

## Aggregation

In [47]:
#find_maxvecsum_label = lambda x: x[np.argmax(np.array([i['sum'] for i in x]))]['argmax']
def aggregate(data):
    """Return the ``'argmax'`` of the record in ``data`` with the largest ``'sum'``.

    On ties the earliest record wins, because ``np.argmax`` returns the first maximum.
    """
    sums = np.array([record['sum'] for record in data])
    strongest = np.argmax(sums)
    return data[strongest]['argmax']

In [63]:
def accessor(d):
    """Return the category stored under the ``"argmax"`` key of sample record ``d``."""
    return d["argmax"]

In [45]:
# aggregate_chromosome divides a chromosome-local genome position by this value to get
# an index into the chromosome's sample list (presumably 200-base bins — confirm
# against the signal file).
signal_resolution = 200

In [78]:
def aggregate_chromosome(chromosome, order):
    """Assign one category to every Hilbert cell of ``chromosome`` at ``order``.

    The chromosome's genome range is mapped onto the maximum-order Hilbert curve and
    reduced to ``order``. Each cell in that range then receives either

    * the category of the single sample at the cell's first genome position, when the
      range has more cells than the chromosome has samples, or
    * the result of ``aggregate`` over the samples between the cell's first and last
      genome positions otherwise.

    Cells with no genome position or no samples keep ``input_signal_NA_category_value``.

    Returns ``[hp, v]``: ``hp`` holds the Hilbert positions and ``v`` the category for
    each of them, both as NumPy arrays of equal length.
    """
    start_time = time.time()

    samples = categorical_data[chromosome]
    nsamples = len(samples)
    print("samples", nsamples)
    # sample resolution
    sr = signal_resolution

    gstart = genome.sizes_acc[chromosome]
    gstop = gstart + genome.sizes[chromosome]
    hbstart = get_hilbert_from_genome(gstart)
    hbstop = get_hilbert_from_genome(gstop)

    # get_hilbert_from_genome works at maxOrder, so reduce from maxOrder rather than
    # from a hard-coded 16 that would silently disagree if maxOrder changed.
    # TODO: break this up into pieces for the highest orders
    hstart = hilbert_pos_to_order(hbstart, maxOrder, order)
    hstop = hilbert_pos_to_order(hbstop, maxOrder, order)

    print("chromosome", chromosome, "order", order)
    print("genome", gstart, gstop)
    print("hilbert", hstart, hstop)
    print("length", hstop - hstart)

    # If there are more hilbert indices than samples we don't need to aggregate.
    # Written as a comparison so that an empty sample list cannot divide by zero.
    individual = (hstop - hstart) > nsamples
    print("individual", individual)

    hp = np.arange(hstart, hstop)
    # Start every cell at NA (same dtype as hp); the loop overwrites only cells with data.
    v = np.full_like(hp, input_signal_NA_category_value)

    # loop over the hilbert points for this order
    for i, pos in enumerate(hp):
        # global genome position of the first base in this cell
        gpos = scan_spread(pos, order)
        # at order 16 we may get None values for individual hilbert coordinates
        if gpos is None:
            continue

        # chromosome-local position; the first cell can start before the chromosome does
        lpos = max(gpos - gstart, 0)

        # the starting index in our samples
        # NOTE(review): round() snaps to the nearest sample boundary, so a position in
        # the upper half of a sample's span selects the following sample, whereas
        # `lpos // sr` would select the sample containing it — confirm which is intended.
        datum_idx = round(lpos / sr)

        if individual:
            # we have a single point, this will happen for higher orders
            if datum_idx < nsamples:
                v[i] = accessor(samples[datum_idx])
            continue

        # multiple samples fit in this hilbert cell, so find its last genome position
        sgpos = scan_spread_reverse(pos, order)
        if sgpos is None:
            # treat a cell without an end position like one without a start: leave it NA
            continue
        slpos = sgpos - gstart
        # the ending index in our samples
        sdatum_idx = round(slpos / sr)
        # all samples from the starting index through the ending index, inclusive
        data = samples[datum_idx:sdatum_idx + 1]
        # if there are no samples in the range the cell stays NA
        if len(data) > 0:
            v[i] = aggregate(data)

    print("done in: %s" % (time.time() - start_time))
    return [hp, v]

In [77]:
# Try the aggregation on a single chromosome (chr1 at order 11) before the full run.
agg1 = aggregate_chromosome("chr1", 11)

samples 1246254
chromosome chr1 order 11
genome 0 248956422
hilbert 0 338117
length 338117
individual False
done in: 2.274808883666992


In [83]:
agg1

[array([     0,      1,      2, ..., 338114, 338115, 338116]),
 array([17, 17, 17, ..., 17, 17, 17])]

In [436]:
# Scratch check: write only the category values of agg1 to a gzip-compressed parquet
# file in the working directory.
# NOTE(review): the file name says order 13, but the earlier cell in this notebook
# builds agg1 at order 11 — confirm which order this file is meant to hold.
pq.write_table(pa.table({
    # "index": agg1[0], 
    "value": agg1[1]
}), "test_chr1_13.parquet", compression='gzip')

## Generate parquet files

In [85]:
def write_parquet(chromosome, order):
    """Aggregate ``chromosome`` at ``order`` and write the result to a parquet file.

    The file is ``<chromosome>_<order>.parquet`` inside ``parquet_data_dir`` and is
    gzip-compressed. Only the category values are stored, in a single ``"value"``
    column; the Hilbert positions returned by the aggregation are not written.
    """
    out_path = os.path.join(parquet_data_dir, "{}_{}.parquet".format(chromosome, order))
    aggregated = aggregate_chromosome(chromosome, order)
    columns = {"value": aggregated[1]}
    pq.write_table(pa.table(columns), out_path, compression='gzip')

In [86]:
from hilbertgenome.coalesce import coalesce_parquets

In [89]:
def combine(order):
    """Merge the per-chromosome parquet files of ``order`` into ``order_<order>.parquet``.

    The inputs are the ``<chromosome>_<order>.parquet`` files in ``parquet_data_dir``,
    taken in the order of ``genome.chromosomes``. The list of input paths is printed
    before they are handed to ``coalesce_parquets``.
    """
    paths = []
    for chromosome in genome.chromosomes:
        paths.append(os.path.join(parquet_data_dir, "{}_{}.parquet".format(chromosome, order)))
    print(paths)
    combined_path = os.path.join(parquet_data_dir, "order_{}.parquet".format(order))
    coalesce_parquets(paths, combined_path)

In [105]:
# For each order from 5 through 13: write one parquet file per chromosome, then merge
# that order's per-chromosome files with combine().
for o in range(5, 14):
    for c in genome.chromosomes:
        write_parquet(c, o)
    combine(o)

samples 1246254
chromosome chr1 order 5
genome 0 248956422
hilbert 0 82
length 82
individual False
done in: 0.22801518440246582
samples 1215997
chromosome chr2 order 5
genome 248956422 491149951
hilbert 82 162
length 80
individual False
done in: 0.1953120231628418
samples 990113
chromosome chr3 order 5
genome 491149951 689445510
hilbert 162 228
length 66
individual False
done in: 0.13689899444580078
samples 955772
chromosome chr4 order 5
genome 689445510 879660065
hilbert 228 291
length 63
individual False
done in: 0.12675714492797852
samples 904577
chromosome chr5 order 5
genome 879660065 1061198324
hilbert 291 351
length 60
individual False
done in: 0.11143302917480469
samples 855576
chromosome chr6 order 5
genome 1061198324 1232004303
hilbert 351 408
length 57
individual False
done in: 0.11091017723083496
samples 795694
chromosome chr7 order 5
genome 1232004303 1391350276
hilbert 408 461
length 53
individual False
done in: 0.14467406272888184
samples 731821
chromosome chr8 order 5
g

samples 955772
chromosome chr4 order 7
genome 689445510 879660065
hilbert 3657 4666
length 1009
individual False
done in: 0.06729817390441895
samples 904577
chromosome chr5 order 7
genome 879660065 1061198324
hilbert 4666 5629
length 963
individual False
done in: 0.06038308143615723
samples 855576
chromosome chr6 order 7
genome 1061198324 1232004303
hilbert 5629 6536
length 907
individual False
done in: 0.058145999908447266
samples 795694
chromosome chr7 order 7
genome 1232004303 1391350276
hilbert 6536 7381
length 845
individual False
done in: 0.05358409881591797
samples 731821
chromosome chr8 order 7
genome 1391350276 1536488912
hilbert 7381 8151
length 770
individual False
done in: 0.05056619644165039
samples 706068
chromosome chr9 order 7
genome 1536488912 1674883629
hilbert 8151 8885
length 734
individual False
done in: 0.04796123504638672
samples 677674
chromosome chr10 order 7
genome 1674883629 1808681051
hilbert 8885 9595
length 710
individual False
done in: 0.04598903656005859

done in: 0.2255110740661621
samples 1215997
chromosome chr2 order 9
genome 248956422 491149951
hilbert 21132 41690
length 20558
individual False
done in: 0.21035099029541016
samples 990113
chromosome chr3 order 9
genome 491149951 689445510
hilbert 41690 58522
length 16832
individual False
done in: 0.17494726181030273
samples 955772
chromosome chr4 order 9
genome 689445510 879660065
hilbert 58522 74668
length 16146
individual False
done in: 0.17532706260681152
samples 904577
chromosome chr5 order 9
genome 879660065 1061198324
hilbert 74668 90078
length 15410
individual False
done in: 0.15748381614685059
samples 855576
chromosome chr6 order 9
genome 1061198324 1232004303
hilbert 90078 104577
length 14499
individual False
done in: 0.15963435173034668
samples 795694
chromosome chr7 order 9
genome 1232004303 1391350276
hilbert 104577 118103
length 13526
individual False
done in: 0.14009594917297363
samples 731821
chromosome chr8 order 9
genome 1391350276 1536488912
hilbert 118103 130422
len

done in: 0.427440881729126
samples 296868
chromosome chrY order 10
genome 3031042417 3088269832
hilbert 1029145 1048576
length 19431
individual False
done in: 0.14962124824523926
['hilbert_genome_cooler_data/parquet_data/chr1_10.parquet', 'hilbert_genome_cooler_data/parquet_data/chr2_10.parquet', 'hilbert_genome_cooler_data/parquet_data/chr3_10.parquet', 'hilbert_genome_cooler_data/parquet_data/chr4_10.parquet', 'hilbert_genome_cooler_data/parquet_data/chr5_10.parquet', 'hilbert_genome_cooler_data/parquet_data/chr6_10.parquet', 'hilbert_genome_cooler_data/parquet_data/chr7_10.parquet', 'hilbert_genome_cooler_data/parquet_data/chr8_10.parquet', 'hilbert_genome_cooler_data/parquet_data/chr9_10.parquet', 'hilbert_genome_cooler_data/parquet_data/chr10_10.parquet', 'hilbert_genome_cooler_data/parquet_data/chr11_10.parquet', 'hilbert_genome_cooler_data/parquet_data/chr12_10.parquet', 'hilbert_genome_cooler_data/parquet_data/chr13_10.parquet', 'hilbert_genome_cooler_data/parquet_data/chr14_10

done in: 2.1358048915863037
samples 669260
chromosome chr12 order 12
genome 1943767673 2077042982
hilbert 10559637 11283663
length 724026
individual True
done in: 2.17116117477417
samples 575850
chromosome chr13 order 12
genome 2077042982 2191407310
hilbert 11283663 11904955
length 621292
individual True
done in: 1.8100941181182861
samples 536748
chromosome chr14 order 12
genome 2191407310 2298451028
hilbert 11904955 12486476
length 581521
individual True
done in: 1.6642990112304688
samples 512657
chromosome chr15 order 12
genome 2298451028 2400442217
hilbert 12486476 13040550
length 554074
individual True
done in: 1.5954198837280273
samples 451774
chromosome chr16 order 12
genome 2400442217 2490780562
hilbert 13040550 13531318
length 490768
individual True
done in: 1.4310531616210938
samples 405977
chromosome chr17 order 12
genome 2490780562 2574038003
hilbert 13531318 13983619
length 452301
individual True
done in: 1.3089160919189453
samples 390387
chromosome chr18 order 12
genome 25

In [587]:
# Orders 14 through 16. This will take a while: in the recorded output a single
# chromosome takes several minutes at order 16.
for o in range(14, 17):
    for c in genome.chromosomes:
        write_parquet(c, o)
    combine(o)

chromosome chr1 order 14
genome 0 248956422
hilbert 0 21639537
length 21639537
samples 1246254
done in: 55.57538604736328
chromosome chr2 order 14
genome 248956422 491149951
hilbert 21639537 42691237
length 21051700
samples 1215997
done in: 53.453176975250244
chromosome chr3 order 14
genome 491149951 689445510
hilbert 42691237 59927282
length 17236045
samples 990113
done in: 43.61011290550232
chromosome chr4 order 14
genome 689445510 879660065
hilbert 59927282 76460919
length 16533637
samples 955772
done in: 43.18458390235901
chromosome chr5 order 14
genome 879660065 1061198324
hilbert 76460919 92240403
length 15779484
samples 904577
done in: 41.49327898025513
chromosome chr6 order 14
genome 1061198324 1232004303
hilbert 92240403 107087027
length 14846624
samples 855576
done in: 39.69519782066345
chromosome chr7 order 14
genome 1232004303 1391350276
hilbert 107087027 120937536
length 13850509
samples 795694
done in: 36.93603491783142
chromosome chr8 order 14
genome 1391350276 153648891

done in: 554.6817569732666
chromosome chr6 order 16
genome 1061198324 1232004303
hilbert 1475846459 1713392442
length 237545983
samples 855576
done in: 523.9524297714233
chromosome chr7 order 16
genome 1232004303 1391350276
hilbert 1713392442 1935000585
length 221608143
samples 795694
done in: 4193.353150844574
chromosome chr8 order 16
genome 1391350276 1536488912
hilbert 1935000585 2136850077
length 201849492
samples 731821
done in: 446.4800748825073
chromosome chr9 order 16
genome 1536488912 1674883629
hilbert 2136850077 2329320559
length 192470482
samples 706068
done in: 424.7898647785187
chromosome chr10 order 16
genome 1674883629 1808681051
hilbert 2329320559 2515397418
length 186076859
samples 677674
done in: 413.3677101135254
chromosome chr11 order 16
genome 1808681051 1943767673
hilbert 2515397418 2703267215
length 187869797
samples 675033
done in: 416.712571144104
chromosome chr12 order 16
genome 1943767673 2077042982
hilbert 2703267215 2888617953
length 185350738
samples 6692

In [56]:
%timeit
combine(5)

['hilbert_genome_cooler_data/parquet_data/chr1_5.parquet', 'hilbert_genome_cooler_data/parquet_data/chr2_5.parquet', 'hilbert_genome_cooler_data/parquet_data/chr3_5.parquet', 'hilbert_genome_cooler_data/parquet_data/chr4_5.parquet', 'hilbert_genome_cooler_data/parquet_data/chr5_5.parquet', 'hilbert_genome_cooler_data/parquet_data/chr6_5.parquet', 'hilbert_genome_cooler_data/parquet_data/chr7_5.parquet', 'hilbert_genome_cooler_data/parquet_data/chr8_5.parquet', 'hilbert_genome_cooler_data/parquet_data/chr9_5.parquet', 'hilbert_genome_cooler_data/parquet_data/chr10_5.parquet', 'hilbert_genome_cooler_data/parquet_data/chr11_5.parquet', 'hilbert_genome_cooler_data/parquet_data/chr12_5.parquet', 'hilbert_genome_cooler_data/parquet_data/chr13_5.parquet', 'hilbert_genome_cooler_data/parquet_data/chr14_5.parquet', 'hilbert_genome_cooler_data/parquet_data/chr15_5.parquet', 'hilbert_genome_cooler_data/parquet_data/chr16_5.parquet', 'hilbert_genome_cooler_data/parquet_data/chr17_5.parquet', 'hilb

FileNotFoundError: [Errno 2] Failed to open local file 'hilbert_genome_cooler_data/parquet_data/chr1_5.parquet'. Detail: [errno 2] No such file or directory