In [2]:
# download accessibility HDF5 from FTP
# Make sure the local download directory exists before fetching into it.
!mkdir -pv ../data/external
# --no-clobber skips the transfer when the target file is already present,
# which keeps re-running this cell cheap.
# NOTE(review): the skip is based only on the file existing, so a partial file
# left by an interrupted transfer would presumably be kept — verify the file if
# a download was ever cut short.
!wget --no-clobber ftp://ngs.sanger.ac.uk/production/ag1000g/phase1/AR3/accessibility/accessibility.h5 -O ../data/external/accessibility.h5

File ‘../data/external/accessibility.h5’ already there; not retrieving.


In [3]:
import h5py
import zarr

# Destination Zarr store on local disk. mode='w' creates the store and
# replaces anything already at this path, so this cell restarts the
# conversion from scratch each time it runs.
dest = zarr.open('../data/external/accessibility.zarr', mode='w')

# Source HDF5 file downloaded above, opened read-only.
# The handle stays open because later cells read from `source`.
source = h5py.File('../data/external/accessibility.h5', mode='r')

In [7]:
# Show the group/dataset hierarchy of the HDF5 source (names, shapes, dtypes)
# so it is clear what is about to be copied.
print(zarr.tree(source))

/
 ├── 2L
 │   ├── coverage (49364325,) int64
 │   ├── coverage_mq0 (49364325,) int64
 │   ├── filter_dust (49364325,) bool
 │   ├── filter_high_coverage (49364325,) bool
 │   ├── filter_high_mq0 (49364325,) bool
 │   ├── filter_low_coverage (49364325,) bool
 │   ├── filter_low_mq (49364325,) bool
 │   ├── filter_n (49364325,) bool
 │   ├── filter_no_coverage (49364325,) bool
 │   ├── high_coverage (49364325,) int32
 │   ├── high_mq0 (49364325,) int32
 │   ├── is_accessible (49364325,) bool
 │   ├── low_coverage (49364325,) int32
 │   ├── low_mq (49364325,) int32
 │   ├── low_pairing (49364325,) int32
 │   ├── no_coverage (49364325,) int32
 │   ├── pos (49364325,) int64
 │   ├── ref (49364325,) |S1
 │   ├── ref_masked (49364325,) bool
 │   ├── ref_n (49364325,) bool
 │   ├── repeat_dust (49364325,) bool
 │   ├── repeat_repeatmask (49364325,) bool
 │   └── repeat_trf (49364325,) bool
 ├── 2R
 │   ├── coverage (61545105,) int64
 │   ├── coverage_mq0 (61545105,) int64
 │   ├── filter_dust

In [8]:
import numcodecs
import sys

In [9]:
# Recompress everything with Blosc + Zstandard at level 5.
# AUTOSHUFFLE is the named constant for the previous literal -1: Blosc uses
# bit-shuffle for 1-byte items and byte-shuffle otherwise.
compressor = numcodecs.Blosc(
    cname='zstd',
    clevel=5,
    shuffle=numcodecs.Blosc.AUTOSHUFFLE,
)

# Copy every group and dataset from the HDF5 source into the Zarr store,
# splitting each 1-D array into chunks of 10 million elements.
# if_exists='replace' overwrites anything already in the destination.
# Progress is logged to stdout; the call is the last expression so that its
# return value, (n_copied, n_skipped, n_bytes_copied), is shown as cell output.
zarr.copy_all(
    source,
    dest,
    dry_run=False,
    if_exists='replace',
    compressor=compressor,
    chunks=10_000_000,
    log=sys.stdout,
)

copy /2L
copy /2L/coverage (49364325,) int64
copy /2L/coverage_mq0 (49364325,) int64
copy /2L/filter_dust (49364325,) bool
copy /2L/filter_high_coverage (49364325,) bool
copy /2L/filter_high_mq0 (49364325,) bool
copy /2L/filter_low_coverage (49364325,) bool
copy /2L/filter_low_mq (49364325,) bool
copy /2L/filter_n (49364325,) bool
copy /2L/filter_no_coverage (49364325,) bool
copy /2L/high_coverage (49364325,) int32
copy /2L/high_mq0 (49364325,) int32
copy /2L/is_accessible (49364325,) bool
copy /2L/low_coverage (49364325,) int32
copy /2L/low_mq (49364325,) int32
copy /2L/low_pairing (49364325,) int32
copy /2L/no_coverage (49364325,) int32
copy /2L/pos (49364325,) int64
copy /2L/ref (49364325,) |S1
copy /2L/ref_masked (49364325,) bool
copy /2L/ref_n (49364325,) bool
copy /2L/repeat_dust (49364325,) bool
copy /2L/repeat_repeatmask (49364325,) bool
copy /2L/repeat_trf (49364325,) bool
copy /2R
copy /2R/coverage (61545105,) int64
copy /2R/coverage_mq0 (61545105,) int64
copy /2R/filter_dust

(168, 0, 16931808222)

In [11]:
# Compare the on-disk size of the original HDF5 file with the converted Zarr store.
!du -hs ../data/external/*

1.8G	../data/external/accessibility.h5
970M	../data/external/accessibility.zarr


In [14]:
# Inspect the chunk files written for one array. With 49,364,325 elements and
# 10,000,000-element chunks, five chunk files (0-4) are expected.
!ls -lh ../data/external/accessibility.zarr/2L/is_accessible/

total 440K
-rw-r--r-- 1 aliman aliman 75K Nov 19 21:42 0
-rw-r--r-- 1 aliman aliman 78K Nov 19 21:42 1
-rw-r--r-- 1 aliman aliman 97K Nov 19 21:42 2
-rw-r--r-- 1 aliman aliman 94K Nov 19 21:42 3
-rw-r--r-- 1 aliman aliman 88K Nov 19 21:42 4


In [15]:
# Consolidate the store's group/array metadata into a single entry so that
# remote readers can discover the hierarchy without one request per array.
zarr.consolidate_metadata('../data/external/accessibility.zarr')

# Store directory name; interpolated into the shell command below via {path}.
path = 'accessibility.zarr/'
# Sync the local store to the release bucket.
# NOTE(review): rsync is invoked without -d, so objects already in the bucket
# that no longer exist locally are presumably left in place — confirm intended.
!gsutil -m rsync -ru ../data/external/{path} gs://ag1000g-release/phase1.AR3/accessibility/{path}

In [17]:
# check via intake catalog

In [18]:
import intake

In [19]:
# Open the MalariaGEN intake catalog describing the data hosted on GCS.
cat = intake.open_catalog('https://malariagen.github.io/intake/gcs.yml')
# Bare expression: display the catalog summary as the cell output.
cat

gcs:
  args:
    path: https://malariagen.github.io/intake/gcs.yml
  description: ''
  driver: intake.catalog.local.YAMLFileCatalog
  metadata:
    version: 1


In [20]:
# List the entry names available under the `ag1` sub-catalog.
list(cat.ag1)

['samples',
 'snps',
 'snps_pass',
 'snps_pass_biallelic',
 'haps',
 'accessibility']

In [21]:
# Open the uploaded accessibility data through the catalog as a Zarr group,
# i.e. the same way a downstream user would access it.
accessibility = cat.ag1.accessibility.to_zarr()
# Bare expression: display the group repr as the cell output.
accessibility

<zarr.hierarchy.Group '/' read-only>

In [22]:
# Print the hierarchy of the remote group; it should mirror the HDF5 source tree.
print(accessibility.tree())

/
 ├── 2L
 │   ├── coverage (49364325,) int64
 │   ├── coverage_mq0 (49364325,) int64
 │   ├── filter_dust (49364325,) bool
 │   ├── filter_high_coverage (49364325,) bool
 │   ├── filter_high_mq0 (49364325,) bool
 │   ├── filter_low_coverage (49364325,) bool
 │   ├── filter_low_mq (49364325,) bool
 │   ├── filter_n (49364325,) bool
 │   ├── filter_no_coverage (49364325,) bool
 │   ├── high_coverage (49364325,) int32
 │   ├── high_mq0 (49364325,) int32
 │   ├── is_accessible (49364325,) bool
 │   ├── low_coverage (49364325,) int32
 │   ├── low_mq (49364325,) int32
 │   ├── low_pairing (49364325,) int32
 │   ├── no_coverage (49364325,) int32
 │   ├── pos (49364325,) int64
 │   ├── ref (49364325,) |S1
 │   ├── ref_masked (49364325,) bool
 │   ├── ref_n (49364325,) bool
 │   ├── repeat_dust (49364325,) bool
 │   ├── repeat_repeatmask (49364325,) bool
 │   └── repeat_trf (49364325,) bool
 ├── 2R
 │   ├── coverage (61545105,) int64
 │   ├── coverage_mq0 (61545105,) int64
 │   ├── filter_dust

In [23]:
# Smoke test: read one complete array back from the remote store
# ([:] loads the whole array into memory).
accessibility['3R/is_accessible'][:]

array([False, False, False, ..., False, False, False])

In [24]:
# Number of chunks of this array that actually hold data in the store.
accessibility['3R/is_accessible'].nchunks_initialized

6

In [25]:
# Upload check: for every array in every chromosome group, the number of
# chunks present in the remote store must equal the number of chunks the
# array is defined to have. Each array is printed before it is checked, so
# the last printed line identifies the culprit if the assertion fails.
for chrom in accessibility:
    chrom_group = accessibility[chrom]
    for name in chrom_group:
        print(chrom, name)
        arr = chrom_group[name]
        assert arr.nchunks == arr.nchunks_initialized

2L coverage
2L coverage_mq0
2L filter_dust
2L filter_high_coverage
2L filter_high_mq0
2L filter_low_coverage
2L filter_low_mq
2L filter_n
2L filter_no_coverage
2L high_coverage
2L high_mq0
2L is_accessible
2L low_coverage
2L low_mq
2L low_pairing
2L no_coverage
2L pos
2L ref
2L ref_masked
2L ref_n
2L repeat_dust
2L repeat_repeatmask
2L repeat_trf
2R coverage
2R coverage_mq0
2R filter_dust
2R filter_high_coverage
2R filter_high_mq0
2R filter_low_coverage
2R filter_low_mq
2R filter_n
2R filter_no_coverage
2R high_coverage
2R high_mq0
2R is_accessible
2R low_coverage
2R low_mq
2R low_pairing
2R no_coverage
2R pos
2R ref
2R ref_masked
2R ref_n
2R repeat_dust
2R repeat_repeatmask
2R repeat_trf
3L coverage
3L coverage_mq0
3L filter_dust
3L filter_high_coverage
3L filter_high_mq0
3L filter_low_coverage
3L filter_low_mq
3L filter_n
3L filter_no_coverage
3L high_coverage
3L high_mq0
3L is_accessible
3L low_coverage
3L low_mq
3L low_pairing
3L no_coverage
3L pos
3L ref
3L ref_masked
3L ref_n
3L 

In [26]:
# Download the allele counts HDF5 file; --no-clobber skips the transfer when
# the target file is already present.
# NOTE(review): the skip is based only on the file existing, so a partial file
# left by an interrupted transfer would presumably be kept — verify the file if
# a download was ever cut short.
!wget --no-clobber https://storage.googleapis.com/ag1000g-release/phase1.AR3/extras/allele_counts.h5 -O ../data/external/allele_counts.h5

File ‘../data/external/allele_counts.h5’ already there; not retrieving.


In [27]:
import h5py
import zarr

# Destination Zarr store on local disk. mode='w' creates the store and
# replaces anything already at this path.
dest = zarr.open('../data/external/allele_counts.zarr', mode='w')

# Source HDF5 file downloaded above, opened read-only.
# NOTE(review): `source` and `dest` are rebound here; no close() of the
# previously opened accessibility.h5 handle is visible in this notebook —
# confirm whether it should be closed first.
source = h5py.File('../data/external/allele_counts.h5', mode='r')

In [28]:
# Show the group/dataset hierarchy of the allele counts HDF5 source
# (names, shapes, dtypes) before copying it.
print(zarr.tree(source))

/
 ├── 2L
 │   ├── AOM (10377280, 4) int32
 │   ├── BFM (10377280, 4) int32
 │   ├── BFS (10377280, 4) int32
 │   ├── CMS (10377280, 4) int32
 │   ├── GAS (10377280, 4) int32
 │   ├── GNS (10377280, 4) int32
 │   ├── GWA (10377280, 4) int32
 │   ├── KES (10377280, 4) int32
 │   ├── UGS (10377280, 4) int32
 │   ├── all (10377280, 4) int32
 │   ├── all_m (10377280, 4) int32
 │   └── all_s (10377280, 4) int32
 ├── 2R
 │   ├── AOM (14080970, 4) int32
 │   ├── BFM (14080970, 4) int32
 │   ├── BFS (14080970, 4) int32
 │   ├── CMS (14080970, 4) int32
 │   ├── GAS (14080970, 4) int32
 │   ├── GNS (14080970, 4) int32
 │   ├── GWA (14080970, 4) int32
 │   ├── KES (14080970, 4) int32
 │   ├── UGS (14080970, 4) int32
 │   ├── all (14080970, 4) int32
 │   ├── all_m (14080970, 4) int32
 │   └── all_s (14080970, 4) int32
 ├── 3L
 │   ├── AOM (9643193, 4) int32
 │   ├── BFM (9643193, 4) int32
 │   ├── BFS (9643193, 4) int32
 │   ├── CMS (9643193, 4) int32
 │   ├── GAS (9643193, 4) int32
 │   ├── GNS (

In [30]:
# Recompress everything with Blosc + Zstandard at level 5.
# AUTOSHUFFLE is the named constant for the previous literal -1: Blosc uses
# bit-shuffle for 1-byte items and byte-shuffle otherwise.
compressor = numcodecs.Blosc(
    cname='zstd',
    clevel=5,
    shuffle=numcodecs.Blosc.AUTOSHUFFLE,
)

# Copy every group and dataset from the HDF5 source into the Zarr store.
# The arrays are 2-D (n_variants, 4): chunk along the first axis in blocks of
# 10 million rows and keep the second axis whole (-1).
# if_exists='replace' overwrites anything already in the destination.
# Progress is logged to stdout; the call is the last expression so that its
# return value, (n_copied, n_skipped, n_bytes_copied), is shown as cell output.
zarr.copy_all(
    source,
    dest,
    dry_run=False,
    if_exists='replace',
    compressor=compressor,
    chunks=(10_000_000, -1),
    log=sys.stdout,
)

copy /2L
copy /2L/AOM (10377280, 4) int32
copy /2L/BFM (10377280, 4) int32
copy /2L/BFS (10377280, 4) int32
copy /2L/CMS (10377280, 4) int32
copy /2L/GAS (10377280, 4) int32
copy /2L/GNS (10377280, 4) int32
copy /2L/GWA (10377280, 4) int32
copy /2L/KES (10377280, 4) int32
copy /2L/UGS (10377280, 4) int32
copy /2L/all (10377280, 4) int32
copy /2L/all_m (10377280, 4) int32
copy /2L/all_s (10377280, 4) int32
copy /2R
copy /2R/AOM (14080970, 4) int32
copy /2R/BFM (14080970, 4) int32
copy /2R/BFS (14080970, 4) int32
copy /2R/CMS (14080970, 4) int32
copy /2R/GAS (14080970, 4) int32
copy /2R/GNS (14080970, 4) int32
copy /2R/GWA (14080970, 4) int32
copy /2R/KES (14080970, 4) int32
copy /2R/UGS (14080970, 4) int32
copy /2R/all (14080970, 4) int32
copy /2R/all_m (14080970, 4) int32
copy /2R/all_s (14080970, 4) int32
copy /3L
copy /3L/AOM (9643193, 4) int32
copy /3L/BFM (9643193, 4) int32
copy /3L/BFS (9643193, 4) int32
copy /3L/CMS (9643193, 4) int32
copy /3L/GAS (9643193, 4) int32
copy /3L/GNS 

(65, 0, 10084983744)

In [31]:
# Consolidate the store's group/array metadata into a single .zmetadata entry
# so that remote readers can discover the hierarchy without one request per array.
zarr.consolidate_metadata('../data/external/allele_counts.zarr')

# Store directory name; interpolated into the shell command below via {path}.
path = 'allele_counts.zarr/'
# Sync the local store to the release bucket.
# NOTE(review): rsync is invoked without -d, so objects already in the bucket
# that no longer exist locally are presumably left in place — confirm intended.
!gsutil -m rsync -ru ../data/external/{path} gs://ag1000g-release/phase1.AR3/extras/{path}

Building synchronization state...
Starting synchronization...
Copying file://../data/external/allele_counts.zarr/.zgroup [Content-Type=application/octet-stream]...
Copying file://../data/external/allele_counts.zarr/.zmetadata [Content-Type=application/octet-stream]...
Copying file://../data/external/allele_counts.zarr/2L/.zattrs [Content-Type=application/octet-stream]...
Copying file://../data/external/allele_counts.zarr/2L/.zgroup [Content-Type=application/octet-stream]...
Copying file://../data/external/allele_counts.zarr/2L/AOM/.zarray [Content-Type=application/octet-stream]...
Copying file://../data/external/allele_counts.zarr/2L/AOM/0.0 [Content-Type=application/octet-stream]...
Copying file://../data/external/allele_counts.zarr/2L/BFM/1.0 [Content-Type=application/octet-stream]...
Copying file://../data/external/allele_counts.zarr/2L/BFM/.zarray [Content-Type=application/octet-stream]...
Copying file://../data/external/allele_counts.zarr/2L/BFM/.zattrs [Content-Type=application/o

In [32]:
# Open the uploaded allele counts through the catalog as a Zarr group.
# NOTE(review): `cat` comes from an earlier cell, and the entry listing shown
# there did not include `allele_counts`; the catalog may need to be re-opened
# for this lookup to work on a fresh run — confirm.
allele_counts = cat.ag1.allele_counts.to_zarr()
# Bare expression: display the group repr as the cell output.
allele_counts

<zarr.hierarchy.Group '/' read-only>

In [33]:
# Print the hierarchy of the remote group; it should mirror the HDF5 source tree.
print(allele_counts.tree())

/
 ├── 2L
 │   ├── AOM (10377280, 4) int32
 │   ├── BFM (10377280, 4) int32
 │   ├── BFS (10377280, 4) int32
 │   ├── CMS (10377280, 4) int32
 │   ├── GAS (10377280, 4) int32
 │   ├── GNS (10377280, 4) int32
 │   ├── GWA (10377280, 4) int32
 │   ├── KES (10377280, 4) int32
 │   ├── UGS (10377280, 4) int32
 │   ├── all (10377280, 4) int32
 │   ├── all_m (10377280, 4) int32
 │   └── all_s (10377280, 4) int32
 ├── 2R
 │   ├── AOM (14080970, 4) int32
 │   ├── BFM (14080970, 4) int32
 │   ├── BFS (14080970, 4) int32
 │   ├── CMS (14080970, 4) int32
 │   ├── GAS (14080970, 4) int32
 │   ├── GNS (14080970, 4) int32
 │   ├── GWA (14080970, 4) int32
 │   ├── KES (14080970, 4) int32
 │   ├── UGS (14080970, 4) int32
 │   ├── all (14080970, 4) int32
 │   ├── all_m (14080970, 4) int32
 │   └── all_s (14080970, 4) int32
 ├── 3L
 │   ├── AOM (9643193, 4) int32
 │   ├── BFM (9643193, 4) int32
 │   ├── BFS (9643193, 4) int32
 │   ├── CMS (9643193, 4) int32
 │   ├── GAS (9643193, 4) int32
 │   ├── GNS (

In [34]:
# Upload check: for every array in every chromosome group, the number of
# chunks present in the remote store must equal the number of chunks the
# array is defined to have. Each array is printed before it is checked, so
# the last printed line identifies the culprit if the assertion fails.
for chrom in allele_counts:
    chrom_group = allele_counts[chrom]
    for name in chrom_group:
        print(chrom, name)
        arr = chrom_group[name]
        assert arr.nchunks == arr.nchunks_initialized

2L AOM
2L BFM
2L BFS
2L CMS
2L GAS
2L GNS
2L GWA
2L KES
2L UGS
2L all
2L all_m
2L all_s
2R AOM
2R BFM
2R BFS
2R CMS
2R GAS
2R GNS
2R GWA
2R KES
2R UGS
2R all
2R all_m
2R all_s
3L AOM
3L BFM
3L BFS
3L CMS
3L GAS
3L GNS
3L GWA
3L KES
3L UGS
3L all
3L all_m
3L all_s
3R AOM
3R BFM
3R BFS
3R CMS
3R GAS
3R GNS
3R GWA
3R KES
3R UGS
3R all
3R all_m
3R all_s
X AOM
X BFM
X BFS
X CMS
X GAS
X GNS
X GWA
X KES
X UGS
X all
X all_m
X all_s
