## Fetch VCF and index

In [13]:
import sgkit
! mkdir -p data
! test ! -e data/chr22.vcf.gz && wget -O data/chr22.vcf.gz http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.filtered.shapeit2-duohmm-phased.vcf.gz

In [15]:
! test ! -e data/chr22.vcf.gz.tbi && tabix -f -p vcf data/chr22.vcf.gz

## Fetch ancestral alleles and index

In [16]:
! test ! -e data/ancestral_alleles.tar.gz && wget -O data/ancestral_alleles.tar.gz  ftp://ftp.ensembl.org/pub/release-100/fasta/ancestral_alleles/homo_sapiens_ancestor_GRCh38.tar.gz

Will not apply HSTS. The HSTS database must be a regular and non-world-writable file.
ERROR: could not open HSTS store at '/home/benj/.wget-hsts'. HSTS will be disabled.
--2023-01-19 00:40:39--  ftp://ftp.ensembl.org/pub/release-100/fasta/ancestral_alleles/homo_sapiens_ancestor_GRCh38.tar.gz
           => ‘data/ancestral_alleles.tar.gz’
Resolving ftp.ensembl.org (ftp.ensembl.org)... 193.62.193.139
Connecting to ftp.ensembl.org (ftp.ensembl.org)|193.62.193.139|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/release-100/fasta/ancestral_alleles ... done.
==> SIZE homo_sapiens_ancestor_GRCh38.tar.gz ... 852605016
==> PASV ... done.    ==> RETR homo_sapiens_ancestor_GRCh38.tar.gz ... done.
Length: 852605016 (813M) (unauthoritative)


2023-01-19 00:43:12 (5.33 MB/s) - ‘data/ancestral_alleles.tar.gz’ saved [852605016]



In [17]:
! cd data && tar -xzf ancestral_alleles.tar.gz

In [18]:
! samtools faidx data/homo_sapiens_ancestor_GRCh38/homo_sapiens_ancestor_22.fa

## Convert VCF to an sgkit dataset

In [1]:
%%time
import sgkit as sg
import xarray as xr
import numpy as np
import dask.array as da
from sgkit.io.vcf import vcf_to_zarr

CPU times: user 14.2 s, sys: 393 ms, total: 14.6 s
Wall time: 16.1 s


In [30]:
%%time
vcf_to_zarr("data/chr22.vcf.gz", "data/chr22.zarr")

CPU times: user 7min 40s, sys: 22.8 s, total: 8min 3s
Wall time: 2min 18s


# Load ancestral states from fasta and save to dataset

In [31]:
import pysam
import sys
fasta = pysam.FastaFile("data/homo_sapiens_ancestor_GRCh38/homo_sapiens_ancestor_22.fa")
# NB! We put in an extra character at the start to convert to 1 based coords.
codec = 'utf-32-le' if sys.byteorder == 'little' else 'utf-32-be'
ancestral_sequence = "X" + fasta.fetch(reference=fasta.references[0])
ancestral_sequence = np.frombuffer(bytearray(ancestral_sequence,codec), dtype="U1")
# From the ancestral states README:
# The convention for the sequence is:
#    ACTG : high-confidence call, ancestral state supported by other 2 sequences
#    actg : low-confidence call, ancestral state supported by one sequence only
#    N    : failure, the ancestral state is not supported by any other sequence
#    -    : the extant species contains an insertion at this position
#    .    : no coverage in the alignment
ds = sg.load_dataset("data/chr22.zarr")
ancestral_states = ancestral_sequence[ds['variant_position'].values]
ancestral_states = xr.DataArray(data=ancestral_states, dims=["variants"], name="variant_ancestral_state")
print("Seen states:")
for val, count in zip(*np.unique(ancestral_states, return_counts=True)):
    print(val, count)
ds.update({"variant_ancestral_state": ancestral_states})
sg.save_dataset(ds.drop_vars(set(ds.data_vars) - {"variant_ancestral_state"}), "data/chr22.zarr", mode="a")

Seen states:
- 51535
. 141300
A 130150
C 252721
G 236782
N 8216
T 127890
a 21735
c 40884
g 37531
t 21657


## Create a mask of sites that have bad ancestral states

In [32]:
ds = sg.load_dataset("data/chr22.zarr")
wanted_variants = da.logical_and(ds['variant_ancestral_state'] != '-',
                     da.logical_and(ds['variant_ancestral_state'] != '.', ds['variant_ancestral_state'] != 'N'))
wanted_variants = wanted_variants.chunk((10000,))
ds.update({"variant_bad_ancestral_mask": xr.DataArray(data=wanted_variants, dims=["variants"], name="variant_bad_ancestral_mask")})
sg.save_dataset(ds.drop_vars(set(ds.data_vars) - {"variant_bad_ancestral_mask"}), "data/chr22.zarr", mode="a")
print(f"{da.sum(~wanted_variants).compute()} sites masked out for bad ancestral state")
assert set(np.unique(ds['variant_ancestral_state'][wanted_variants])) == {'A', 'C', 'G', 'T', 'a', 'c', 'g', 't'}

201051 sites masked out for bad ancestral state


## Create a mask of duplicate positions

In [33]:
ds = sg.load_dataset("data/chr22.zarr")
pos = ds['variant_position']
pos_shift_left = da.full_like(pos,-1)
pos_shift_left[0:-1] = pos[1:]
pos_shift_right = da.full_like(pos,-1)
pos_shift_right[1:] = pos[:-1]
wanted_variants = da.logical_and(pos != pos_shift_left, pos != pos_shift_right)
ds.update({"variant_duplicate_position_mask": xr.DataArray(data=wanted_variants, dims=["variants"], name="variant_duplicate_position_mask")})
sg.save_dataset(ds.drop_vars(set(ds.data_vars) - {"variant_duplicate_position_mask"}), "data/chr22.zarr", mode="a")
print(f"{da.sum(~wanted_variants).compute()} sites masked out for duplicate position")

99504 sites masked out for duplicate position


## Create the combined mask

In [34]:
## Create the combined mask
ds = sg.load_dataset("data/chr22.zarr")
wanted_variants = da.logical_and(ds['variant_duplicate_position_mask'], ds['variant_bad_ancestral_mask'])
ds.update({"variant_mask": xr.DataArray(data=wanted_variants, dims=["variants"], name="variant_mask")})
sg.save_dataset(ds.drop_vars(set(ds.data_vars) - {"variant_mask"}), "data/chr22.zarr", mode="a")
print(f"{da.sum(~wanted_variants).compute()} sites masked out")

<xarray.DataArray 'variant_duplicate_position_mask' (variants: 1070401)>
dask.array<open_dataset-b70e6d3b582f66d0cb6ee07e93ae7bd5variant_duplicate_position_mask, shape=(1070401,), dtype=bool, chunksize=(10000,), chunktype=numpy.ndarray>
Dimensions without coordinates: variants <xarray.DataArray 'variant_bad_ancestral_mask' (variants: 1070401)>
dask.array<open_dataset-b70e6d3b582f66d0cb6ee07e93ae7bd5variant_bad_ancestral_mask, shape=(1070401,), dtype=bool, chunksize=(10000,), chunktype=numpy.ndarray>
Dimensions without coordinates: variants
276599 sites masked out


In [35]:
ds = sg.load_dataset("data/chr22.zarr")

## Take a subset of the samples for testing

In [36]:
ds = sg.load_dataset("data/chr22.zarr")
wanted_samples = np.zeros((ds.sizes['samples'],), dtype=bool)
wanted_samples[:100] = True
ds = ds.sel(samples=wanted_samples)
sg.save_dataset(ds, "data/chr22.subset.zarr")

ContainsGroupError: path '' contains a group

In [2]:
%%time
import tsinfer
sampledata = tsinfer.SgkitSampleData("data/chr22.subset.zarr")
inf_ts = tsinfer.infer(sampledata, num_threads=4, progress_monitor=True)

ga-add   (1/6)100%|██████████| 971k/971k [00:22, 42.8kit/s] 
ga-gen   (2/6)100%|██████████| 77.0k/77.0k [06:16, 204it/s]
ma-match (3/6)100%|█████████▉| 77.0k/77.0k [02:33, 408it/s]
ms-muts  (4/6)  0%|          | 0.00/155k [00:00, ?it/s][A
ms-muts  (4/6)  0%|          | 1.00/155k [00:00, 2.28it/s][A
ms-muts  (4/6)  3%|▎         | 5.18k/155k [00:00, 9.69kit/s][A
ms-muts  (4/6)  7%|▋         | 10.7k/155k [00:00, 17.0kit/s][A
ms-muts  (4/6) 11%|█         | 16.4k/155k [00:00, 22.5kit/s][A
ms-muts  (4/6) 14%|█▍        | 22.3k/155k [00:00, 27.0kit/s][A
ms-muts  (4/6) 18%|█▊        | 28.1k/155k [00:00, 30.3kit/s][A
ms-muts  (4/6) 22%|██▏       | 34.2k/155k [00:01, 33.5kit/s][A
ms-muts  (4/6) 26%|██▌       | 40.5k/155k [00:01, 36.2kit/s][A
ms-muts  (4/6) 30%|███       | 46.6k/155k [00:01, 38.3kit/s][A
ms-muts  (4/6) 34%|███▍      | 52.8k/155k [00:01, 40.1kit/s][A
ms-muts  (4/6) 38%|███▊      | 59.0k/155k [00:01, 41.8kit/s][A
ms-muts  (4/6) 42%|████▏     | 65.3k/155k [00:01, 43.2kit/

CPU times: user 19min 21s, sys: 10.8 s, total: 19min 32s
Wall time: 10min 45s


In [3]:
inf_ts.num_sites

970897

In [14]:
inf_ts

Tree Sequence,Unnamed: 1
Trees,81252
Sequence Length,50807930.0
Time Units,uncalibrated
Sample Nodes,200
Total Size,79.9 MiB
Metadata,No Metadata

Table,Rows,Size,Has Metadata
Edges,544403,16.6 MiB,
Individuals,100,5.2 KiB,✅
Migrations,0,8 Bytes,
Mutations,196684,7.0 MiB,
Nodes,109304,5.4 MiB,✅
Populations,0,24 Bytes,
Provenances,1,579 Bytes,
Sites,970897,46.8 MiB,✅
