## Fetch VCF and index

In [13]:
# Create the local data directory (no error if it already exists).
! mkdir -p data
# Download the 1000 Genomes high-coverage phased chr22 VCF and store it under a
# short local name; -O writes the response to the given path.
! wget -O data/chr22.vcf.gz http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.filtered.shapeit2-duohmm-phased.vcf.gz

Will not apply HSTS. The HSTS database must be a regular and non-world-writable file.
ERROR: could not open HSTS store at '/home/benj/.wget-hsts'. HSTS will be disabled.
--2023-01-09 15:32:07--  http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/CCDG_14151_B01_GRM_WGS_2020-08-05_chr22.filtered.shapeit2-duohmm-phased.vcf.gz
Resolving ftp.1000genomes.ebi.ac.uk (ftp.1000genomes.ebi.ac.uk)... 193.62.193.140
Connecting to ftp.1000genomes.ebi.ac.uk (ftp.1000genomes.ebi.ac.uk)|193.62.193.140|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 519930289 (496M) [application/x-gzip]
Saving to: ‘data/chr22.vcf.gz’


2023-01-09 15:37:59 (1.41 MB/s) - ‘data/chr22.vcf.gz’ saved [519930289/519930289]



In [6]:
# Build a tabix index for the downloaded VCF; -p vcf selects the VCF preset and
# -f overwrites any existing index so the cell can be re-run.
! tabix -f -p vcf data/chr22.vcf.gz

## Fetch ancestral alleles and index

In [9]:
# Download the Ensembl release-100 GRCh38 human ancestral allele archive into
# data/ (the directory is created by the VCF download cell above).
! wget -O data/ancestral_alleles.tar.gz  ftp://ftp.ensembl.org/pub/release-100/fasta/ancestral_alleles/homo_sapiens_ancestor_GRCh38.tar.gz

Will not apply HSTS. The HSTS database must be a regular and non-world-writable file.
ERROR: could not open HSTS store at '/home/benj/.wget-hsts'. HSTS will be disabled.
--2022-12-19 16:12:04--  ftp://ftp.ensembl.org/pub/release-100/fasta/ancestral_alleles/homo_sapiens_ancestor_GRCh38.tar.gz
           => ‘data/ancestral_alleles.tar.gz’
Resolving ftp.ensembl.org (ftp.ensembl.org)... 193.62.193.139
Connecting to ftp.ensembl.org (ftp.ensembl.org)|193.62.193.139|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/release-100/fasta/ancestral_alleles ... done.
==> SIZE homo_sapiens_ancestor_GRCh38.tar.gz ... 852605016
==> PASV ... done.    ==> RETR homo_sapiens_ancestor_GRCh38.tar.gz ... done.
Length: 852605016 (813M) (unauthoritative)


2022-12-19 16:14:46 (5.02 MB/s) - ‘data/ancestral_alleles.tar.gz’ saved [852605016]



In [12]:
# Unpack the archive inside data/; the next cell expects it to produce the
# homo_sapiens_ancestor_GRCh38/ directory of per-chromosome FASTA files.
! cd data && tar -xzf ancestral_alleles.tar.gz

In [13]:
# Index the chromosome 22 ancestral FASTA so it can be accessed by region.
! samtools faidx data/homo_sapiens_ancestor_GRCh38/homo_sapiens_ancestor_22.fa

## Convert VCF to an sgkit dataset

In [3]:
%%time
import sgkit as sg
import dask.array as da
from sgkit.io.vcf import vcf_to_zarr

CPU times: user 13.2 s, sys: 798 ms, total: 14 s
Wall time: 13.8 s


In [14]:
%%time
# Convert the chr22 VCF into an sgkit-compatible zarr store on disk.
# The recorded run took roughly 2.5 minutes of wall time.
vcf_to_zarr("data/chr22.vcf.gz", "data/chr22.zarr")

CPU times: user 8min 15s, sys: 33.2 s, total: 8min 48s
Wall time: 2min 24s


## Remove sites with duplicate positions

In [6]:
# Drop every site whose position is shared with a neighbouring site, then write
# the filtered dataset to a new zarr store.
ds = sg.load_dataset("data/chr22.zarr")
pos = ds['variant_position']
# pos_shift_left[i] holds the position of site i+1; the last entry keeps the -1
# sentinel because it has no successor.
pos_shift_left = da.full_like(pos,-1)
pos_shift_left[0:-1] = pos[1:]
# pos_shift_right[i] holds the position of site i-1; the first entry keeps the -1
# sentinel because it has no predecessor.
pos_shift_right = da.full_like(pos,-1)
pos_shift_right[1:] = pos[:-1]
# Keep a site only if its position differs from both neighbours, so ALL copies of
# a duplicated position are removed (not just the extras).
# NOTE(review): this only detects duplicates that are adjacent, i.e. it assumes
# variant_position is sorted — confirm for the input VCF.
wanted_variants = da.logical_and(pos != pos_shift_left, pos != pos_shift_right)
ds = ds.sel(variants=wanted_variants)
# Subsetting the variants causes unequal chunk sizes, so we need to rechunk for zarr.
# (The original comment trailed off at an "sgkit/" reference — TODO restore the link.)
# Each dimension is rechunked uniformly to the largest chunk size it currently has.
ds = ds.chunk(chunks={dim: max(chunks) for dim, chunks in ds.chunks.items()})
sg.save_dataset(ds, "data/chr22.remove_dups.zarr")

In [None]:
%%time
# Run tsinfer on the de-duplicated sgkit dataset using 4 threads, showing
# progress bars for each inference stage.
# NOTE(review): the recorded output of this cell prints "No sites used for
# inference". The ancestral alleles downloaded earlier are never attached to the
# dataset in the cells shown here, which may be the cause — confirm before
# relying on inf_ts.
import tsinfer
sampledata = tsinfer.SgkitSampleData("data/chr22.remove_dups.zarr")
inf_ts = tsinfer.infer(sampledata, num_threads=4, progress_monitor=True)

ga-add   (1/6)100%|██████████| 971k/971k [00:35, 27.1kit/s] 
No sites used for inference
ma-match (2/6)  0%|          | 0.00/0.00 [00:00, ?it/s]
No sites used for inference
ms-muts  (3/6)  0%|          | 0.00/0.00 [00:00, ?it/s]
ms-xsites (4/6)100%|█████████▉| 971k/971k [10:22, 1.60kit/s] 