In [9]:
import intake
import numpy as np
from collections import Counter

In [2]:
# Open the intake catalog published at malariagen.github.io (fetched over HTTPS).
cat = intake.open_catalog('https://malariagen.github.io/intake/gcs.yml')
cat

gcs:
  args:
    path: https://malariagen.github.io/intake/gcs.yml
  description: ''
  driver: intake.catalog.local.YAMLFileCatalog
  metadata:
    version: 1


In [3]:
# Select the `ag1` entry of the catalog and list the names of the sources it contains.
ag1 = cat.ag1
list(ag1)

['samples',
 'snps',
 'snps_pass',
 'snps_pass_biallelic',
 'haps',
 'accessibility',
 'allele_counts_pass']

In [14]:
# Open the `snps_pass` source via `to_zarr()`; it is indexed below as
# snps_pass[chrom][field_path][:].
snps_pass = ag1.snps_pass.to_zarr()

In [19]:
# Keys of `snps_pass` that the counting helpers iterate over.
chroms = ('2R', '2L', '3R', '3L', 'X')

In [20]:
def count_annotations(callset=None, contigs=None):
    """Tally the values of ``variants/ANN_Annotation`` across chromosomes.

    Parameters
    ----------
    callset : mapping, optional
        Object indexed as ``callset[chrom]['variants/ANN_Annotation'][:]``.
        Defaults to the notebook-level ``snps_pass``.
    contigs : iterable of str, optional
        Keys of ``callset`` to aggregate over. Defaults to the notebook-level
        ``chroms``.

    Returns
    -------
    collections.Counter
        Occurrences of each annotation value, summed over all ``contigs``.
        An empty ``Counter`` (rather than ``None``) is returned when
        ``contigs`` is empty, so callers can always call ``most_common()``.
    """
    if callset is None:
        callset = snps_pass
    if contigs is None:
        contigs = chroms
    totals = Counter()
    for chrom in contigs:
        # ``[:]`` reads the whole field for this chromosome before counting.
        totals.update(callset[chrom]['variants/ANN_Annotation'][:])
    return totals

In [25]:
# Count annotation values over all chromosomes in `chroms` and list them from
# most to least frequent.
ca = count_annotations()
ca.most_common()

[('intergenic_region', 21932312),
 ('intron_variant', 10893831),
 ('upstream_gene_variant', 8960291),
 ('downstream_gene_variant', 5012563),
 ('synonymous_variant', 2574257),
 ('missense_variant', 1794285),
 ('3_prime_UTR_variant', 480546),
 ('5_prime_UTR_variant', 325007),
 ('intragenic_variant', 280084),
 ('splice_region_variant&intron_variant', 140048),
 ('5_prime_UTR_premature_start_codon_gain_variant', 60784),
 ('splice_region_variant&synonymous_variant', 20601),
 ('stop_gained', 15478),
 ('missense_variant&splice_region_variant', 15246),
 ('splice_region_variant', 7579),
 ('splice_donor_variant&intron_variant', 3888),
 ('splice_acceptor_variant&intron_variant', 3055),
 ('start_lost', 1424),
 ('splice_region_variant&stop_retained_variant', 1320),
 ('stop_retained_variant', 1152),
 ('stop_lost&splice_region_variant', 907),
 ('stop_lost', 639),
 ('stop_gained&splice_region_variant', 266),
 ('initiator_codon_variant', 246),
 ('splice_acceptor_variant&splice_region_variant&intron_vari

In [31]:
# Split compound annotations ('a&b') into their components and credit the full
# count to each one, so a compound entry contributes to several totals.
ca_clean = Counter()
for annotation, count in ca.items():
    # str.split returns [annotation] unchanged when it contains no '&'.
    for term in annotation.split('&'):
        ca_clean[term] += count
ca_clean.most_common()

[('intergenic_region', 21932312),
 ('intron_variant', 11040938),
 ('upstream_gene_variant', 8960291),
 ('downstream_gene_variant', 5012563),
 ('synonymous_variant', 2594858),
 ('missense_variant', 1809532),
 ('3_prime_UTR_variant', 480546),
 ('5_prime_UTR_variant', 325007),
 ('intragenic_variant', 280084),
 ('splice_region_variant', 186102),
 ('5_prime_UTR_premature_start_codon_gain_variant', 60784),
 ('stop_gained', 15744),
 ('splice_donor_variant', 3946),
 ('splice_acceptor_variant', 3117),
 ('stop_retained_variant', 2472),
 ('stop_lost', 1546),
 ('start_lost', 1444),
 ('initiator_codon_variant', 257),
 ('non_canonical_start_codon', 10)]

In [35]:
# Emit one LaTeX table row per annotation: underscores are backslash-escaped
# and counts are printed with thousands separators.
latex_rows = [(k.replace("_", "\\_"), v) for k, v in ca_clean.most_common()]
for a, v in latex_rows:
    print(f'{a} & {v:,} \\\\')

intergenic\_region & 21,932,312 \\
intron\_variant & 11,040,938 \\
upstream\_gene\_variant & 8,960,291 \\
downstream\_gene\_variant & 5,012,563 \\
synonymous\_variant & 2,594,858 \\
missense\_variant & 1,809,532 \\
3\_prime\_UTR\_variant & 480,546 \\
5\_prime\_UTR\_variant & 325,007 \\
intragenic\_variant & 280,084 \\
splice\_region\_variant & 186,102 \\
5\_prime\_UTR\_premature\_start\_codon\_gain\_variant & 60,784 \\
stop\_gained & 15,744 \\
splice\_donor\_variant & 3,946 \\
splice\_acceptor\_variant & 3,117 \\
stop\_retained\_variant & 2,472 \\
stop\_lost & 1,546 \\
start\_lost & 1,444 \\
initiator\_codon\_variant & 257 \\
non\_canonical\_start\_codon & 10 \\


In [37]:
# "Coding" here is the sum of the synonymous and missense counts.
coding = sum(
    ca_clean[effect] for effect in ('synonymous_variant', 'missense_variant')
)
coding

4404390

In [39]:
# Fraction of all counted SNPs that are synonymous or missense.
# The denominator was previously hard-coded as 52525957; it is now derived
# from `ca`, which holds one count per element of the annotation arrays.
# `ca_clean` must not be used here: split compound annotations are counted
# more than once in it, so its total overstates the number of SNPs.
coding / sum(ca.values())

0.08385168498691038

In [40]:
# Share of the coding total (synonymous + missense) that is synonymous.
ca_clean['synonymous_variant'] / coding

0.5891526408878415

In [41]:
# Share of the coding total (synonymous + missense) that is missense.
ca_clean['missense_variant'] / coding

0.41084735911215853

In [26]:
def count_annotation_impacts(callset=None, contigs=None):
    """Tally the values of ``variants/ANN_Annotation_Impact`` across chromosomes.

    Parameters
    ----------
    callset : mapping, optional
        Object indexed as
        ``callset[chrom]['variants/ANN_Annotation_Impact'][:]``.
        Defaults to the notebook-level ``snps_pass``.
    contigs : iterable of str, optional
        Keys of ``callset`` to aggregate over. Defaults to the notebook-level
        ``chroms``.

    Returns
    -------
    collections.Counter
        Occurrences of each impact value, summed over all ``contigs``.
        An empty ``Counter`` (rather than ``None``) is returned when
        ``contigs`` is empty, so callers can always call ``most_common()``.
    """
    if callset is None:
        callset = snps_pass
    if contigs is None:
        contigs = chroms
    totals = Counter()
    for chrom in contigs:
        # ``[:]`` reads the whole field for this chromosome before counting.
        totals.update(callset[chrom]['variants/ANN_Annotation_Impact'][:])
    return totals

In [27]:
# Count annotation-impact values over all chromosomes in `chroms` and list
# them from most to least frequent.
cai = count_annotation_impacts()
cai.most_common()

[('MODIFIER', 47884634),
 ('LOW', 2806020),
 ('MODERATE', 1809532),
 ('HIGH', 25771)]