In [1]:
import hail as hl
from gnomad.resources.grch37.gnomad import public_release as v2_public_release
from gnomad.resources.grch38.gnomad import public_release as v4_public_release

In [None]:
# Count variants below given allele frequency cutoffs (optionally also singletons and doubletons)

In [5]:
def get_variant_count(ht: hl.Table, afs: list[float]=[0.01, 0.001], singletons: bool = False, doubletons: bool = False) -> dict:
    """
    Count PASS variants below each allele frequency cutoff, optionally with singleton and doubleton counts.

    All requested counts are computed in a single aggregation pass over the
    Table instead of one full pass per count.

    .. note:: This function works for gnomAD exomes and genomes datasets, not yet for gnomAD joint dataset, since the HT scheme is slightly different.

    :param ht: Input Table. Must have a `filters` field and a `freq` array
        field whose first element has `AC` and `AF` fields.
    :param afs: List of allele frequency cutoffs. For each cutoff, variants
        with `freq[0].AF` strictly below it are counted. The list is only
        read, never modified.
    :param singletons: Whether to also count singletons (`freq[0].AC == 1`).
    :param doubletons: Whether to also count doubletons (`freq[0].AC == 2`).
    :return: Dictionary mapping a description of each requested count to its
        value, ordered as singletons, doubletons, then cutoffs in `afs` order.
        Empty if nothing was requested.
    """
    # Filter to PASS variants (rows whose `filters` collection is empty).
    ht = ht.filter(hl.len(ht.filters) == 0)

    # NOTE(review): freq[0] is presumably the overall (all-samples) frequency
    # entry of the gnomAD `freq` array -- confirm against the release schema.
    freq = ht.freq[0]

    # Collect every requested aggregator first so they can all be evaluated
    # together; dict insertion order defines the order of the returned keys.
    aggregators = {}
    if singletons:
        aggregators["number of singletons"] = hl.agg.count_where(freq.AC == 1)
    if doubletons:
        aggregators["number of doubletons"] = hl.agg.count_where(freq.AC == 2)
    for af in afs:
        aggregators[f"number of variants with AF < {af}"] = hl.agg.count_where(
            freq.AF < af
        )

    # Nothing requested: skip the (expensive) table scan entirely.
    if not aggregators:
        return {}

    # One pass over the Table computes all counts at once.
    labels = list(aggregators)
    results = ht.aggregate(hl.tuple([aggregators[label] for label in labels]))
    return dict(zip(labels, results))

In [3]:
# Load the gnomAD exomes public release from the GRCh37 resources
# (`v2_public_release`, imported above); used below as the input to
# `get_variant_count`.
v2_ht = v2_public_release("exomes").ht()

Initializing Hail with default parameters...

Reading spark-defaults.conf to determine GCS requester pays configuration. This is deprecated. Please use `hailctl config set gcs_requester_pays/project` and `hailctl config set gcs_requester_pays/buckets`.

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


SPARKMONITOR_LISTENER: Started SparkListener for Jupyter Notebook
SPARKMONITOR_LISTENER: Port obtained from environment: 48351
SPARKMONITOR_LISTENER: Application Started: application_1727440474542_0001 ...Start Time: 1727441640185


Running on Apache Spark version 3.3.2
SparkUI available at http://qh1-m.c.broad-mpg-gnomad.internal:38205
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.130-bea04d9c79b5
LOGGING: writing to /home/hail/hail-20240927-1253-0.2.130-bea04d9c79b5.log


In [6]:
# Default call: PASS variant counts below the default AF cutoffs (0.01 and 0.001).
print(get_variant_count(v2_ht))



{'number of variants with AF < 0.01': 14795986, 'number of variants with AF < 0.001': 14551940}




In [7]:
# Same default AF cutoffs, additionally reporting singleton (AC == 1) and
# doubleton (AC == 2) counts.
print(get_variant_count(v2_ht, singletons=True, doubletons=True))



{'number of singletons': 7763393, 'number of doubletons': 2194502, 'number of variants with AF < 0.01': 14795986, 'number of variants with AF < 0.001': 14551940}
