In [1]:
""" This script excludes possible and observed SNVs which fail gnomAD filters.
""";

In [3]:
# Import modules
import numpy as np
import pandas as pd
from collections import defaultdict

## Filtering
Variants failing various QC, frequency, or coverage criteria are excluded from the set of "possible" SNVs.

Filtering is done on a *per site basis*. Any site with a variant which fails a filter is excluded from the "possible" SNVs set.

This is achieved by filtering on a ("chr", "pos") MultiIndex.

### SNVs failing gnomAD QC

In [3]:
def get_gnomad_failed_sites(path):
    """Return the unique (chr, pos) sites listed in a gnomAD QC-failure table.

    The file at ``path`` is parsed as a headerless, tab-separated table.
    Column names and dtypes come from the module-level ``header`` and
    ``datatypes`` objects, which must exist before this function is called.

    Returns:
        A de-duplicated pandas MultiIndex with levels "chr" and "pos".
    """
    # Parse the table, then discard rows that are exact repeats
    failed_variants = pd.read_csv(
        path, sep="\t", header=None, names=header, dtype=datatypes
    )
    failed_variants = failed_variants.drop_duplicates()

    # Distinct variants can share a site, so de-duplicate again at site level
    site_index = failed_variants.set_index(["chr", "pos"]).index
    return site_index.drop_duplicates()

### SNVs at MAF >0.001

In [4]:
def get_common_variants(path):
    """Return the unique (chr, pos) sites carrying a variant with AF > 0.001.

    The file at ``path`` is parsed as a headerless, tab-separated table using
    the module-level ``header`` and ``datatypes`` objects. Allele frequency is
    computed per row as ``ac / an``; rows at or below the threshold are dropped.

    Returns:
        A de-duplicated pandas MultiIndex with levels "chr" and "pos".
    """
    variants = pd.read_csv(
        path, sep="\t", header=None, names=header, dtype=datatypes
    )
    variants = variants.drop_duplicates()

    # Allele frequency = allele count / allele number
    allele_freq = variants["ac"] / variants["an"]
    common_variants = variants.loc[allele_freq > 0.001]

    # Only the sites are needed downstream, not the individual variants
    return common_variants.set_index(["chr", "pos"]).index.drop_duplicates()

### Coverage
The gnomAD coverage data is very large.
There are a number of measures to limit memory overheads and optimise performance:
- Read the coverage data in chunks.
- chr and pos are set as a multi-index.
- Duplicate index elements are dropped.
- Inner join (using multi-indices) all SNVs on each chunk of the coverage data.

In [5]:
def get_covered_sites(path, snvs, chunksize=10_000_000):
    """Get the SNV sites that are present in a coverage table.

    The coverage file is a headerless, tab-separated table of (chr, pos)
    pairs. It is streamed in chunks to limit memory use, and each chunk is
    inner-joined (on the (chr, pos) MultiIndex) against the de-duplicated
    SNV sites.

    Args:
        path: Path to the coverage table.
        snvs: DataFrame indexed by a ("chr", "pos") MultiIndex.
        chunksize: Number of coverage rows to read per chunk. Defaults to
            10,000,000.

    Returns:
        DataFrame with one row per SNV site found in the coverage table,
        keeping the columns of ``snvs`` (the first row seen for each site).

    Note:
        The original author reports a run time of about 30 minutes on the
        full gnomAD coverage file.
    """
    # Join performance is substantially better with non-duplicated indices
    snv_sites = snvs[~snvs.index.duplicated()]

    # The chunked reader holds an open file handle; the context manager
    # guarantees it is closed even if a join raises part-way through.
    with pd.read_csv(
        path,
        sep="\t",
        header=None,
        names=["chr", "pos"],
        dtype={"chr": "str", "pos": "int32"},
        chunksize=chunksize,
        index_col=["chr", "pos"],
    ) as reader:
        covered_chunks = [snv_sites.join(chunk, how="inner") for chunk in reader]

    return pd.concat(covered_chunks)

### SNVs of interest

In [1]:
def get_snvs(path, names):
    """Load SNVs from a headerless, tab-separated VCF-like file.

    Content following a "#" is treated as a comment and skipped. Only the
    "chr", "pos", "ref" and "alt" columns are kept. Dtypes come from the
    module-level ``datatypes`` object.

    Args:
        path: Path to the SNV file.
        names: Column names to assign to the file's columns, in order.

    Returns:
        DataFrame of unique (chr, pos, ref, alt) rows, indexed by a
        ("chr", "pos") MultiIndex.
    """
    wanted_columns = ["chr", "pos", "ref", "alt"]
    variants = pd.read_csv(
        path,
        sep="\t",
        comment="#",
        header=None,
        names=names,
        usecols=wanted_columns,
        dtype=datatypes,
    )
    unique_variants = variants.drop_duplicates()
    return unique_variants.set_index(["chr", "pos"])

### Apply filters

In [7]:
def apply_filters(snvs):
    """Apply the gnomAD quality, frequency and coverage filters to SNVs.

    Filtering is per site: a row is dropped when its (chr, pos) index entry
    is in the module-level ``fail_sites`` or ``common_sites``, or is absent
    from the index of the module-level ``covered_sites``. All three must be
    defined before this function is called. The number of rows remaining is
    printed after each step.

    Args:
        snvs: DataFrame indexed by a ("chr", "pos") MultiIndex.

    Returns:
        The rows of ``snvs`` that pass all three filters.
    """
    remaining = snvs
    print(f"Total SNVs: {len(remaining)}")

    # Exclude low-quality sites
    passes_qc = ~remaining.index.isin(fail_sites)
    remaining = remaining.loc[passes_qc]
    print(f"Quality filter: {len(remaining)} remaining")

    # Exclude common allele sites
    is_rare = ~remaining.index.isin(common_sites)
    remaining = remaining.loc[is_rare]
    print(f"Frequency filter: {len(remaining)} remaining")

    # Exclude sites with coverage <30 or >32 (i.e. sites not in covered_sites)
    is_covered = remaining.index.isin(covered_sites.index)
    remaining = remaining.loc[is_covered]
    print(f"Coverage filter: {len(remaining)} remaining")

    return remaining

### Reformat to VCF and write out

In [9]:
def write_vcf(snvs, output_path):
    """Reformat SNVs into eight VCF-style columns and write them to disk.

    The (chr, pos) index is moved back into columns, and the ID, QUAL, FILTER
    and INFO columns are filled with the "." placeholder. The table is written
    tab-separated, without a header row or index.

    Args:
        snvs: DataFrame indexed by ("chr", "pos") with "ref" and "alt" columns.
        output_path: Destination file path.

    Returns:
        The reformatted DataFrame that was written.
    """
    vcf_columns = ["chr", "pos", "ID", "ref", "alt", "QUAL", "FILTER", "INFO"]
    placeholders = {"ID": ".", "QUAL": ".", "FILTER": ".", "INFO": "."}

    vcf = snvs.reset_index().assign(**placeholders)
    vcf = vcf[vcf_columns]

    vcf.to_csv(output_path, sep="\t", header=False, index=False)
    return vcf

### Main

In [9]:
if __name__ == "__main__":

    # Column names and dtypes shared (as module-level globals) by the reader
    # functions: every column is a string unless listed explicitly.
    header = ["chr", "pos", "id", "ref", "alt", "qual", "filter", "info", "ac", "an"]
    datatypes = defaultdict(
        lambda: "str", {"pos": np.int32, "ac": np.int32, "an": np.int32}
    )

    # Input paths
    poss_snvs = "../outputs/cds_all_possible_snvs.vcf"
    obs_snvs = "/re_gecip/enhanced_interpretation/AlexBlakes/gene_terminus_variant_constraint/outputs/gnomad/snvs_gnomad_cds.vcf"
    coverage_path = "../outputs/gnomad_3.1.1_coverage_30_32.tsv"

    # Output paths
    poss_out = "../outputs/cds_all_possible_snvs_filtered.vcf"
    obs_out = "../outputs/cds_all_observed_snvs_filtered.vcf"

    # Sites to filter on; apply_filters reads these as module-level globals.
    # Common variants are identified from the observed gnomAD SNVs.
    fail_sites = get_gnomad_failed_sites("../outputs/gnomad/gnomad_cds_snvs_failed.tsv")
    common_sites = get_common_variants(obs_snvs)

    # (input path, output path, column names) for each SNV set. The possible
    # SNVs file is read without the trailing "ac" and "an" columns.
    jobs = [
        (poss_snvs, poss_out, header[:-2]),
        (obs_snvs, obs_out, header),
    ]

    # SNV filtering
    for in_path, out_path, column_names in jobs:
        snvs = get_snvs(path=in_path, names=column_names)
        # covered_sites is also read by apply_filters as a module-level global
        covered_sites = get_covered_sites(coverage_path, snvs=snvs)
        snvs = write_vcf(apply_filters(snvs), out_path)

Total SNVs: 96953715
Quality filter: 95979939 remaining
Frequency filter: 95417079 remaining
Coverage filter: 68852169 remaining
Total SNVs: 6090156
Quality filter: 6046180 remaining
Frequency filter: 5828523 remaining
Coverage filter: 4100545 remaining
