In [None]:
""" Find coverage statistics for all coding sites of interest in gnomAD v3.1.1.
These data will be used to inform the model of expected variants per transcript.
""";

In [None]:
# Import modules
import numpy as np
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Column names assigned to the VCF fields, in file order.
header = "chr pos id ref alt qual filter info".split()

# Per-column dtypes handed to pandas: the names given explicitly are parsed
# as 32-bit integers, and any other column name falls back to "str".
datatypes = defaultdict(lambda: "str", pos=np.int32, ac=np.int32, an=np.int32)

In [None]:
# Open the very large genome-wide coverage table lazily. Because `chunksize`
# is given, read_csv returns a TextFileReader that yields one DataFrame per
# 10,000,000 rows instead of loading the whole file at once. The reader is a
# single-pass iterator: once it has been consumed it yields nothing further.
coverage_dtypes = {
    "chr": "str",
    "pos": "int32",
    "mean_cov": "float32",
    "median_cov": "int32",
}
reader = pd.read_csv(
    "../outputs/gnomad_3.1.1_coverage_all_sites.tsv",
    sep="\t",
    header=None,  # first line is treated as data; column names come from `names`
    names=list(coverage_dtypes),
    usecols=list(coverage_dtypes),
    dtype=coverage_dtypes,
    index_col=["chr", "pos"],  # same index as the site table it is joined to
    chunksize=10_000_000,
)

In [None]:
# Get VEP annotations for all possible SNVs (excluding short penultimate exons)
%%time
vep = (
    pd.read_csv(
        "../outputs/vep/vep_cds_all_possible_snvs.vcf",
        sep="\t",
        comment="#",
        header=None,
        names=header,
        dtype=datatypes,
        usecols=["chr", "pos"],
    )
    .drop_duplicates()
    .set_index(["chr", "pos"])
)

In [None]:
# Filter genomewide coverage data to coding sites only
%%time
df = pd.concat([vep.join(chunk, how="inner") for chunk in reader])
df = df.reset_index()

In [None]:
# Save the per-site coverage table for coding sites as a tab-separated file.
# The row index is not written; "chr" and "pos" are already regular columns.
df.to_csv(
    "../outputs/gnomad_3.1.1_coverage_coding_sites.tsv",
    sep="\t",
    index=False,
)

# Plots

In [None]:
# Use seaborn's "talk" plotting context for the figures in this notebook.
sns.set_context(context="talk")

In [None]:
# Mean coverage: empirical CDF over the sites whose mean coverage is below 35.
# NOTE(review): the filter is applied before the ECDF is computed, so the
# plotted proportions are relative to the filtered sites only, not to all
# coding sites -- confirm this is the intended view.
sns.ecdfplot(data=df.loc[df["mean_cov"] < 35, "mean_cov"])

In [None]:
# Median coverage: empirical CDF over the sites whose median coverage is below 35.
# NOTE(review): the filter is applied before the ECDF is computed, so the
# plotted proportions are relative to the filtered sites only, not to all
# coding sites -- confirm this is the intended view.
sns.ecdfplot(data=df.loc[df["median_cov"] < 35, "median_cov"])