# Observed variants
This script identifies the observed and possible variants per transcript. It groups variants by transcript and consequence (and, optionally, by NMD annotation), and aggregates the number observed, the number possible, and the mean mutability. The summaries are computed both with and without CpG variants, and each is saved to its own TSV file.

## Preliminaries

### Import modules

In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats as _stats
import statsmodels.formula.api as smf
from statsmodels.stats.proportion import proportions_ztest

sns.set_context("talk")

### Download datasets from UKB RAP

In [2]:
%%bash
# Fetch the input files listed below from the DNAnexus project into ../data/ (-o).
# -f overwrites any existing local copy, so re-running this cell refreshes the inputs.
dx download \
    -f \
    -o ../data/ \
    data/cds_trinucleotide_contexts.tsv \
    data/grch38_cpg_methylation.tsv \
    data/gnomad_nc_mutation_rates.tsv \
    data/vep_cds_all_possible_snvs.vcf \
    outputs/gnomad_pass_variants/all_pass_snvs.txt \
    outputs/nmd_annotations.tsv

## Load datasets

In [3]:
# Names for the eight fixed VCF columns, shared by every VCF-like file read below.
_header = "chr pos id ref alt qual filter info".split()

# Per-column dtypes for read_csv: positions and allele counts are 32-bit
# integers; any column not listed here falls back to "str".
datatypes = defaultdict(lambda: "str", pos=np.int32, ac=np.int32, an=np.int32)

In [4]:
# Retrieve observed variants from all_pass_snvs.txt.
# The file has no header row: it holds the eight VCF columns followed by
# allele count (ac) and allele number (an). Every row read is flagged obs=1.
obs = pd.read_csv(
    "../data/all_pass_snvs.txt",
    sep="\t",
    header=None,
    names=[*_header, "ac", "an"],
    usecols=["chr", "pos", "ref", "alt", "ac", "an"],
    dtype=datatypes,
)
obs["obs"] = 1

In [5]:
# Retrieve VEP annotations of all possible SNVs.
# Lines starting with "#" (the VCF meta/header lines) are skipped, so the
# column names are supplied explicitly; only the columns needed later are kept.
vep = pd.read_csv(
    "../data/vep_cds_all_possible_snvs.vcf",
    sep="\t",
    comment="#",
    header=None,
    names=_header,
    usecols=["chr", "pos", "ref", "alt", "info"],
    dtype=datatypes,
)

In [6]:
# Get enst: the third "|"-delimited field of each INFO string.
# NOTE(review): assumes that field holds the transcript ID - confirm against
# the field order used when VEP was run.
vep["enst"] = vep["info"].map(lambda info: info.split("|", 3)[2])

In [7]:
# Get csq: flag each variant by the consequence terms found anywhere in its
# INFO string.
syn = vep["info"].map(lambda info: "synonymous" in info)
mis = vep["info"].map(lambda info: "missense" in info)
non = vep["info"].map(lambda info: "stop_gained" in info)

# Labels are applied in this order, so when several terms are present the
# later label (missense over synonymous, nonsense over both) wins.
for mask, label in ((syn, "synonymous"), (mis, "missense"), (non, "nonsense")):
    vep.loc[mask, "csq"] = label

# Drop the raw INFO column, then every row with a missing value; rows that
# received no csq label are removed here, keeping only syn/mis/non variants.
vep = vep.drop(columns="info").dropna()

In [8]:
# Trinucleotide contexts of CDS positions (columns typed via `datatypes`).
tri = pd.read_csv(
    "../data/cds_trinucleotide_contexts.tsv",
    dtype=datatypes,
    sep="\t",
)

In [9]:
# gnomAD methylation data.
# header=0 together with names= discards the file's own header row and
# relabels the five columns; only chr, pos and lvl are kept.
meth = pd.read_csv(
    "../data/grch38_cpg_methylation.tsv",
    sep="\t",
    header=0,
    names=["ix", "chr", "pos", "alleles", "lvl"],
    usecols=["chr", "pos", "lvl"],
)

In [10]:
# Mutation rates.
# The file's header row is replaced by the names below; only the columns
# needed for the later merges are loaded.
mu = pd.read_csv(
    "../data/gnomad_nc_mutation_rates.tsv",
    sep="\t",
    header=0,
    names=["tri", "ref", "alt", "lvl", "variant_type", "mu", "pos", "obs", "po", "ppo"],
    usecols=["tri", "ref", "alt", "lvl", "mu", "variant_type"],
)

# Mutation rates are only available for 32 trinucleotide contexts. The
# remainder are obtained by reverse-complementing those rows.
complement = {"A": "T", "C": "G", "G": "C", "T": "A"}
# Complement every cell that is exactly one base (i.e. the ref and alt alleles).
_mu = mu.replace(complement)
# Reverse-complement the trinucleotide contexts: complement each base while
# walking the context backwards.
_mu["tri"] = mu["tri"].map(
    lambda context: "".join(complement[base] for base in reversed(context))
)
# Stack the original and reverse-complemented tables.
mu = pd.concat([mu, _mu])

In [11]:
# NMD annotations; columns are renamed to match the names used in `vep`
# (transcript_id -> enst) and to shorten the annotation column (-> nmd).
nmd = pd.read_csv(
    "../data/nmd_annotations.tsv",
    sep="\t",
    usecols=["chr", "pos", "transcript_id", "nmd_definitive"],
)
nmd = nmd.rename(columns={"transcript_id": "enst", "nmd_definitive": "nmd"})

## Merge annotations

In [11]:
# Merge VEP, context, NMD, and observed variant annotations.
# Each merge is a left join on the columns the two frames have in common, so
# every possible SNV in `vep` is retained whether or not it is annotated.
df = vep.merge(tri, how="left")
df = df.merge(nmd, how="left")
df = df.merge(obs, how="left")

# Possible SNVs with no match in `obs` were not observed: set their allele
# count, allele number and obs flag to 0.
# Only these three columns are filled. A frame-wide fillna(0) would also
# overwrite missing trinucleotide-context and NMD annotations with 0, which
# hides failed joins and creates a spurious `nmd == 0` group in the
# NMD-stratified summaries. Rows without an NMD annotation now stay NaN and
# are therefore left out of groupbys keyed on `nmd`.
df = df.fillna({"ac": 0, "an": 0, "obs": 0})

In [21]:
# Merge methylation annotations.
# First attach the variant type of each (tri, ref, alt) combination, then the
# per-site methylation level.
variant_types = mu[["tri", "ref", "alt", "variant_type"]].drop_duplicates()
df = df.merge(variant_types, how="left")
df = df.merge(meth, how="left")

# All non-CpG sites have lvl 0: keep the merged level only where the variant
# type is exactly "CpG".
df["lvl"] = df["lvl"].where(df["variant_type"] == "CpG", 0)
# NOTE(review): astype(int) raises if any CpG site has no methylation level
# after the merge - confirm the methylation table covers every CpG site.
df["lvl"] = df["lvl"].astype(int)

# Merge with mutability data (joined on the columns shared with `mu`).
df = df.merge(mu, how="left")

## Summarise data and save output

### Include CpG variants

#### Group by enst and csq only

In [22]:
# Summarise per transcript and consequence:
#   n_pos - number of possible variants (non-null positions)
#   n_obs - number of observed variants (sum of the obs flag)
#   mu    - mean mutability
dfg = (
    df.groupby(["enst", "csq"])
    .agg(
        n_pos=pd.NamedAgg(column="pos", aggfunc="count"),
        n_obs=pd.NamedAgg(column="obs", aggfunc="sum"),
        mu=pd.NamedAgg(column="mu", aggfunc="mean"),
    )
    .reset_index()
)

In [23]:
# Save the transcript/consequence summary as a tab-separated file, without the index.
dfg.to_csv(
    "../outputs/observed_variants_stats.tsv",
    index=False,
    sep="\t",
)

#### Group by NMD annotations

In [24]:
# Summarise per transcript, consequence and NMD annotation:
# possible variants (n_pos), observed variants (n_obs) and mean mutability (mu).
dfg = (
    df.groupby(["enst", "csq", "nmd"])
    .agg(
        n_pos=pd.NamedAgg(column="pos", aggfunc="count"),
        n_obs=pd.NamedAgg(column="obs", aggfunc="sum"),
        mu=pd.NamedAgg(column="mu", aggfunc="mean"),
    )
    .reset_index()
)

In [25]:
# Save the NMD-stratified summary as a tab-separated file, without the index.
dfg.to_csv(
    "../outputs/observed_variants_stats_nmd.tsv",
    index=False,
    sep="\t",
)

### Exclude CpG variants

In [26]:
# Independent copy of the rows whose variant type is not "CpG"
# (rows with a missing variant type are kept).
_df = df.loc[df["variant_type"].ne("CpG")].copy()

#### Group by enst and csq only

In [27]:
# Non-CpG variants only, summarised per transcript and consequence:
# possible variants (n_pos), observed variants (n_obs) and mean mutability (mu).
_dfg = (
    _df.groupby(["enst", "csq"])
    .agg(
        n_pos=pd.NamedAgg(column="pos", aggfunc="count"),
        n_obs=pd.NamedAgg(column="obs", aggfunc="sum"),
        mu=pd.NamedAgg(column="mu", aggfunc="mean"),
    )
    .reset_index()
)

In [28]:
# Save the non-CpG summary as a tab-separated file, without the index.
_dfg.to_csv(
    "../outputs/observed_variants_stats_no_cpg.tsv",
    index=False,
    sep="\t",
)

#### Group by NMD annotations

In [29]:
# Non-CpG variants only, summarised per transcript, consequence and NMD
# annotation: possible variants (n_pos), observed variants (n_obs) and mean
# mutability (mu).
_dfg = (
    _df.groupby(["enst", "csq", "nmd"])
    .agg(
        n_pos=pd.NamedAgg(column="pos", aggfunc="count"),
        n_obs=pd.NamedAgg(column="obs", aggfunc="sum"),
        mu=pd.NamedAgg(column="mu", aggfunc="mean"),
    )
    .reset_index()
)

In [30]:
# Save the non-CpG, NMD-stratified summary as a tab-separated file, without the index.
_dfg.to_csv(
    "../outputs/observed_variants_stats_no_cpg_nmd.tsv",
    index=False,
    sep="\t",
)