# Observed and possible synonymous variants, by context
This notebook describes the number of observed and possible synonymous SNVs in UKB, stratified by their trinucleotide context and mutability.

It combines annotations for possible variants, observed variants, VEP annotations and trinucleotide contexts. Only synonymous variants are kept.

## Preliminaries

### Download data from UKB RAP

In [1]:
%%bash
# Download the five input files used by this notebook from the DNAnexus
# project into ../data/ (-o). The -f flag lets dx overwrite local copies that
# already exist.
# NOTE(review): ../data/ presumably has to exist before this cell runs — confirm.
dx download \
    -f \
    -o ../data/ \
    data/cds_trinucleotide_contexts.tsv \
    data/grch38_cpg_methylation.tsv \
    data/gnomad_nc_mutation_rates.tsv \
    data/vep_cds_all_possible_snvs.vcf \
    outputs/gnomad_pass_variants/all_pass_snvs.txt

### Import modules

In [2]:
# Import modules
import numpy as np
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn.metrics import r2_score

# Use seaborn's "talk" plotting context for any figures drawn in this notebook
sns.set_context("talk")

## Load datasets

In [3]:
# Names of the eight fixed VCF columns, in file order.
_header = "chr pos id ref alt qual filter info".split()

# dtype lookup handed to pd.read_csv: position and the two count columns are
# read as 32-bit integers, every other column falls back to "str".
datatypes = defaultdict(
    lambda: "str",
    {column: np.int32 for column in ("pos", "ac", "an")},
)

In [4]:
# Retrieve observed variants.
# The file has no header row: the eight VCF columns are followed by "ac" and
# "an" (presumably allele count and allele number — confirm). Only the variant
# key and the two counts are kept.
obs = pd.read_csv(
    "../data/all_pass_snvs.txt",
    sep="\t",
    header=None,
    names=[*_header, "ac", "an"],
    dtype=datatypes,
    usecols=["chr", "pos", "ref", "alt", "ac", "an"],
)
# Flag every row of this table as an observed variant
obs["obs"] = 1

In [5]:
# Retrieve all possible SNVs (variant key columns only).
# comment="#" skips the VCF header lines; note that pandas also stops parsing
# a data line at any later "#" character.
pos = pd.read_csv(
    "../data/vep_cds_all_possible_snvs.vcf",
    comment="#",
    sep="\t",
    header=None,
    names=_header,
    usecols=["chr", "pos", "ref", "alt"],
    dtype=datatypes,
)

In [6]:
# Retrieve VEP annotations of all possible SNVs.
# Same file as the table of possible SNVs, but the INFO column is kept as
# well so that consequences can be filtered on.
vep = pd.read_csv(
    "../data/vep_cds_all_possible_snvs.vcf",
    comment="#",
    sep="\t",
    header=None,
    names=_header,
    usecols=["chr", "pos", "ref", "alt", "info"],
    dtype=datatypes,
)

In [7]:
# Limit to synonymous variants only.
# A variant is kept when the literal text "synonymous" occurs anywhere in its
# INFO field; the INFO column is then replaced by a constant "csq" label.
syn = vep["info"].str.contains("synonymous", regex=False)
vep = vep.loc[syn].drop(columns="info").assign(csq="synonymous")

In [8]:
# Trinucleotide contexts.
# The column names come from the file's own header row; "pos" is read as
# int32 and every other column as str (see `datatypes`).
tri = pd.read_csv(
    "../data/cds_trinucleotide_contexts.tsv",
    sep="\t",
    dtype=datatypes,
)

In [9]:
# gnomAD methylation data.
# The file's header row is discarded (header=0) and replaced by the names
# below; only the site coordinates and the methylation level are kept.
# No dtype is passed, so pandas infers the column types.
meth = pd.read_csv(
    "../data/grch38_cpg_methylation.tsv",
    sep="\t",
    names=["ix", "chr", "pos", "alleles", "lvl"],
    header=0,
    usecols=["chr", "pos", "lvl"],
)

In [10]:
# Mutation rates.
# The file's header row is discarded (header=0) and replaced by the names
# below; only the context, alleles, methylation level, variant type and
# mutation rate are kept.
mu = pd.read_csv(
    "../data/gnomad_nc_mutation_rates.tsv",
    sep="\t",
    names=[
        "tri",
        "ref",
        "alt",
        "lvl",
        "variant_type",
        "mu",
        "pos",
        "obs",
        "po",
        "ppo",
    ],
    header=0,
    usecols=["tri", "ref", "alt", "lvl", "mu", "variant_type"],
)

# Mutation rates are only available for 32 of the 64 trinucleotide contexts.
# The remainder are obtained by reverse-complementing every row.
complement = {"A": "T", "C": "G", "G": "C", "T": "A"}
_mu = mu.copy()
# Complement the ref and alt alleles. The substitution is restricted to these
# two columns so that no other column can be altered by it.
_mu[["ref", "alt"]] = _mu[["ref", "alt"]].replace(complement)
# Reverse-complement the trinucleotide contexts. Assigning a plain list is
# positional, so the result does not depend on index alignment.
_mu["tri"] = [
    "".join(complement[base] for base in reversed(context)) for context in mu["tri"]
]
# ignore_index gives the combined table unique row labels
mu = pd.concat([mu, _mu], ignore_index=True)

## Merge annotations

In [11]:
# Merge variant annotations.
# Each merge joins on the columns the two tables have in common.
df = (
    pos
    # Add observed counts; variants never observed get ac = an = obs = 0
    .merge(obs, how="left")
    .fillna(0)
    # Add the consequence label, then drop variants without one
    # (i.e. keep synonymous variants only)
    .merge(vep, how="left")
    .dropna()
    # Add the trinucleotide context
    .merge(tri, how="left")
)

In [12]:
# Merge methylation and mutability annotations

## Annotate each variant with its variant type (looked up by context and
## alleles in the mutation-rate table) and with its methylation level
variant_types = mu[["tri", "ref", "alt", "variant_type"]].drop_duplicates()
df = df.merge(variant_types, how="left").merge(meth, how="left")

## All non-CpG sites have lvl 0
## NOTE(review): a CpG site that is absent from the methylation table keeps a
## missing lvl, which would make the integer cast below fail — confirm that
## no such sites exist
df["lvl"] = df["lvl"].where(df["variant_type"] == "CpG", 0).astype(int)

## Merge with mutability data
df = df.merge(mu, how="left")

### Make a copy of the dataframe in case it is needed

In [13]:
# Independent (deep) copy of the annotated table as it stands at this point
df_spare = df.copy(deep=True)

## Drop selected variants

In [14]:
# Drop contexts in which a synonymous variant is generally not possible.
# (NB synonymous variants in these contexts may occur at exon-intron junctions)
# The four masks come in reverse-complement pairs: AGT/ACT and AAT/ATT.
m1 = df["tri"].eq("AGT") & df["alt"].isin(["C", "T"])
m2 = df["tri"].eq("AAT") & df["alt"].isin(["C", "T"])
m3 = df["tri"].eq("ACT") & df["alt"].isin(["G", "A"])
m4 = df["tri"].eq("ATT") & df["alt"].isin(["G", "A"])

# Keep only the rows matched by none of the masks
df = df.loc[~m1 & ~m2 & ~m3 & ~m4]

In [15]:
# Drop common variants: keep variants that were never observed (ac == 0) and
# observed variants whose ac / an ratio is below 0.001
df = df.loc[df["ac"].eq(0) | df["ac"].div(df["an"]).lt(0.001)]

## Summarise and write to output

In [16]:
# Summarise per context, allele change, variant type and methylation level:
#   mu  - mean mutation rate
#   obs - mean of the 0/1 observed flag, i.e. the proportion observed
#   pos - number of rows (possible variants) in the group
stats = (
    df.groupby(["tri", "ref", "alt", "variant_type", "lvl"])
    .agg(
        mu=("mu", "mean"),
        obs=("obs", "mean"),
        pos=("pos", "count"),
    )
    .reset_index()
)
# Write the summary as a tab-separated file without the row index
stats.to_csv("../outputs/mutational_model_stats.tsv", index=False, sep="\t")