In [1]:
""" This script generates a model for the expected number of variants, given the sequence mutability. It first combines annotations for possible variants, observed variants, VEP annotations and trinucleotide contexts. The proportion of synonymous variants observed in each trinucleotide context is then calculated, and linear models built to describe the relationship between mutability and proportion observed.""";

In [2]:
# Install necessary modules
! conda install statsmodels -y

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 4.10.3
  latest version: 23.1.0

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.



In [None]:
%%bash

# Download data from UKB RAP into ../data/.
# Stop at the first failing command, and make sure the target directory exists
# before the download starts so that the cell also works on a fresh checkout.
set -euo pipefail
mkdir -p ../data

dx download \
    -o ../data/ \
    data/cds_trinucleotide_contexts.tsv \
    data/grch38_cpg_methylation.tsv \
    data/mutation_rate_by_context_methyl.txt \
    data/vep_cds_all_possible_snvs.vcf \
    outputs/gnomad_pass_variants/all_pass_snvs.txt

In [3]:
# Import modules
import numpy as np
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf

In [4]:
# Column names for the eight fixed VCF fields, in file order.
_header = "chr pos id ref alt qual filter info".split()

# Column -> dtype lookup handed to pandas: the three listed columns are read as
# 32-bit integers, and any other column name falls back to "str".
datatypes = defaultdict(
    lambda: "str",
    pos=np.int32,
    ac=np.int32,
    an=np.int32,
)

In [5]:
# Retrieve observed variants.
# The file has no header row; it is read as the eight VCF columns followed by
# two extra columns named "ac" and "an" (presumably allele count and allele
# number - confirm against the file's producer).
# Every row is flagged with obs=1.
obs = pd.read_csv(
    "../data/all_pass_snvs.txt",
    names=[*_header, "ac", "an"],
    usecols=["chr", "pos", "ref", "alt", "ac", "an"],
    dtype=datatypes,
    header=None,
    sep="\t",
).assign(obs=1)

In [7]:
# Retrieve all possible SNVs (coordinates and alleles only).
# Text after "#" is treated as a comment, which skips the VCF header lines.
pos = pd.read_csv(
    "../data/vep_cds_all_possible_snvs.vcf",
    names=_header,
    usecols=["chr", "pos", "ref", "alt"],
    dtype=datatypes,
    header=None,
    comment="#",
    sep="\t",
)

In [10]:
# Retrieve VEP annotations of all possible SNVs.
# Same file as the table of possible SNVs, but the INFO column is kept as well.
vep = pd.read_csv(
    "../data/vep_cds_all_possible_snvs.vcf",
    names=_header,
    usecols=["chr", "pos", "ref", "alt", "info"],
    dtype=datatypes,
    header=None,
    comment="#",
    sep="\t",
)

In [11]:
# Limit to synonymous variants only.
# The vectorised substring test returns a mask that carries vep's own index, so
# the row selection does not rely on vep having a default RangeIndex. Rows with
# a missing INFO value are treated as not synonymous.
syn = vep["info"].str.contains("synonymous", regex=False, na=False)
vep = vep.loc[syn].drop(columns="info").assign(csq="synonymous")

In [17]:
# Trinucleotide contexts.
# The column names come from the file's own header row; "pos" is read as a
# 32-bit integer and every other column as a string (see datatypes).
tri = pd.read_csv(
    "../data/cds_trinucleotide_contexts.tsv",
    dtype=datatypes,
    sep="\t",
)

In [35]:
# ENCODE methylation data.
# The file's header row is replaced by the names below and only the chromosome,
# position and methylation level are kept.
# "chr" is read as a string, as in the variant tables, so that the later merge
# on (chr, pos) never has to compare string and integer chromosome labels.
meth = pd.read_csv(
    "../data/grch38_cpg_methylation.tsv",
    sep="\t",
    header=0,
    names=["ix", "chr", "pos", "alleles", "lvl"],
    usecols=["chr", "pos", "lvl"],
    dtype={"chr": "str"},
)

In [43]:
# Mutation rates.
# The file's header row is replaced by the names below; only the context, the
# alleles, the methylation level and the mutation rate are kept.
mu = pd.read_csv(
    "../data/mutation_rate_by_context_methyl.txt",
    sep="\t",
    names=["tri", "ref", "alt", "lvl", "pos", "obs", "po", "mu", "fpo"],
    header=0,
    usecols=["tri", "ref", "alt", "lvl", "mu"],
)

# Mutation rates are only available for 32 trinucleotide contexts. We need to
# reverse-complement for the remainder.
complement = {"A": "T", "C": "G", "G": "C", "T": "A"}


def _reverse_complement(seq):
    """Return the reverse complement of a DNA string (KeyError on a non-ACGT base)."""
    return "".join(complement[base] for base in reversed(seq))


# Complement only the allele columns and reverse-complement the context.
# Working column by column keeps every new value aligned with its own row and
# leaves the numeric columns (lvl, mu) untouched.
_mu = mu.assign(
    tri=mu["tri"].map(_reverse_complement),
    ref=mu["ref"].replace(complement),
    alt=mu["alt"].replace(complement),
)

# ignore_index gives the combined table unique row labels.
mu = pd.concat([mu, _mu], ignore_index=True)

In [None]:
# Merge variant annotations.
# 1. Left-join the observed variants onto all possible SNVs; unobserved sites
#    get 0 in the columns that come from obs.
# 2. Left-join the synonymous-only VEP table and drop rows left with a missing
#    value, which removes every site without a synonymous annotation.
# 3. Attach the trinucleotide context.
df = (
    pos.merge(obs, how="left")
    .fillna(0)
    .merge(vep, how="left")
    .dropna()
    .merge(tri, how="left")
)

In [None]:
# Merge methylation and mutability annotations

## Classify every (tri, ref, alt) combination as CpG or not.
## The mutation-rate table is loaded without a "variant_type" column, so the
## type is derived from the sequence context when the column is absent: a CpG
## variant is a C>T change followed by G, or its reverse-complement equivalent
## (a G>A change preceded by C).
## Assumes tri is a three-base context centred on ref - TODO confirm.
if "variant_type" in mu.columns:
    variant_types = mu[["tri", "ref", "alt", "variant_type"]].drop_duplicates()
else:
    contexts = mu[["tri", "ref", "alt"]].drop_duplicates()
    is_cpg = (
        contexts["ref"].eq("C")
        & contexts["alt"].eq("T")
        & contexts["tri"].str[1:].eq("CG")
    ) | (
        contexts["ref"].eq("G")
        & contexts["alt"].eq("A")
        & contexts["tri"].str[:2].eq("CG")
    )
    variant_types = contexts.assign(
        variant_type=np.where(is_cpg, "CpG", "non-CpG")
    )

df = df.merge(variant_types, how="left")
df = df.merge(meth, how="left")

## Find the fraction of CpG sites not represented in the ENCODE data.
## The mean of the boolean mask is that fraction, and it is also defined when
## no CpG site is missing.
cpg_missing = df.loc[df["variant_type"] == "CpG", "lvl"].isna().mean()

## Print the result
print(
    f"{np.round(cpg_missing*100, 2)}% of CpG sites are not represented in the methylation data"
)

## Assign "missing" CpG sites to methylation level 2.
## NOTE(review): this step was originally described as assigning the *mean*
## methylation level - confirm that 2 is the intended value.
df.loc[(df.variant_type == "CpG") & (df.lvl.isna()), "lvl"] = 2

## All non-CpG sites have lvl 0
df.loc[df["variant_type"] != "CpG", "lvl"] = 0

## Merge with mutability data
df = df.merge(mu, how="left")

In [None]:
# Drop contexts in which a synonymous variant is generally not possible:
#   AGT / AAT with a C or T alternate allele
#   ACT / ATT with a G or A alternate allele
excluded_c_or_t = df.tri.isin(["AGT", "AAT"]) & df.alt.isin(["C", "T"])
excluded_g_or_a = df.tri.isin(["ACT", "ATT"]) & df.alt.isin(["G", "A"])

df = df[~(excluded_c_or_t | excluded_g_or_a)]

# Plots

In [None]:
# Switch seaborn to its "talk" plotting context.
sns.set_context("talk")

In [None]:
# Regression plot of the observed flag against mutability. x_estimator=np.mean
# collapses the points at each distinct mutability value to their mean, i.e.
# the proportion of sites observed at that mutability.
g = sns.lmplot(data=df, x="mu", y="obs", x_estimator=np.mean, ci=None, height=4)
g.set(
    title="Synonymous",
    xlabel="mutability",
    ylabel="proportion observed",
    xscale="log",
)

## Square root mutability

In [None]:
# Add the square root of the mutability as a new column.
# .assign returns a new frame, which avoids pandas' SettingWithCopyWarning when
# df is the result of an earlier row filter.
df = df.assign(sqrt_mu=np.sqrt(df["mu"]))

In [None]:
# Same regression plot with the square root of the mutability on the x axis.
g = sns.lmplot(data=df, x="sqrt_mu", y="obs", x_estimator=np.mean, ci=None, height=4)
g.set(
    title="Synonymous",
    xlabel="sqrt(mu)",
    ylabel="proportion observed",
    xscale="log",
)

# Statistics

In [None]:
# Summarise each (tri, ref, alt, lvl) group: mean mutability, mean square-root
# mutability, mean of the observed flag (the proportion observed) and the
# count of positions in the group. The table is also written to disk.
grouped = df.groupby(["tri", "ref", "alt", "lvl"])
stats = grouped.agg(
    mu=("mu", "mean"),
    sqrt_mu=("sqrt_mu", "mean"),
    obs=("obs", "mean"),
    pos=("pos", "count"),
).reset_index()
stats.to_csv("../statistics/mutational_model_stats.tsv", sep="\t", index=False)

In [None]:
# Residual plot for a linear regression of obs on sqrt_mu, using the
# per-group summary table.
f = sns.residplot(data=stats, x="sqrt_mu", y="obs")

In [None]:
# Ordinary least squares: proportion observed regressed on mutability,
# fitted on the per-group summary table.
model = smf.ols(formula="obs ~ mu", data=stats).fit()
print(model.summary())

In [None]:
# Ordinary least squares: proportion observed regressed on the square root of
# the mutability, fitted on the per-group summary table.
model = smf.ols(formula="obs ~ sqrt_mu", data=stats).fit()
print(model.summary())

In [None]:
# Weighted least squares on sqrt(mu): each group is weighted by its "pos"
# column, the count of positions in that group.
model = smf.wls(
    formula="obs ~ sqrt_mu",
    data=stats,
    weights=stats["pos"],
).fit()
print(model.summary())