# Expected variants
This notebook estimates the expected number of variants for each transcript and NMD region.

## Import modules

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from collections import defaultdict

## Synonymous variants observed in UKB
Rare synonymous variants are the basis for the model. We split CpG and non-CpG synonymous variants.

In [2]:
# Load the per-context counts of rare synonymous variants
syn = pd.read_csv("../outputs/observed_variants_stats_synonymous.tsv", sep="\t")

# Fraction of "pos" that was observed ("obs") in each variant context
syn = syn.assign(prop_obs=syn["obs"] / syn["pos"])

# Partition the contexts by CpG status
is_cpg = syn["variant_type"] == "CpG"
syn_non = syn[~is_cpg].copy()
syn_cpg = syn[is_cpg].copy()

# Saturated CpG contexts (proportion observed exactly 1) are excluded;
# log(1 - prop_obs) is not finite for them
syn_cpg = syn_cpg.loc[syn_cpg["prop_obs"].ne(1)]

## Modelling expected proportion of variants

In [3]:
# Weighted least squares for non-CpG contexts.
# Response: log(1 - proportion observed); regressors: intercept + mu;
# each context is weighted by its "pos" count.
non_cpg_endog = np.log(1 - syn_non["prop_obs"])
non_cpg_exog = sm.tools.add_constant(syn_non["mu"])

non_cpg_model = sm.WLS(non_cpg_endog, non_cpg_exog, weights=syn_non["pos"])
non_cpg_results = non_cpg_model.fit()

# Display the compact fit summary
non_cpg_results.summary(slim=True)

0,1,2,3
Dep. Variable:,prop_obs,R-squared:,0.965
Model:,WLS,Adj. R-squared:,0.964
No. Observations:,176,F-statistic:,4739.0
Covariance Type:,nonrobust,Prob (F-statistic):,3.71e-128

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.0055,0.002,-2.419,0.017,-0.010,-0.001
mu,-3.002e+07,4.36e+05,-68.840,0.000,-3.09e+07,-2.92e+07


CpG variants are fit to a log-linear model.

In [4]:
# Weighted least squares for CpG contexts.
# Response: log(1 - proportion observed); each context is weighted by its "pos" count.
cpg_endog = np.log(1 - syn_cpg["prop_obs"])

# NOTE(review): np.exp is applied to the entire design matrix, i.e. to the
# constant column added by add_constant as well as to mu. Any prediction made
# with this fit has to apply the same transform. Confirm that this
# parameterisation is the intended "log-linear" model.
cpg_exog = np.exp(sm.tools.add_constant(syn_cpg["mu"]))

cpg_model = sm.WLS(cpg_endog, cpg_exog, weights=syn_cpg["pos"])
cpg_results = cpg_model.fit()

# Display the compact fit summary
cpg_results.summary(slim=True)

0,1,2,3
Dep. Variable:,prop_obs,R-squared:,0.942
Model:,WLS,Adj. R-squared:,0.941
No. Observations:,123,F-statistic:,1954.0
Covariance Type:,nonrobust,Prob (F-statistic):,1.59e-76

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,7.801e+06,1.76e+05,44.205,0.000,7.45e+06,8.15e+06
mu,-2.12e+07,4.8e+05,-44.205,0.000,-2.22e+07,-2.03e+07


## Variants observed in UKB

### Combine observations in NMD regions and transcripts
The expected number of variants will be predicted for each of these regions.

In [5]:
# Observed-variant statistics at transcript level (tab-separated file)
enst = pd.read_csv(
    "../outputs/observed_variants_stats_transcript.tsv",
    sep="\t",
)

In [6]:
# Observed-variant statistics at NMD-region level (tab-separated file)
nmd = pd.read_csv(
    "../outputs/observed_variants_stats_nmd.tsv",
    sep="\t",
)

In [7]:
# Put both tables on a common "region" column:
# transcript-level rows get the label "transcript", NMD rows keep their "nmd" label
enst = enst.assign(region="transcript")
nmd = nmd.rename(columns={"nmd": "region"})

# Stack the region-level and transcript-level rows, then order them
stacked = pd.concat([nmd, enst])
df = stacked.sort_values(by=["region", "enst", "csq"])

# Preview the first rows
df.head(n=3)

Unnamed: 0,enst,csq,variant_type,region,n_pos,n_obs,mu
0,ENST00000000233,missense,CpG,distal_nmd,10,9,1.1352e-07
3,ENST00000000233,missense,non-CpG,distal_nmd,271,19,3.376155e-09
6,ENST00000000233,nonsense,non-CpG,distal_nmd,20,0,4.5407e-09


### Treat CpG and non-CpG variants separately

In [8]:
# Rows whose variant type is anything other than "CpG"
non_cpg = df.loc[df["variant_type"].ne("CpG")].copy()

# Number of distinct transcripts with non-CpG rows
non_cpg["enst"].nunique()

19623

In [9]:
# Rows whose variant type is "CpG"
cpg = df.loc[df["variant_type"].eq("CpG")].copy()

# Number of distinct transcripts with CpG rows
cpg["enst"].nunique()

19476

Several transcripts are missing from the CpG data, presumably because there are no CpG sites in these transcripts. I will re-index the data so that all transcripts, consequences and regions are represented.

In [10]:
# Re-index non-CpGs: pivot transcripts into columns and back into rows while
# keeping empty cells, so every transcript gets a row for each (region, csq) pair
non_cpg_wide = non_cpg.set_index(["enst", "region", "csq"]).unstack("enst")
non_cpg = non_cpg_wide.stack(dropna=False).reset_index()

# Sanity check: exactly one row per enst x region x csq combination
n_expected_rows = (
    non_cpg["enst"].nunique()
    * non_cpg["region"].nunique()
    * non_cpg["csq"].nunique()
)
assert n_expected_rows == len(non_cpg)

# Rows introduced by the re-indexing have no variant type yet; label them
non_cpg["variant_type"] = non_cpg["variant_type"].fillna("non-CpG")

This is slightly more complex for CpGs, because several transcripts that are present in the non-CpG data are missing from the CpG data. These transcripts are added back before re-indexing.

In [11]:
# Every transcript that appears in the non-CpG data
all_transcripts = pd.Series(non_cpg["enst"].unique(), name="enst")

# Right-merge so that transcripts absent from the CpG data gain a (mostly empty) row
cpg = pd.merge(cpg, all_transcripts, how="right")

# The newly added rows have no csq / region; give them placeholder values so
# they can take part in the re-indexing below
cpg = cpg.assign(
    csq=cpg["csq"].fillna("missense"),
    region=cpg["region"].fillna("transcript"),
)

# Re-index CpGs: pivot transcripts into columns and back into rows while
# keeping empty cells, so every transcript gets a row for each (region, csq) pair
cpg_wide = cpg.set_index(["enst", "region", "csq"]).unstack("enst")
cpg = cpg_wide.stack(dropna=False).reset_index()

# Sanity check: exactly one row per enst x region x csq combination
n_expected_rows = (
    cpg["enst"].nunique() * cpg["region"].nunique() * cpg["csq"].nunique()
)
assert n_expected_rows == len(cpg)

# Rows introduced by the merge / re-indexing have no variant type yet; label them
cpg["variant_type"] = cpg["variant_type"].fillna("CpG")

## Calculate the expected proportion of variants per transcript and region

### Non-CpGs

In [12]:
# The non-CpG model predicts log(1 - proportion observed); invert that
# transform to obtain the expected proportion observed
non_cpg_design = sm.tools.add_constant(non_cpg["mu"])
non_cpg_log_unobserved = non_cpg_results.predict(non_cpg_design)
non_cpg["prop_exp"] = 1 - np.exp(non_cpg_log_unobserved)

# Expected count = expected proportion x n_pos, rounded to 3 decimals
non_cpg["n_exp"] = np.round(non_cpg["prop_exp"] * non_cpg["n_pos"], 3)

### CpGs

In [13]:
# The CpG model predicts log(1 - proportion observed); invert that transform
# to obtain the expected proportion observed. The design matrix is
# exponentiated here exactly as it was when the CpG model was fitted.
cpg_design = np.exp(sm.tools.add_constant(cpg["mu"]))
cpg_log_unobserved = cpg_results.predict(cpg_design)
cpg["prop_exp"] = 1 - np.exp(cpg_log_unobserved)

# Expected count = expected proportion x n_pos, rounded to 3 decimals
cpg["n_exp"] = np.round(cpg["prop_exp"] * cpg["n_pos"], 3)

### Combine CpG and non-CpG variants

In [14]:
# Give CpG and non-CpG variants a consistent index
non_cpg = non_cpg.set_index(["enst", "region", "csq"])
cpg = cpg.set_index(["enst", "region", "csq"])

# Combine non-CpG and CpG data into the essential summary statistics.
# Series.add(..., fill_value=0) treats a value that is NaN *or whose index label
# is absent* in one frame as 0. A plain `+` after fillna(0) only covers NaN
# values: an (enst, region, csq) label present in one frame but not the other
# would sum to NaN and silently discard that frame's counts. The trailing
# fillna(0) covers labels that are NaN in both frames. When both frames share
# the same index the result is identical to the plain sum.
n_pos = non_cpg["n_pos"].add(cpg["n_pos"], fill_value=0).fillna(0)
n_obs = non_cpg["n_obs"].add(cpg["n_obs"], fill_value=0).fillna(0)
n_exp = non_cpg["n_exp"].add(cpg["n_exp"], fill_value=0).fillna(0)

# Ratios are NaN wherever numerator and denominator are both 0
oe = (n_obs / n_exp).rename("oe")
prop_obs = (n_obs / n_pos).rename("prop_obs")
prop_exp = (n_exp / n_pos).rename("prop_exp")

# Calculate the total mutability for each region
# In each dataframe, "mu" is the mean mutability for contexts in a region
mu_non_cpg = non_cpg["mu"] * non_cpg["n_pos"]
mu_cpg = cpg["mu"] * cpg["n_pos"]
mu = mu_non_cpg.add(mu_cpg, fill_value=0).fillna(0).rename("mu")

# Create a summary dataframe
df = pd.concat([mu, n_pos, n_obs, n_exp, oe, prop_obs, prop_exp], axis=1).reset_index(drop=False)

# How many regions are missing variants?
oe.isna().sum()

59905


Note the many regions with NaN values for `oe`. These are regions where 0 variants are possible. We have kept them for now; they could be dropped later.

## Write to output

In [15]:
# Write the combined summary table as a tab-separated file, without the row index
df.to_csv(
    "../outputs/expected_variants_all_regions.tsv",
    sep="\t",
    index=False,
)