# Expected variants
This script determines the expected number of variants for a given region. 
It calculates this for:
- Transcripts
- NMD regions

## Import modules

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from collections import defaultdict

## Load data

### Synonymous variants observed in UKB
Rare synonymous variants are the basis for the model. CpG and non-CpG synonymous variants are separated here so that each group can be modelled on its own.

In [2]:
# Counts of rare synonymous variants, one row per variant context
syn = pd.read_csv(
    "../outputs/observed_variants_stats_synonymous.tsv",
    sep="\t",
)

# Proportion of variants observed in each context (obs / pos)
syn["prop_obs"] = syn["obs"].div(syn["pos"])

# Separate the rows labelled "CpG" from all other variant types
syn_non = syn.loc[syn["variant_type"].ne("CpG")].copy()
syn_cpg = syn.loc[syn["variant_type"].eq("CpG")].copy()

### Variants observed in UKB, per transcript and NMD region
Expected numbers of variants will be predicted for these regions.

In [13]:
# Observed-variant statistics, summarised per transcript
enst = pd.read_csv(
    "../outputs/observed_variants_stats_transcript.tsv",
    sep="\t",
)

In [14]:
# Observed-variant statistics, summarised per NMD region
nmd = pd.read_csv(
    "../outputs/observed_variants_stats_nmd.tsv",
    sep="\t",
)

In [15]:
# Give both tables a "region" column: transcript-level rows are labelled
# "transcript", and the NMD table's "nmd" column is renamed to match.
enst = enst.assign(region="transcript")
nmd = nmd.rename(columns={"nmd": "region"})

# Stack the two tables and order the rows by region, transcript and consequence
df = pd.concat([nmd, enst])
df = df.sort_values(by=["region", "enst", "csq"])
df.head(3)

Unnamed: 0,enst,csq,variant_type,region,n_pos,n_obs,mu
0,ENST00000000233,missense,CpG,distal_nmd,10,9,1.1352e-07
3,ENST00000000233,missense,non-CpG,distal_nmd,271,19,3.376155e-09
6,ENST00000000233,nonsense,non-CpG,distal_nmd,20,0,4.5407e-09


In [16]:
# Spread the variant types (CpG / non-CpG) into separate column groups,
# keeping one row per (transcript, consequence, region)
df = df.pivot(
    index=["enst", "csq", "region"],
    columns="variant_type",
    values=["n_pos", "n_obs", "mu"],
)

# Move the variant type to the outer column level and sort the columns
df = df.swaplevel(0, 1, axis=1)
df = df.sort_index(axis=1, level=[0, 1])

# Flatten the two-level column labels into single names, e.g. "non_cpg_n_pos"
df.columns = [
    f"{variant_type.lower().replace('-', '_')}_{stat}"
    for variant_type, stat in df.columns
]
df.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,cpg_mu,cpg_n_obs,cpg_n_pos,non_cpg_mu,non_cpg_n_obs,non_cpg_n_pos
enst,csq,region,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ENST00000000233,missense,distal_nmd,1.1352e-07,9.0,10.0,3.376155e-09,19.0,271.0
ENST00000000233,missense,nmd_target,8.845714e-08,8.0,14.0,3.258126e-09,23.0,563.0
ENST00000000233,missense,start_proximal,1.65e-08,3.0,10.0,3.529e-09,9.0,304.0


## Modelling expected proportion of variants

Non-CpG variants are fit to a linear model by weighted least squares: `prop_obs` is regressed on `mu` (with an intercept), weighted by `pos`.

In [17]:
# Weighted least squares for the non-CpG contexts:
# prop_obs ~ const + mu, with each context weighted by its "pos" value
non_cpg_model = sm.WLS(
    endog=syn_non["prop_obs"],
    exog=sm.tools.add_constant(syn_non["mu"]),
    weights=syn_non["pos"],
)
non_cpg_results = non_cpg_model.fit()
non_cpg_results.summary(slim=True)

0,1,2,3
Dep. Variable:,prop_obs,R-squared:,0.959
Model:,WLS,Adj. R-squared:,0.959
No. Observations:,176,F-statistic:,4083.0
Covariance Type:,nonrobust,Prob (F-statistic):,9.710000000000001e-123

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0121,0.002,5.790,0.000,0.008,0.016
mu,2.574e+07,4.03e+05,63.897,0.000,2.49e+07,2.65e+07


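CpG variants are fit with a separate weighted least-squares model in which the predictors (the constant and `mu`) are exponentiated before fitting; this is the model referred to here as log-linear.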

In [18]:
# "Log-linear" model for the CpG contexts, fitted by weighted least squares.
# np.exp is applied to the whole design matrix, so the constant column becomes
# e and the predictor becomes exp(mu); the response (prop_obs) is not transformed.
# Any prediction made from cpg_results has to apply the same np.exp transform.
# NOTE(review): the coefficients printed for this fit are large, opposite in
# sign and share the same |t|, which may indicate near-collinear columns --
# confirm this is the intended model form.
cpg_model = sm.WLS(
    endog=syn_cpg["prop_obs"],
    exog=np.exp(sm.tools.add_constant(syn_cpg["mu"])),
    weights=syn_cpg["pos"],
)
cpg_results = cpg_model.fit()
cpg_results.summary(slim=True)

0,1,2,3
Dep. Variable:,prop_obs,R-squared:,0.93
Model:,WLS,Adj. R-squared:,0.929
No. Observations:,128,F-statistic:,1675.0
Covariance Type:,nonrobust,Prob (F-statistic):,1.24e-74

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.785e+06,4.36e+04,-40.925,0.000,-1.87e+06,-1.7e+06
mu,4.852e+06,1.19e+05,40.925,0.000,4.62e+06,5.09e+06


## Calculate expected variants per transcript and context

In [9]:
# Non CpG: predicted proportion observed, scaled by the number of positions
df["non_cpg_prop_exp"] = non_cpg_results.predict(
    sm.tools.add_constant(df["non_cpg_mu"])
)
df["non_cpg_n_exp"] = np.round((df["non_cpg_prop_exp"] * df["non_cpg_n_pos"]), 3)

# CpG: the design matrix is exponentiated exactly as it was when the model was fit
df["cpg_prop_exp"] = cpg_results.predict(
    np.exp(sm.tools.add_constant(df["cpg_mu"]))
)
df["cpg_n_exp"] = np.round((df["cpg_prop_exp"] * df["cpg_n_pos"]), 3)

# Combine CpG and non-CpG.
# After the pivot, a (transcript, consequence, region) row that has no entry
# for one variant type holds NaN in that type's columns. A plain "+" would
# propagate the NaN into every total, so the missing side is treated as zero.
# (If both sides are NaN the result stays NaN.)
df["n_pos"] = df["non_cpg_n_pos"].add(df["cpg_n_pos"], fill_value=0)
df["n_obs"] = df["non_cpg_n_obs"].add(df["cpg_n_obs"], fill_value=0)
df["n_exp"] = df["non_cpg_n_exp"].add(df["cpg_n_exp"], fill_value=0)

# Observed/expected ratio and per-position proportions
df["oe"] = df["n_obs"] / df["n_exp"]
df["prop_obs"] = df["n_obs"] / df["n_pos"]
df["prop_exp"] = df["n_exp"] / df["n_pos"]

# By default, regions with no variants would be dropped.
# We keep them instead: unstacking the innermost index level ("region")
# creates every (enst, csq) x region combination, filling the newly created
# cells with 0, and stacking restores the long layout.
df = df.unstack(fill_value=0).stack().reset_index()

# Keep relevant columns
df = df[
    [
        "enst",
        "region",
        "csq",
        "n_pos",
        "n_obs",
        "n_exp",
        "oe",
        "prop_obs",
        "prop_exp",
    ]
]

## Write to output

In [10]:
# Save the per-region table as a tab-separated file, without the row index
df.to_csv(
    "../outputs/expected_variants_all_regions_no_cpg.tsv",
    sep="\t",
    index=False,
)