# Expected variants
This script determines the expected number of variants per transcript and consequence.

## Import modules

In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats as _stats
import statsmodels.formula.api as smf
from statsmodels.stats.proportion import proportions_ztest

# Set seaborn's plotting context to "talk".
sns.set_context("talk")

## Load data

In [2]:
# Read the tab-separated summary statistics of the mutational model
mu = pd.read_csv(
    "../outputs/mutational_model_stats.tsv",
    delimiter="\t",
)

In [3]:
# Standard error of a proportion
def sem(p, n):
    """Return sqrt(p * (1 - p) / n) for a proportion p and a count n."""
    variance = (p * (1 - p)) / n
    return np.sqrt(variance)

### Exclusions

In [4]:
# Keep only the rows whose variant type is not "CpG" (independent copy)
mu_cpg = mu.loc[mu["variant_type"].ne("CpG")].copy()

In [5]:
# Linear model (obs ~ mu) fitted without passing any `weights` argument.
model = smf.wls(
    formula="obs ~ mu",
    data=mu_cpg,
).fit()
model.summary(slim=True)

0,1,2,3
Dep. Variable:,obs,R-squared:,0.951
Model:,WLS,Adj. R-squared:,0.951
No. Observations:,176,F-statistic:,3372.0
Covariance Type:,nonrobust,Prob (F-statistic):,7.840000000000001e-116

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0110,0.002,5.465,0.000,0.007,0.015
mu,2.554e+07,4.4e+05,58.067,0.000,2.47e+07,2.64e+07


### Linear model (obs ~ mu): weight by number of possible variants

In [6]:
# Linear model (obs ~ mu); each row is weighted by its `pos` value
# (the number of possible variants).
model = smf.wls(
    formula="obs ~ mu",
    data=mu_cpg,
    weights=mu_cpg["pos"],
).fit()
model.summary(slim=True)

0,1,2,3
Dep. Variable:,obs,R-squared:,0.959
Model:,WLS,Adj. R-squared:,0.959
No. Observations:,176,F-statistic:,4083.0
Covariance Type:,nonrobust,Prob (F-statistic):,9.710000000000001e-123

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0121,0.002,5.790,0.000,0.008,0.016
mu,2.574e+07,4.03e+05,63.897,0.000,2.49e+07,2.65e+07


### Linear model (obs ~ mu): weight by 1 / standard error of the proportion of observed variants

In [7]:
# Linear model (obs ~ mu); each row is weighted by the reciprocal of the
# standard error of its observed proportion, sem(obs, pos).
# NOTE(review): statsmodels documents WLS weights as proportional to
# 1 / variance, whereas 1 / SE is used here - confirm this is intended.
model = smf.wls(
    formula="obs ~ mu",
    data=mu_cpg,
    weights=1 / sem(mu_cpg["obs"], mu_cpg["pos"]),
).fit()
model.summary(slim=True)

0,1,2,3
Dep. Variable:,obs,R-squared:,0.958
Model:,WLS,Adj. R-squared:,0.958
No. Observations:,176,F-statistic:,3991.0
Covariance Type:,nonrobust,Prob (F-statistic):,6.450000000000001e-122

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0084,0.002,4.883,0.000,0.005,0.012
mu,2.613e+07,4.14e+05,63.176,0.000,2.53e+07,2.69e+07


### Square-root model (obs ~ sqrt(mu)): no weighting

In [8]:
# Square-root model (obs ~ sqrt(mu)) fitted without passing any `weights`
# argument.
model = smf.wls(
    formula="obs ~ np.sqrt(mu)",
    data=mu_cpg,
).fit()
model.summary(slim=True)

0,1,2,3
Dep. Variable:,obs,R-squared:,0.96
Model:,WLS,Adj. R-squared:,0.96
No. Observations:,176,F-statistic:,4156.0
Covariance Type:,nonrobust,Prob (F-statistic):,2.2e-123

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0831,0.003,-26.682,0.000,-0.089,-0.077
np.sqrt(mu),3291.2322,51.052,64.469,0.000,3190.472,3391.993


### Square-root model (obs ~ sqrt(mu)): weight by number of possible variants

In [9]:
# Square-root model (obs ~ sqrt(mu)); each row is weighted by its `pos`
# value (the number of possible variants).
model = smf.wls(
    formula="obs ~ np.sqrt(mu)",
    data=mu_cpg,
    weights=mu_cpg["pos"],
).fit()
model.summary(slim=True)

0,1,2,3
Dep. Variable:,obs,R-squared:,0.969
Model:,WLS,Adj. R-squared:,0.968
No. Observations:,176,F-statistic:,5360.0
Covariance Type:,nonrobust,Prob (F-statistic):,1.1900000000000002e-132

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0877,0.003,-28.678,0.000,-0.094,-0.082
np.sqrt(mu),3393.2236,46.350,73.209,0.000,3301.743,3484.704


### Square-root model (obs ~ sqrt(mu)): weight by 1 / standard error of the proportion of observed variants

In [10]:
# Square-root model (obs ~ sqrt(mu)); each row is weighted by the
# reciprocal of the standard error of its observed proportion.
# NOTE(review): statsmodels documents WLS weights as proportional to
# 1 / variance, whereas 1 / SE is used here - confirm this is intended.
model = smf.wls(
    formula="obs ~ np.sqrt(mu)",
    data=mu_cpg,
    weights=1 / sem(mu_cpg["obs"], mu_cpg["pos"]),
).fit()
model.summary(slim=True)

0,1,2,3
Dep. Variable:,obs,R-squared:,0.961
Model:,WLS,Adj. R-squared:,0.961
No. Observations:,176,F-statistic:,4274.0
Covariance Type:,nonrobust,Prob (F-statistic):,2.1300000000000004e-124

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0826,0.003,-28.436,0.000,-0.088,-0.077
np.sqrt(mu),3288.9283,50.310,65.374,0.000,3189.633,3388.224


### Log model (log(1 - obs) ~ mu): no weighting

In [11]:
# Log model (log(1 - obs) ~ mu) fitted without passing any `weights`
# argument.
model = smf.wls(
    formula="np.log(1-obs) ~ mu",
    data=mu_cpg,
).fit()
model.summary(slim=True)

0,1,2,3
Dep. Variable:,np.log(1 - obs),R-squared:,0.956
Model:,WLS,Adj. R-squared:,0.956
No. Observations:,176,F-statistic:,3821.0
Covariance Type:,nonrobust,Prob (F-statistic):,2.42e-120

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0050,0.002,-2.289,0.023,-0.009,-0.001
mu,-2.96e+07,4.79e+05,-61.817,0.000,-3.05e+07,-2.87e+07


### Log model (log(1 - obs) ~ mu): weight by number of possible variants

In [12]:
# Log model (log(1 - obs) ~ mu); each row is weighted by its `pos` value
# (the number of possible variants).
model = smf.wls(
    formula="np.log(1-obs) ~ mu",
    data=mu_cpg,
    weights=mu_cpg["pos"],
).fit()
model.summary(slim=True)

0,1,2,3
Dep. Variable:,np.log(1 - obs),R-squared:,0.965
Model:,WLS,Adj. R-squared:,0.964
No. Observations:,176,F-statistic:,4739.0
Covariance Type:,nonrobust,Prob (F-statistic):,3.71e-128

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0055,0.002,-2.419,0.017,-0.010,-0.001
mu,-3.002e+07,4.36e+05,-68.840,0.000,-3.09e+07,-2.92e+07


### Log model (log(1 - obs) ~ mu): weight by 1 / standard error of the proportion of observed variants

In [13]:
# Log model (log(1 - obs) ~ mu); each row is weighted by the reciprocal
# of the standard error of its observed proportion.
# NOTE(review): statsmodels documents WLS weights as proportional to
# 1 / variance, whereas 1 / SE is used here - confirm this is intended.
model = smf.wls(
    formula="np.log(1-obs) ~ mu",
    data=mu_cpg,
    weights=1 / sem(mu_cpg["obs"], mu_cpg["pos"]),
).fit()
model.summary(slim=True)

0,1,2,3
Dep. Variable:,np.log(1 - obs),R-squared:,0.963
Model:,WLS,Adj. R-squared:,0.963
No. Observations:,176,F-statistic:,4539.0
Covariance Type:,nonrobust,Prob (F-statistic):,1.3899999999999998e-126

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0027,0.002,-1.445,0.150,-0.006,0.001
mu,-3.018e+07,4.48e+05,-67.370,0.000,-3.11e+07,-2.93e+07


## Calculate expected variants per transcript, consequence, and NMD annotation

In [14]:
def get_expected(df, x):
    """
    Get the expected proportion of variants per transcript and consequence.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "enst", "csq", "nmd", "n_obs" and "n_pos",
        plus any column that ``x`` reads.
    x : callable
        A function (typically a lambda) used to predict the expected value.
        It is handed to ``DataFrame.assign`` and therefore receives the
        frame and must return the expected proportion for every row.

    Returns
    -------
    pandas.DataFrame
        The input rows re-indexed so that every ("enst", "csq") pair has a
        row for every "nmd" value (missing combinations are filled with 0),
        with the added columns "prop_obs", "prop_exp", "n_exp", "oe", "z"
        and "p".

    Notes
    -----
    "z" and "p" come from a one-sided (``alternative="smaller"``)
    one-sample proportions z-test of n_obs / n_pos against prop_exp, using
    prop_exp for the variance. Zero-filled rows have n_pos == 0, so their
    test statistics are expected to be NaN (NumPy may emit divide-by-zero
    warnings for them).
    """
    df = (
        df.assign(
            prop_obs=lambda d: d["n_obs"] / d["n_pos"],
            prop_exp=x,
            n_exp=lambda d: np.round(d["n_pos"] * d["prop_exp"], 2),
            oe=lambda d: d["n_obs"] / d["n_exp"],
        )
        .set_index(["enst", "csq", "nmd"])
        # Round-trip through unstack/stack so that absent "nmd" categories
        # appear as explicit all-zero rows.
        .unstack(fill_value=0)
        .stack()
        .reset_index()
    )

    # Z scores and p-values: run the test once per row and keep both
    # outputs, rather than repeating the identical test for each column.
    tests = [
        proportions_ztest(
            n_obs,
            n_pos,
            prop_exp,
            alternative="smaller",
            prop_var=prop_exp,
        )
        for n_obs, n_pos, prop_exp in zip(
            df["n_obs"], df["n_pos"], df["prop_exp"]
        )
    ]
    df["z"] = [z_stat for z_stat, _ in tests]
    df["p"] = [p_value for _, p_value in tests]
    return df

### Choose expectation model

In [15]:
# Linear model: obs ~ mu, weighted by the number of possible variants.
# np.polyfit applies `w` to the *unsquared* residuals (it minimises
# sum((w * r)**2)), whereas WLS weights multiply the squared residuals.
# Passing sqrt(pos) therefore reproduces the smf.wls(..., weights=pos) fit;
# passing pos directly would weight the squared residuals by pos**2.
fit = np.polyfit(mu_cpg["mu"], mu_cpg["obs"], 1, w=np.sqrt(mu_cpg["pos"]))
# Callable polynomial: lm_p(mu) returns the predicted proportion
lm_p = np.poly1d(fit)

### Get the summary data of possible and observed variants

In [16]:
# Observed-variant summary statistics with CpG variants excluded
df_cpg = pd.read_csv(
    "../outputs/observed_variants_stats_no_cpg_nmd.tsv",
    delimiter="\t",
)

### Calculate the number of expected variants using each model

In [17]:
# No CpGs: the linear model predicts the expected proportion from each
# row's "mu" value.
df_cpg_lm = get_expected(
    df=df_cpg,
    x=lambda frame: lm_p(frame["mu"]),
)

  prop = count * 1. / nobs
  p_pooled = np.sum(count) * 1. / np.sum(nobs)
  nobs_fact = np.sum(1. / nobs)
  prop = count * 1. / nobs
  p_pooled = np.sum(count) * 1. / np.sum(nobs)
  nobs_fact = np.sum(1. / nobs)


## Save expected variants to output
On the basis of the figures above, the **best model** for predicting the expected number of variants (excluding CpG transitions) is a **simple linear model of obs vs mu**, weighted by the number of possible variants per context. The output of this expectation model is saved below.

In [18]:
# Write the linear-model expectations to a tab-separated file (no index)
df_cpg_lm.to_csv(
    "../outputs/expected_variants_stats_no_cpg_nmd.tsv",
    sep="\t",
    index=False,
)

# Read the same file back in
dfg = pd.read_csv(
    "../outputs/expected_variants_stats_no_cpg_nmd.tsv",
    delimiter="\t",
)