# Expected variants
This script determines the expected number of variants for a given region. 
It calculates this for:
- Transcripts
- NMD regions

## Import modules

In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from statsmodels.stats.proportion import proportions_ztest
from statsmodels.stats.multitest import fdrcorrection as fdr
from sklearn.metrics import r2_score
from scipy import stats as _stats

sns.set_context("talk")

## Load data

### Rare synonymous variants are the basis for the model

In [2]:
# Per-context counts of rare synonymous variants
syn = pd.read_csv(
    "../outputs/observed_variants_stats_synonymous.tsv",
    sep="\t",
)

# Fraction of the possible variants ("pos") that were observed ("obs")
syn["prop_obs"] = syn["obs"].div(syn["pos"])

# Drop the CpG contexts. Despite its name, syn_cpg holds the non-CpG rows.
syn_cpg = syn.loc[syn["variant_type"].ne("CpG")].copy()

### Transcripts and NMD regions

In [3]:
# Per-transcript counts of observed variants (file name indicates CpG excluded)
enst = pd.read_csv(
    "../outputs/observed_variants_stats_transcript_no_cpg.tsv",
    sep="\t",
)

In [4]:
# Per-NMD-region counts of observed variants (file name indicates CpG excluded)
nmd = pd.read_csv(
    "../outputs/observed_variants_stats_nmd_no_cpg.tsv",
    sep="\t",
)

In [5]:
# Build one table holding both the transcript-level and the region-level rows.
# Whole transcripts are labelled with region == "transcript".
enst = enst.assign(**{"region": "transcript"})
# Rename "nmd" to "region" so both tables share the same column name.
nmd = nmd.rename({"nmd": "region"}, axis="columns")

df = pd.concat([nmd, enst])
df = df.sort_values(by=["region", "enst", "csq"])

## Linear model for expected proportion of variants

From `expecation_model_choices.ipynb`, the best model for predicting the expected number of variants (excluding CpG transitions) appears to be a simple linear model of the proportion of possible variants observed (`prop_obs`) against `mu`, weighted by the number of possible variants per context (`pos`).

In [6]:
# Weighted straight-line fit of prop_obs on mu; fit holds [slope, intercept].
# NOTE(review): np.polyfit applies w to the *unsquared* residuals, so w=pos
# weights the squared residuals by pos**2 -- confirm this is the intended
# weighting (np.sqrt(pos) would weight squared residuals by pos).
fit = np.polyfit(
    x=syn_cpg["mu"],
    y=syn_cpg["prop_obs"],
    deg=1,
    w=syn_cpg["pos"],
)
# Callable polynomial: lm_p(mu) returns the predicted proportion observed
lm_p = np.poly1d(fit)

## Calculate expected variants per region, transcript and consequence

In [7]:
def get_expected(df, prediction):
    """
    Get the expected number and proportion of variants per region, transcript
    and consequence, with a one-sided z-test of observed vs expected.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "region", "enst", "csq", "mu", "n_pos" and
        "n_obs".
    prediction : callable or array-like
        Passed to ``DataFrame.assign`` as the value of "prop_exp". When
        callable it receives the data frame and returns the expected
        proportion of variants for each row.

    Returns
    -------
    pandas.DataFrame
        One row per (region, enst, csq) combination, with the columns
        "region", "enst", "csq", "mu", "n_pos", "n_obs", "n_exp", "oe",
        "prop_obs", "prop_exp", "z" and "p". "z" and "p" are the statistic and
        P-value of a one-sample proportions z-test with
        ``alternative="smaller"`` (observed proportion below the expected
        one), using the expected proportion for the variance.
    """
    columns = [
        "region",
        "enst",
        "csq",
        "mu",
        "n_pos",
        "n_obs",
        "n_exp",
        "oe",
        "prop_obs",
        "prop_exp",
    ]
    df = (
        df.assign(
            prop_obs=lambda x: x["n_obs"] / x["n_pos"],
            prop_exp=prediction,
            n_exp=lambda x: np.round(x["n_pos"] * x["prop_exp"], 2),
            oe=lambda x: x["n_obs"] / x["n_exp"],
        )
        .set_index(["region", "enst", "csq"])
        # Pivot csq to columns and back again: combinations of
        # (region, enst) and csq that are absent from the input come back as
        # rows filled with 0.
        .unstack(fill_value=0)
        .stack()
        .reset_index()[columns]
    )

    # Z scores and p-values. The test is run once per row and the
    # (statistic, p-value) pair is split afterwards, instead of repeating the
    # identical test for each output column.
    ztests = [
        proportions_ztest(
            n_obs,
            n_pos,
            prop_exp,
            alternative="smaller",
            prop_var=prop_exp,
        )
        for n_obs, n_pos, prop_exp in zip(df["n_obs"], df["n_pos"], df["prop_exp"])
    ]
    # Shape (n_rows, 2); the reshape keeps this valid for an empty frame too.
    ztests = np.array(ztests, dtype=float).reshape(-1, 2)
    df["z"] = ztests[:, 0]
    df["p"] = ztests[:, 1]
    return df

In [8]:
%%capture
# Ignore divide by 0 errors for regions with no variants

df = get_expected(df=df, prediction=lambda x: lm_p(x["mu"]))

## Get FDR-adjusted P-values
Calculated separately for whole transcripts and for NMD regions (`distal_nmd`, `nmd_target`, `long_exon`).

In [21]:
def fdr_adjustment(df, regions=("transcript",), csq="nonsense"):
    """Get FDR-adjusted P-values for a set of regions and a variant consequence.

    All rows whose "region" is in ``regions`` and whose "csq" equals ``csq``
    are corrected together as one family of tests.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "region", "csq" and "p".
    regions : list-like of str
        Region labels to include. Defaults to whole transcripts only.
    csq : str
        Variant consequence to include.

    Returns
    -------
    pandas.DataFrame
        Columns "region", "p" and "fdr_p" for the selected rows that have a
        P-value. The index of ``df`` is preserved, so the result can be joined
        back onto ``df`` by index.
    """
    # Mask regions and consequences
    in_regions = df["region"].isin(regions)
    has_csq = df["csq"] == csq

    # Filter the dataframe and drop cases without a P-value
    p = df.loc[in_regions & has_csq, ["region", "p"]].dropna().copy()

    # Nothing left to adjust: return the empty frame with the expected
    # column instead of calling the FDR routine on an empty array.
    if p.empty:
        p["fdr_p"] = p["p"].astype(float)
        return p

    # FDR adjustment; element [1] of the result holds the corrected P-values
    p["fdr_p"] = fdr(pvals=p["p"])[1]

    return p
    
# Region groups; each group is FDR-corrected as its own family of tests
r1 = ["transcript"]
r2 = ["distal_nmd", "nmd_target", "long_exon"]

# Adjust each group separately, stack the results, then attach the adjusted
# P-values to df by row index
fdr_p = pd.concat(
    [
        fdr_adjustment(df, regions=r1),
        fdr_adjustment(df, regions=r2),
    ]
)
df = df.join(fdr_p.loc[:, "fdr_p"])

## Write to output

In [35]:
# Save the combined table as a tab-separated file, without the row index
df.to_csv(
    path_or_buf="../outputs/expected_variants_stats_all_regions_no_cpg.tsv",
    sep="\t",
    index=False,
)