# Constraint statistics


In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf
from scipy import stats as _stats
from statsmodels.stats.multitest import fdrcorrection as fdr
from statsmodels.stats.proportion import proportions_ztest

# Set seaborn's plotting context to "talk" for the rest of the notebook.
sns.set_context("talk")

## Load the data

In [2]:
# Load the tab-separated table of per-region rows.
df = pd.read_csv("../outputs/expected_variants_all_regions_no_cpg.tsv", sep="\t")

# Print summary data.
# Each region contributes 3 rows, one per consequence (syn/mis/non), so integer
# division of the row counts by 3 gives the number of distinct regions.
# A descriptive name is used instead of `_`, which IPython reserves for the
# most recent cell output.
region_counts = df.region.value_counts() // 3
print(f"Number of distinct regions:\n{region_counts}")

Number of distinct regions:
transcript        19623
start_proximal    19415
distal_nmd        19222
nmd_target        16444
long_exon          3565
Name: region, dtype: int64


## Get constraint Z scores

In [3]:
def per_row_ztest(row, statistic="z"):
    """Run a one-sample, one-sided proportions z-test on a single row.

    The test compares the proportion ``row["n_obs"] / row["n_pos"]`` against
    ``row["prop_exp"]`` with ``alternative="smaller"``. ``row["prop_exp"]`` is
    also passed as ``prop_var``, i.e. the proportion used for the variance.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``"n_obs"``, ``"n_pos"`` and ``"prop_exp"``.
    statistic : {"z", "p"}, default "z"
        Which element of the test result to return: the test statistic
        (``"z"``) or the p-value (``"p"``).

    Returns
    -------
    The selected element of the ``proportions_ztest`` result.

    Raises
    ------
    ValueError
        If ``statistic`` is neither ``"z"`` nor ``"p"``.
    """
    # Position of each statistic in the tuple returned by proportions_ztest.
    result_index = {"z": 0, "p": 1}

    # Fail early with a clear message; previously an unknown value left the
    # index variable unassigned and surfaced as an UnboundLocalError.
    if statistic not in result_index:
        raise ValueError(
            f"statistic must be 'z' or 'p', got {statistic!r}"
        )

    result = proportions_ztest(
        count=row["n_obs"],
        nobs=row["n_pos"],
        value=row["prop_exp"],
        alternative="smaller",
        prop_var=row["prop_exp"],
    )

    return result[result_index[statistic]]

In [4]:
# Silence warnings only while the per-row tests run. A cell-wide %%capture
# would also swallow the summary printed below, so it would never be shown.
# NOTE(review): presumably the suppressed output is numeric warnings from rows
# with degenerate inputs — confirm.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    df["z"] = df.apply(per_row_ztest, statistic="z", axis=1)
    df["p"] = df.apply(per_row_ztest, statistic="p", axis=1)

# Print summary data: number of non-null Z scores per region and consequence.
z_counts = df.groupby(["region", "csq"])["z"].count()
print(f"Constraint statistics by region and consequence:\n{z_counts}")

## Get FDR-adjusted P-values
FDR-adjusted P-values are calculated separately for whole transcripts and for the sub-transcript regions (`distal_nmd`, `nmd_target`, `long_exon`).

In [5]:
def fdr_adjustment(df, regions=("transcript",), csq="nonsense"):
    """Get FDR-adjusted P-values for a given set of regions and a consequence.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``"region"``, ``"csq"`` and ``"p"``.
    regions : str or iterable of str, default ("transcript",)
        Region label(s) to include. All selected rows are adjusted together
        as one family of tests.
    csq : str, default "nonsense"
        Variant consequence to include.

    Returns
    -------
    pandas.DataFrame
        The selected rows with a non-missing ``"region"`` and ``"p"``, keeping
        the index of ``df``, with columns ``"region"``, ``"p"`` and ``"fdr_p"``.
        The frame is empty if nothing matches.
    """
    # Accept a single region name as well as a collection of names.
    if isinstance(regions, str):
        regions = [regions]

    # Mask regions and consequences
    in_regions = df.region.isin(list(regions))
    has_csq = df.csq == csq

    # Filter the dataframe and drop cases without a P-value
    selected = df.loc[in_regions & has_csq, ["region", "p"]].dropna().copy()

    # Nothing to adjust: return the empty frame with the expected columns
    # rather than handing an empty array to the FDR routine.
    if selected.empty:
        selected["fdr_p"] = selected["p"].astype(float)
        return selected

    # FDR adjustment; element [1] of the result holds the corrected P-values.
    selected["fdr_p"] = fdr(pvals=selected["p"])[1]

    return selected

In [6]:
# FDR adjustment is done separately for transcripts and NMD regions
# FDR adjustment is done separately for transcripts and NMD regions
# NOTE(review): "start_proximal" is in neither group, so its rows end up with
# a missing fdr_p — confirm this is intended.
r1 = ["transcript"]
r2 = ["distal_nmd", "nmd_target", "long_exon"]

fdr_p = pd.concat([fdr_adjustment(df, regions=r) for r in [r1, r2]])

# Join FDR-adjusted p-values to the original dataframe, aligned on the row
# index; rows that were not adjusted get a missing value.
# Dropping any existing "fdr_p" column first keeps this cell re-runnable: a
# plain join raises on the overlapping column name the second time it runs.
df = df.drop(columns="fdr_p", errors="ignore").join(fdr_p["fdr_p"])

## Save to output

In [None]:
# Write the annotated table to a tab-separated file.
# NOTE(review): index=False is not passed, so the row index is written as an
# unnamed first column — confirm downstream readers expect that.
df.to_csv("../outputs/expected_variants_all_regions_no_cpg_stats.tsv", sep="\t")