# Constraint statistics


## Preliminaries
### Import modules

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from statsmodels.stats.proportion import proportions_ztest
from statsmodels.stats.multitest import fdrcorrection as fdr
from scipy import stats as _stats
from scipy.stats import spearmanr

sns.set_context("talk")

### Download data

In [2]:
%%bash
dx download -f -o ../data/ data/supplementary_dataset_11_full_constraint_metrics.tsv

## Load the data

In [3]:
df = pd.read_csv("../outputs/expected_variants_all_regions_no_cpg.tsv", sep="\t")

# Print summary data
_ = (df.region.value_counts() / 3).astype(int)
print(f"Number of distinct regions:\n{_}")  # 3 consequences per region (syn/mis/non)

Number of distinct regions:
transcript        19623
start_proximal    19415
distal_nmd        19222
nmd_target        16444
long_exon          3565
Name: region, dtype: int64


## Get constraint Z scores

In [4]:
def per_row_ztest(row, statistic="z"):
    if statistic == "z":
        i = 0
    elif statistic == "p":
        i = 1

    stat = proportions_ztest(
        count=row["n_obs"],
        nobs=row["n_pos"],
        value=row["prop_exp"],
        alternative="smaller",
        prop_var=row["prop_exp"],
    )[i]

    return stat

In [5]:
%%capture
df["z"] = df.apply(per_row_ztest, statistic="z", axis=1)
df["p"] = df.apply(per_row_ztest, statistic="p", axis=1)

# Print summary data
_ = df.groupby(["region", "csq"])["z"].count()
print(f"Constraint statistics by region and consequence:\n{_}")

## Get FDR-adjusted P-values
For nonsense variants only. Calculate separately for whole-transcripts and constrained regions

In [6]:
def fdr_adjustment(df, regions=["transcript"], csq="nonsense"):
    """Get FDR-adjusted P-values for a given region and variant consequence."""
    # Mask regions and consequences
    m1 = df.region.isin(regions)
    m2 = df.csq == csq

    # Filter the dataframe and drop cases without a P-value
    p = df.loc[m1 & m2, ["region", "p"]].dropna().copy()

    # FDR adjustment
    p["fdr_p"] = fdr(pvals=p["p"])[1]

    return p

In [7]:
# FDR adjustment is done separately for transcripts and NMD regions
# and for each distinct consequence
r1 = ["transcript"]
r2 = ["distal_nmd", "nmd_target", "long_exon"]
csq = ["synonymous", "missense", "nonsense"]

fdr_p = pd.concat([fdr_adjustment(df, regions=r, csq=c) for r in [r1, r2] for c in csq])

# Join FDR-adjusted p-values to the original dataframe
df = df.join(fdr_p["fdr_p"])

## Merge with gnomAD constraint data

In [8]:
gnomad = pd.read_csv(
    "../data/supplementary_dataset_11_full_constraint_metrics.tsv",
    sep="\t",
    usecols=["transcript", "pLI", "oe_lof_upper"],
).rename(columns={"transcript": "enst", "pLI": "pli", "oe_lof_upper": "loeuf"})

df = df.merge(gnomad, how="left")

## Save to output

In [9]:
df.to_csv("../outputs/expected_variants_all_regions_no_cpg_stats.tsv", sep="\t", index=False)

---
## Statistics

In [10]:
# Re-load the data
df = pd.read_csv("../outputs/expected_variants_all_regions_no_cpg_stats.tsv", sep="\t")

### Exclude regions where the synonymous Z-score is < -1

In [11]:
m1 = df.csq == "synonymous"
m2 = df.z >= -1
m = df[m1 & m2][["region", "enst"]]
df2 = df.merge(m, how="inner")

### Count regions in p-value bins

In [12]:
def get_p_stats_region(df, region, csq="nonsense", bins=[0, 0.001, 0.01, 0.05, 1]):
    m1 = df.region == region
    m2 = df.csq == csq

    df = df[m1 & m2][["n_obs", "p", "fdr_p"]]

    for p in ["p", "fdr_p"]:
        b = f"{p}_bin"
        df[b] = pd.cut(df[p], bins=bins)
        g = df.groupby(b)
        stats = g.agg({f"{p}": "count"})
        stats["none_observed"] = g["n_obs"].apply(lambda x: (x == 0).sum())
        print(f"{csq}, {region}")
        print(f"{stats}\n")

In [13]:
for r in df.region.unique():
    get_p_stats_region(df2, region=r)

nonsense, distal_nmd
                   p  none_observed
p_bin                              
(0.0, 0.001]     290             59
(0.001, 0.01]    414             82
(0.01, 0.05]    1439            750
(0.05, 1.0]    12837           3489

nonsense, distal_nmd
               fdr_p  none_observed
fdr_p_bin                          
(0.0, 0.001]      97             15
(0.001, 0.01]    205             47
(0.01, 0.05]     365             68
(0.05, 1.0]    14313           4250

nonsense, long_exon
                  p  none_observed
p_bin                             
(0.0, 0.001]    266             56
(0.001, 0.01]   221             71
(0.01, 0.05]    397            156
(0.05, 1.0]    1949            644

nonsense, long_exon
               fdr_p  none_observed
fdr_p_bin                          
(0.0, 0.001]     164             33
(0.001, 0.01]    110             26
(0.01, 0.05]     189             60
(0.05, 1.0]     2370            808

nonsense, nmd_target
                  p  none_observed


### Spearman rank Z vs LOEUF

In [14]:
m1 = df["region"] == "transcript"
m2 = df["csq"] == "nonsense"

z = df[m1 & m2]["z"]
loeuf = df[m1 & m2]["loeuf"]

print(spearmanr(z, loeuf, nan_policy="omit", alternative="two-sided"))
print("\nSee below for smallest possible P-value ('tiny')")
print(np.finfo(np.float64))

SignificanceResult(statistic=0.7739073455743319, pvalue=0.0)

See below for smallest possible P-value ('tiny')
Machine parameters for float64
---------------------------------------------------------------
precision =  15   resolution = 1.0000000000000001e-15
machep =    -52   eps =        2.2204460492503131e-16
negep =     -53   epsneg =     1.1102230246251565e-16
minexp =  -1022   tiny =       2.2250738585072014e-308
maxexp =   1024   max =        1.7976931348623157e+308
nexp =       11   min =        -max
smallest_normal = 2.2250738585072014e-308   smallest_subnormal = 4.9406564584124654e-324
---------------------------------------------------------------

