# Constraint statistics
Explore trends of regional nonsense constraint. 

In [19]:
# Import modules
from pathlib import Path
import os

import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

from src import setup_logger
from src import constants as C

In [20]:
# Logging
logger = setup_logger(Path(os.path.abspath('')).stem)

In [35]:
# Read the data
df = pd.read_csv(C.REGIONAL_CONSTRAINT_STATS, sep="\t")

# Data validation
dups = df.duplicated(["enst","region","csq"]).sum()
logger.info(f"Duplicated by enst / region / csq: {dups}")

# Synonymous variants
syn = df[df.csq == "synonymous_variant"].copy()

# Nonsense variants
stop = df[df.csq == "stop_gained"].copy()

[22-Nov-23 14:33:20] INFO <module>(): Duplicated by enst / region / csq: 0


Find regions in which nonsense variants are possible, and in which constraint statistics could be calculated.

## Constraint statistics
Get the number of constrained regions

In [36]:
# Regions where synonymous z scores is >= -1
syn["syn_z_1sd"] = np.where(syn.z >= -1, True, False)

stop = stop.merge(syn[["enst","region","syn_z_1sd"]], how="inner")

In [40]:
# Regions in which nonsense variants are possible, and chi square stats can be calculated.
stop_valid = (
    stop
    .groupby("region")
    .agg(
        transcripts=("csq", "count"),
        valid_regions=("oe", "count"),
        valid_statistics=("z", "count"),
    )
)

# Masks for constraint criteria
m1 = ~stop.z.isna()
m2 = stop.syn_z_1sd == True
m3 = stop.fdr_p < 0.05
m4 = stop.oe < 0.35
m5 = stop.n_obs == 0

# Cumulative criteria
ma = (m1 & m2).rename("pass_syn_z")
mb = (ma & m3).rename("pass_fdr")
mc = (mb & m4).rename("pass_oe")
md = (mc & m5).rename("none_observed")

stop = pd.concat([stop, ma, mb, mc, md], axis=1)
stop


Unnamed: 0,enst,region,csq,n_pos,n_obs,n_exp,oe,prop_obs,prop_exp,mu,...,p,fdr_p,pli,loeuf,gnomad_flags,pass_syn_z,pass_syn_z.1,0,1,2
0,ENST00000000233,distal_nmd,stop_gained,20.0,2.0,4.866,0.411015,0.100000,0.243300,9.081590e-08,...,,,3.497900e-01,0.686,[],True,False,False,False,False
1,ENST00000000412,distal_nmd,stop_gained,23.0,3.0,4.985,0.601805,0.130435,0.216739,1.682922e-07,...,,,5.627900e-04,0.842,[],False,False,False,False,False
2,ENST00000000442,distal_nmd,stop_gained,34.0,3.0,7.019,0.427411,0.088235,0.206441,2.761964e-07,...,0.088586,0.164634,9.934800e-01,0.459,[],False,False,False,False,False
3,ENST00000001008,distal_nmd,stop_gained,23.0,1.0,3.473,0.287936,0.043478,0.151000,5.867466e-08,...,,,1.246700e-04,0.735,[],True,False,False,False,False
4,ENST00000001146,distal_nmd,stop_gained,28.0,0.0,5.404,0.000000,0.000000,0.193000,9.597732e-08,...,0.009661,0.028479,9.998700e-01,0.371,[],True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97155,ENST00000693451,transcript,stop_gained,148.0,24.0,33.568,0.714967,0.162162,0.226811,1.073428e-06,...,0.060369,0.101037,5.175200e-08,1.044,[],False,False,False,False,False
97156,ENST00000693505,transcript,stop_gained,98.0,18.0,23.437,0.768016,0.183673,0.239153,7.506564e-07,...,0.197907,0.271870,2.102700e-04,1.237,[],True,True,False,False,False
97157,ENST00000693548,transcript,stop_gained,852.0,141.0,150.898,0.934406,0.165493,0.177110,3.684247e-06,...,0.374407,0.459572,1.789800e-58,1.021,[],False,False,False,False,False
97158,ENST00000693561,transcript,stop_gained,364.0,41.0,59.325,0.691108,0.112637,0.162981,1.592563e-06,...,0.009309,0.020283,3.839700e-03,1.398,[],False,False,False,False,False


In [28]:
pd.concat([df_valid, syn_z, fdr_lt_005, oe_lt_035, obs_0], axis=1)

Unnamed: 0_level_0,total_regions,valid_regions,valid_statistics,syn_z,fdr_lt_005,oe_lt_035,obs_0
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
distal_nmd,19432,18762,9574,6509,1333,862,201
long_exon,19432,3475,2135,1417,633,439,52
nmd_target,19432,16423,14247,7968,3250,1681,137
start_proximal,19432,19161,1607,1252,134,131,96
transcript,19432,19419,19011,10836,4710,1913,90


In [30]:
# # ### Spearman rank Z vs LOEUF


# m1 = df["region"] == "transcript"
# m2 = df["csq"] == "nonsense"

# z = df[m1 & m2]["z"]
# loeuf = df[m1 & m2]["loeuf"]

# print(spearmanr(z, loeuf, nan_policy="omit", alternative="two-sided"))