# Constraint statistics
Explore trends in regional nonsense constraint.

In [10]:
import pandas as pd
from scipy import stats

In [11]:
# Load data
# Read the tab-separated regional nonsense constraint table.
df = pd.read_csv("data/final/regional_nonsense_constraint.tsv", sep="\t")
# Preview two random rows; the fixed seed keeps the displayed rows stable
# when the notebook is re-run from a fresh kernel.
df.sample(2, random_state=0)

Unnamed: 0,enst,region,csq,n_pos,n_obs,n_exp,oe,prop_obs,prop_exp,mu,p,oe_ci_hi,fdr_p,pli,loeuf,gnomad_flags,syn_p,constraint
63841,ENST00000276616,transcript,stop_gained,139,19,24.660431,0.770465,0.136691,0.177413,1.800744e-08,0.124039,1.093996,0.2209,5.6e-05,1.074,[],0.382351,unconstrained
45635,ENST00000590720,nmd_target,stop_gained,60,2,8.289024,0.241283,0.033333,0.13815,1.402227e-08,0.007496,0.732795,0.055977,0.99995,0.342,[],0.702139,


## Number of constrained regions

In [12]:
# Masks for constraint criteria
# Each mask is a boolean Series on df's index, one per criterion. For float
# columns a NaN statistic compares as False and therefore fails that criterion.
m1 = df.syn_p >= stats.norm.cdf(-1)  # standard normal CDF at z = -1 (~0.159)
m3 = df.fdr_p < 0.05
m4 = df.oe_ci_hi <= 0.6
m5 = df.n_obs == 0

# Cumulative criteria
# Each step ANDs in one more mask, so every later mask is a subset of the one
# before it. The Series names become the column names attached to df.
ma = m1.rename("pass_syn_p")
mb = (ma & m3).rename("pass_fdr")
mc = (mb & m4).rename("pass_oe_ci_hi")
md = (mc & m5).rename("none_observed")

# Attach by column name instead of pd.concat: assign() overwrites a column
# that already exists, so re-running this cell does not append duplicate
# criterion columns (duplicates would break later lookups by column name).
df = df.assign(**{mask.name: mask for mask in (ma, mb, mc, md)})

In [19]:
# Regions meeting cumulative criteria
# Per-region summary: number of distinct transcripts, number of non-null "p"
# values, and the count of rows where each cumulative criterion flag is True.
valid = df.groupby("region").agg(
    **{
        "transcripts": pd.NamedAgg(column="enst", aggfunc="nunique"),
        "valid_statistics": pd.NamedAgg(column="p", aggfunc="count"),
        "pass_syn_p": pd.NamedAgg(column="pass_syn_p", aggfunc="sum"),
        "pass_fdr": pd.NamedAgg(column="pass_fdr", aggfunc="sum"),
        "pass_oe_ci_hi": pd.NamedAgg(column="pass_oe_ci_hi", aggfunc="sum"),
        "none_observed": pd.NamedAgg(column="none_observed", aggfunc="sum"),
    }
)

valid

Unnamed: 0_level_0,transcripts,valid_statistics,pass_syn_p,pass_fdr,pass_oe_ci_hi,none_observed
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
distal_nmd,18192,18189,15014,1023,555,190
long_exon,3315,3315,2740,596,374,59
nmd_target,15878,15878,12352,3500,1912,141
start_proximal,18272,18270,15645,127,92,89
transcript,18647,18647,14191,5518,2347,98
