# DNMs in constrained regions.

In [59]:
# Imports
import pandas as pd
import numpy as np

from src import constants as C

In [60]:
# Module constants
_N_DNMS = 3

In [61]:
# Load data
df = pd.read_csv(C.DNMS_ANNOTATED, sep="\t")

# Sort by source
df["source"] = pd.Categorical(df["source"], ["gel","ddd","genedx","rumc"])
df = df.sort_values("source")

#! Drop duplicated DNMs
df = df.drop_duplicates(["chr","pos","ref","alt","enst"])

df.shape

(55996, 13)

How many transcripts have three or more DNMs in a constrained region?

In [62]:
# Keep only truncating DNMs in constrained regions
m1 = df["csq"] == "stop_gained"
m2 = df["csq"] == "frameshift"
m3 = df["constraint"] == "constrained"

trunc = df[(m1 | m2) & m3].copy()

# Count truncating variants
trunc["n_truncating"] = trunc.groupby(["enst", "region"]).pos.transform("count")

# Transcripts with >= 3 truncating DNMs in a constrained region.
trunc[trunc["n_truncating"] >= _N_DNMS].enst.nunique()

135

How many transcripts with >=3 truncating DNMs have no disease association in OMIM? 

In [63]:
trunc[(trunc["n_truncating"] >= _N_DNMS) & (trunc["phenotype"].isna())].enst.nunique()

31

In [64]:
trunc[(trunc["n_truncating"] >= _N_DNMS) & (trunc["phenotype"].isna())]

Unnamed: 0,ensg,enst,chr,pos,ref,alt,symbol,csq,region,source,phenotype,inheritance,constraint,n_truncating
24641,ENSG00000136451,ENST00000581208,chr17,57980755,C,CT,VEZF1,frameshift,nmd_target,gel,,,constrained,4
16251,ENSG00000032219,ENST00000355431,chr14,58365276,CAA,C,ARID4A,frameshift,nmd_target,gel,,,constrained,4
17131,ENSG00000080824,ENST00000216281,chr14,102084474,TTTTC,T,HSP90AA1,frameshift,nmd_target,gel,,,constrained,3
26711,ENSG00000256463,ENST00000537592,chr18,78995025,C,T,SALL3,stop_gained,long_exon,gel,,,constrained,4
26705,ENSG00000256463,ENST00000537592,chr18,78992997,C,T,SALL3,stop_gained,long_exon,gel,,,constrained,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40766,ENSG00000285708,ENST00000647725,chr3,70977859,G,C,ENSG00000285708,stop_gained,nmd_target,rumc,,,constrained,21
21634,ENSG00000140836,ENST00000268489,chr16,72957962,G,T,ZFHX3,stop_gained,long_exon,rumc,,,constrained,6
20674,ENSG00000282034,ENST00000380361,chr16,30733360,C,T,ENSG00000282034,stop_gained,nmd_target,rumc,,,constrained,7
26712,ENSG00000256463,ENST00000537592,chr18,78995055,C,T,SALL3,stop_gained,long_exon,rumc,,,constrained,4
