# DNMs in constrained regions.

In [19]:
# Imports
import pandas as pd
import numpy as np

from src import constants as C

In [20]:
# Module constants
# Minimum number of truncating DNMs that a (transcript, region) group must
# contain to be counted; it is compared with `>=` against the per-group
# `n_truncating` tally in the cells below.
_N_DNMS = 3

In [21]:
# Read the annotated DNM table (tab-separated).
df = pd.read_csv(C.DNMS_ANNOTATED, sep="\t")

# Rank rows by cohort (gel, then ddd, genedx, rumc) before de-duplicating, so
# that for a DNM/transcript seen in several cohorts the row from the
# earliest-listed cohort is the one kept (drop_duplicates keeps the first).
# Any `source` value outside this list becomes NaN and is sorted last.
df = (
    df.assign(source=pd.Categorical(df["source"], ["gel", "ddd", "genedx", "rumc"]))
    .sort_values("source")
    #! A DNM is identified by position, alleles and transcript.
    .drop_duplicates(["chr", "pos", "ref", "alt", "enst"])
)

df.shape

(55996, 13)

How many transcripts have three or more truncating DNMs in a constrained region?

In [22]:
# Row masks: the two truncating consequences, and the constrained-region flag.
m1 = df["csq"].eq("stop_gained")
m2 = df["csq"].eq("frameshift")
m3 = df["constraint"].eq("constrained")

# Truncating (either consequence) AND constrained; copy so a column can be added.
trunc = df.loc[(m1 | m2) & m3].copy()

# Attach to every row the number of truncating DNMs sharing its
# (transcript, region) pair.
trunc["n_truncating"] = trunc.groupby(["enst", "region"])["pos"].transform("count")

# Distinct transcripts having at least _N_DNMS truncating DNMs in one
# constrained region.
trunc.loc[trunc["n_truncating"] >= _N_DNMS, "enst"].nunique()

131

How many transcripts with three or more truncating DNMs in a constrained region have no disease association in OMIM?

In [23]:
# Distinct transcripts that meet the _N_DNMS threshold and whose `phenotype`
# annotation is missing.
trunc.loc[
    trunc["n_truncating"].ge(_N_DNMS) & trunc["phenotype"].isna(),
    "enst",
].nunique()

29

In [25]:
# Preview ten rows from transcripts that meet the _N_DNMS threshold and have no
# `phenotype` annotation. The seed is fixed so the same rows are shown each
# time the notebook is re-run from a fresh kernel.
trunc[(trunc["n_truncating"] >= _N_DNMS) & (trunc["phenotype"].isna())].sample(
    10, random_state=0
)

Unnamed: 0,ensg,enst,chr,pos,ref,alt,symbol,csq,region,source,phenotype,inheritance,constraint,n_truncating
53674,ENSG00000091656,ENST00000651372,chr8,76851476,CGG,CG,ZFHX4,frameshift,long_exon,ddd,,,constrained,6
30151,ENSG00000161681,ENST00000293441,chr19,50667460,GGGCTGGC,GGGC,SHANK1,frameshift,long_exon,ddd,,,constrained,3
26705,ENSG00000256463,ENST00000537592,chr18,78992997,C,T,SALL3,stop_gained,long_exon,gel,,,constrained,4
53681,ENSG00000091656,ENST00000651372,chr8,76853736,TATGGTTC,T,ZFHX4,frameshift,long_exon,ddd,,,constrained,6
12716,ENSG00000197111,ENST00000546463,chr12,53462493,T,TC,PCBP2,frameshift,nmd_target,genedx,,,constrained,3
54772,ENSG00000153707,ENST00000381196,chr9,8504405,G,A,PTPRD,stop_gained,nmd_target,genedx,,,constrained,4
40755,ENSG00000285708,ENST00000647725,chr3,70977013,T,TA,ENSG00000285708,frameshift,nmd_target,gel,,,constrained,21
40747,ENSG00000285708,ENST00000647725,chr3,70976964,G,A,ENSG00000285708,stop_gained,nmd_target,gel,,,constrained,21
32981,ENSG00000168702,ENST00000389484,chr2,140868127,C,A,LRP1B,stop_gained,nmd_target,ddd,,,constrained,4
20694,ENSG00000282034,ENST00000380361,chr16,30737733,G,T,ENSG00000282034,stop_gained,long_exon,ddd,,,constrained,10
