# Candidate disease genes

In [1]:
# Imports
import pandas as pd

In [2]:
# Module constants
# Names of the columns to load from the annotated de novo variant table.
_COLUMNS = [
    "enst",
    "chr",
    "pos",
    "ref",
    "alt",
    "symbol",
    "omim_inheritance_simple",
    "region",
    "constraint",
    "pli",
    "loeuf",
    "n_trunc_region",
    "n_trunc_transcript",
    "cohort",
    "id",
    "csq",
    "case_solved",
]

In [3]:
# Change into the project directory so that the relative "data/..." paths
# passed to pd.read_csv in this notebook resolve.
# (Presumably this also puts project-local modules on the import path, although
# the visible cells only import pandas — confirm.)
# NOTE(review): this is a hard-coded absolute path, so the notebook only runs
# on a host where this directory exists.
%cd /re_gecip/enhanced_interpretation/AlexBlakes/nmd_dnms/

/nas/weka.gel.zone/re_gecip/enhanced_interpretation/AlexBlakes/nmd_dnms


In [4]:
# Load the annotated de novo variants, restricted to the columns in _COLUMNS
dnm_path = "data/interim/dnms_annotated_clinical.tsv"
df = pd.read_csv(dnm_path, sep="\t", usecols=_COLUMNS)

# Report the number of distinct participant identifiers before any filtering
n_participants = df["id"].nunique()
print(f"Participant IDs: {n_participants}")

Participant IDs: 32557


In [5]:
# Drop variants from DDD which are duplicated in GEL
# A DDD row counts as a duplicate when its chromosome, position and alleles
# match any GEL row; the participant identifier is not part of the comparison.
variant_key = ["chr", "pos", "ref", "alt"]

# Split the table by cohort. Rows whose cohort is not one of these four values
# are not carried forward into the re-assembled table below.
gel = df[df["cohort"] == "gel"]
ddd = df[df["cohort"] == "ddd"]
gdx = df[df["cohort"] == "gdx"]
rumc = df[df["cohort"] == "rumc"]
print(f"DNMs in DDD cohort: {len(ddd)}")

# The same variant appears once per overlapping transcript, so the GEL keys are
# de-duplicated first. Without this the join is many-to-many and every matching
# DDD row is repeated once per matching GEL row.
gel_variants = gel[variant_key].drop_duplicates()

# Preserve the index for later filtering
ddd_dups = (
    ddd.reset_index()
    # Join keys are named explicitly rather than inferred from shared columns
    .merge(gel_variants, how="inner", on=variant_key)
    .set_index("index")
)
print(f"DDD DNMs in GEL: {ddd_dups.index.nunique()}")

ddd = ddd.drop(ddd_dups.index)
print(f"DNMs in DDD cohort after cleaning: {len(ddd)}")

# Re-assemble the table from the four cohorts, with DDD now de-duplicated
df = pd.concat([gel, ddd, gdx, rumc])
print(f"Total DNMs: {len(df)}")
print(f"Participant IDs: {df['id'].nunique()}")

DNMs in DDD cohort: 14220
DDD DNMs in GEL: 1362
DNMs in DDD cohort after cleaning: 12858
Total DNMs: 57277
Participant IDs: 31951


In [6]:
# Count dnPTVs per region and transcript
def count_trunc(x):
    """Return how many entries of `x` are frameshift or stop-gained consequences."""
    is_truncating = x.isin(["frameshift_variant", "stop_gained"])
    return is_truncating.sum()


# Each row receives the count for its own group (transform keeps the row index).
# NOTE(review): groupby excludes missing keys by default, so rows with a missing
# `region` presumably receive NaN here — the float values in the table displayed
# at the end of the notebook suggest this; confirm.
csq_by_region = df.groupby(["enst", "region"])["csq"]
df["n_trunc_region"] = csq_by_region.transform(count_trunc)

csq_by_transcript = df.groupby("enst")["csq"]
df["n_trunc_transcript"] = csq_by_transcript.transform(count_trunc)

In [7]:
# Filter for candidate disease genes
# m0: truncating consequence; m1: constrained region; m2: no AD / AD_AR / XL
# inheritance recorded in OMIM (rows with a missing value also pass m2).
m0 = df["csq"].isin(["frameshift_variant", "stop_gained"])
m1 = df["constraint"].eq("constrained")
m2 = ~df["omim_inheritance_simple"].isin(["AD", "AD_AR", "XL"])
# The pLI / LOEUF filter is deliberately not applied in this cell:
# m3 = (df["pli"] < 0.9) & (df["loeuf"] > 0.6)

print("Note that DNVs are duplicated if overlapping >1 transcript.")
print(f"dnPTVs: {m0.sum()}")
print(f"dnPTVs in constrained regions: {(m0 & m1).sum()}")
print(f"dnPTVs in constrained regions of non-morbid genes: {(m0 & m1 & m2).sum()}")
# The corresponding pLI / LOEUF summary is likewise disabled:
# print(f"dnPTVs in constrained regions of non-morbid genes with weak pLI scores: {len(df[m0 & m1 & m2 & m3])}")

# Distinct transcripts whose region-level dnPTV count is exactly 1, 2 or 3
candidate = m0 & m1 & m2
for n in (1, 2, 3):
    m4 = df["n_trunc_region"] == n
    enst = df.loc[candidate & m4, "enst"].drop_duplicates()
    print(f"Non-morbid transcripts with exactly {n} dnPTVs in a constrained region: {len(enst)}")

Note that DNVs are duplicated if overlapping >1 transcript.
dnPTVs: 7247
dnPTVs in constrained regions: 2277
dnPTVs in constrained regions of non-morbid genes: 516
Non-morbid transcripts with exactly 1 dnPTVs in a constrained region: 266
Non-morbid transcripts with exactly 2 dnPTVs in a constrained region: 53
Non-morbid transcripts with exactly 3 dnPTVs in a constrained region: 16


In [8]:
# Transcripts with >=3 truncating DNMs counted within one (transcript, region)
# group, where that region is labelled as constrained
m1 = df["n_trunc_region"].ge(3)
m2 = df["constraint"].eq("constrained")

enst_3 = df.loc[m1 & m2, "enst"].drop_duplicates()
n_enst_3 = len(enst_3)
print(f"Transcripts with at least 3 dnPTVs in a constrained region: {n_enst_3}")

Transcripts with at least 3 dnPTVs in a constrained region: 130


In [9]:
# Non-morbid transcripts: those with at least one row whose OMIM inheritance is
# not AD, AD_AR or XL. Rows with a missing inheritance value count as non-morbid.
morbid_inheritance = ["AD", "AD_AR", "XL"]
m1 = ~df["omim_inheritance_simple"].isin(morbid_inheritance)

enst_non_morbid = df.loc[m1, "enst"].drop_duplicates()

In [10]:
# Intersect the >=3 dnPTV transcripts with the non-morbid transcripts
m1 = enst_3.isin(enst_non_morbid)
enst_3_non_morbid = enst_3.loc[m1]

n_enst_3_non_morbid = len(enst_3_non_morbid)
print(f">=3 DNMs and non-morbid (not AD or XL): {n_enst_3_non_morbid}")

>=3 DNMs and non-morbid (not AD or XL): 28


In [11]:
# gnomAD constraint data
# Source column name -> short name used in this notebook.
gnomad_columns = {
    "transcript": "enst",
    "lof.pLI": "pli",
    "lof.oe_ci.upper": "loeuf",
}

# read_csv returns the `usecols` columns in file order, not in the order listed,
# so the columns are renamed by name rather than by position. A positional
# rename would silently swap `pli` and `loeuf` if the file ordered them
# differently. The final selection fixes the column order.
gnomad = pd.read_csv(
    "data/raw/gnomad.v4.0.constraint_metrics.tsv",
    sep="\t",
    header=0,
    usecols=list(gnomad_columns),
).rename(columns=gnomad_columns)[list(gnomad_columns.values())]

# Get transcripts with strong pLI / LOEUF scores
# A transcript qualifies if either score passes its threshold. Missing scores
# compare as False, so a transcript with both scores missing is not selected.
m1 = gnomad["pli"] > 0.9
m2 = gnomad["loeuf"] < 0.6
hi_pli = gnomad.loc[m1 | m2, "enst"]

In [12]:
# Transcripts with >=3 dnPTVs that are NOT in the strong pLI / LOEUF set.
# Anything absent from `hi_pli` is kept, which includes transcripts that have no
# usable gnomAD score at all.
m1 = enst_3.isin(hi_pli)
enst_new = enst_3.loc[~m1]
n_enst_new = len(enst_new)
print(f"Transcripts with >=3 dnPTVs and weak pLI / LOEUF scores: {n_enst_new}")

# Same exclusion applied to the non-morbid subset (no AD/XL phenotype in OMIM)
m1 = enst_3_non_morbid.isin(hi_pli)
enst_new_non_morbid = enst_3_non_morbid.loc[~m1]
n_enst_new_non_morbid = len(enst_new_non_morbid)
print(f"... of which no AD/XL phenotype in OMIM: {n_enst_new_non_morbid}")

Transcripts with >=3 dnPTVs and weak pLI / LOEUF scores: 9
... of which no AD/XL phenotype in OMIM: 3


In [13]:
# Show the truncating DNMs that fall in the newly identified transcripts,
# ordered by transcript identifier
m1 = df["enst"].isin(enst_new_non_morbid)
m2 = df["csq"].isin(["frameshift_variant", "stop_gained"])
df.loc[m1 & m2].sort_values(by="enst")

Unnamed: 0,enst,chr,pos,ref,alt,symbol,omim_inheritance_simple,region,constraint,pli,loeuf,n_trunc_region,n_trunc_transcript,cohort,id,csq,case_solved
26477,ENST00000537592,chr18,78992997,C,T,SALL3,,long_exon,constrained,0.000408,0.754,4.0,4,gel,112007818_10004.1,stop_gained,no
26483,ENST00000537592,chr18,78995025,C,T,SALL3,,long_exon,constrained,0.000408,0.754,4.0,4,gel,125000331_10000.2,stop_gained,no
26482,ENST00000537592,chr18,78994230,A,T,SALL3,,long_exon,constrained,0.000408,0.754,4.0,4,ddd,DDD13k.06426,stop_gained,
26484,ENST00000537592,chr18,78995055,C,T,SALL3,,long_exon,constrained,0.000408,0.754,4.0,4,rumc,rumc_patient_910,stop_gained,
30166,ENST00000568956,chr19,55615759,G,A,ZNF865,,distal_nmd,constrained,0.63427,1.077,3.0,3,ddd,DDD13k.09150,stop_gained,
30168,ENST00000568956,chr19,55615913,-,G,ZNF865,,distal_nmd,constrained,0.63427,1.077,3.0,3,gdx,102598,frameshift_variant,
30169,ENST00000568956,chr19,55615978,C,-,ZNF865,,distal_nmd,constrained,0.63427,1.077,3.0,3,gdx,43787,frameshift_variant,
19011,ENST00000629685,chr15,92927258,T,A,ENSG00000279765,,distal_nmd,constrained,0.071589,1.199,3.0,3,ddd,DDD13k.01698,stop_gained,
19009,ENST00000629685,chr15,92924529,G,T,ENSG00000279765,,distal_nmd,constrained,0.071589,1.199,3.0,3,gdx,36725,stop_gained,
19013,ENST00000629685,chr15,92927289,C,T,ENSG00000279765,,distal_nmd,constrained,0.071589,1.199,3.0,3,gdx,67563,stop_gained,
