In [1]:
""" 
"""

' \n'

## Imports

In [2]:
import pandas as pd

## Read data

In [3]:
def get_dnms(path="../outputs/dnms_vep_clean.tsv"):
    """Load the DNM table from a tab-separated file.

    Args:
        path: Location of the TSV file. The first row is used as the header.

    Returns:
        A DataFrame holding every column of the file.
    """
    return pd.read_csv(path, sep="\t")

In [4]:
def get_constraint(path="../outputs/constraint_annotation.tsv"):
    """Load the constraint annotation table from a tab-separated file.

    Args:
        path: Location of the TSV file. The first row is used as the header.

    Returns:
        A DataFrame holding every column of the file.
    """
    return pd.read_csv(path, sep="\t")

In [5]:
def get_nmd(path="../data/nmd_annotations.tsv"):
    """Load the NMD annotations from a tab-separated file.

    The file's own header row is discarded and its five columns are
    relabelled "chr", "pos", "enst", "nmd_all" and "nmd". The "nmd_all"
    column is not loaded.

    Args:
        path: Location of the TSV file.

    Returns:
        A DataFrame with the columns "chr", "pos", "enst" and "nmd".
    """
    all_columns = ["chr", "pos", "enst", "nmd_all", "nmd"]
    kept_columns = [name for name in all_columns if name != "nmd_all"]

    return pd.read_csv(
        path,
        sep="\t",
        header=0,  # Skip the existing header; `names` replaces it
        names=all_columns,
        usecols=kept_columns,
    )

In [6]:
def get_genes(path="../data/gene_ids.tsv"):
    """Load the gene ID / transcript ID pairs from a tab-separated file.

    Only the "gene_id" and "transcript_id" columns are read. They are
    returned as "ensg" and "enst" respectively, with everything from the
    first "." onwards (the version suffix) removed. Rows which become
    identical after the suffix is removed are dropped.

    Args:
        path: Location of the TSV file. Its header row must contain
            "gene_id" and "transcript_id", in any position.

    Returns:
        A DataFrame with the columns "ensg" and "enst", in that order.
    """
    renames = {"gene_id": "ensg", "transcript_id": "enst"}

    genes = (
        pd.read_csv(path, sep="\t", header=0, usecols=list(renames))
        # `usecols` keeps the file's column order, so relabel by name rather
        # than by position, then fix the output order explicitly.
        .rename(columns=renames)
        .loc[:, ["ensg", "enst"]]
        # Remove version numbers
        .assign(
            ensg=lambda df: df["ensg"].str.split(".").str[0],
            enst=lambda df: df["enst"].str.split(".").str[0],
        )
        # Stripping versions can make rows identical
        .drop_duplicates()
    )

    return genes

In [7]:
def get_omim(path="../outputs/omim_genemap2_simple.tsv"):
    """Load the OMIM gene / inheritance pairs from a tab-separated file.

    Args:
        path: Location of the TSV file. Its header row must contain
            "ensg" and "inheritance".

    Returns:
        A DataFrame with only the "ensg" and "inheritance" columns and
        with repeated rows removed.
    """
    wanted = ["ensg", "inheritance"]
    table = pd.read_csv(path, sep="\t", usecols=wanted)
    return table.drop_duplicates()

# TODO Keep phenotype column in OMIM dataset

In [8]:
# Load every input table from its default location
dnms, cst, nmd, genes, omim = (
    get_dnms(),
    get_constraint(),
    get_nmd(),
    get_genes(),
    get_omim(),
)

## Merge data

In [9]:
# Merge OMIM with gene IDs (inner join on the shared columns), then replace
# the gene ID with a constant "morbid" flag for every matched transcript
m1 = pd.merge(omim, genes).drop(columns="ensg").assign(omim="morbid")

In [10]:
# Left-join the OMIM data onto the constraint table. Rows without an OMIM
# match get "non_morbid" in the "omim" column.
m2 = pd.merge(cst, m1, how="left").fillna(value={"omim": "non_morbid"})

In [11]:
# Left-join the NMD annotations onto the VEP-annotated DNMs, keeping the first
# row for each variant / transcript combination
m3 = pd.merge(dnms, nmd, how="left").drop_duplicates(
    subset=["chr", "pos", "ref", "alt", "enst"]
)

The NMD annotations file contains the annotations for each canonical transcript in GENCODE. DNMs lacking an NMD annotation are probably those which fall in non-canonical transcripts.

In [12]:
# Left-join the constraint and OMIM columns onto the NMD-annotated DNMs
m4 = pd.merge(
    m3,
    m2.loc[:, ["nmd", "enst", "non_exp", "constraint", "inheritance", "omim"]],
    how="left",
)

In [26]:
# TODO: Combine all OMIM annotations per gene before merging with the DNMs.

# This code belongs in a separate script!

In [None]:
# Annotate DNMs with regional constraint annotation.
# `df` was never defined in this notebook, so this cell raised a NameError on
# a fresh kernel.
# NOTE(review): m3 (the DNMs with NMD annotations) is assumed to be the
# intended input, because it is the DNM table which carries the "nmd" and
# "enst" columns selected from `cst` below. Confirm.
rr = m3.merge(cst[["nmd", "enst", "constraint"]], how="left")

### Annotate with gene IDs and symbols

In [None]:
# Read the gene ID table, replacing its header with three fixed column names
ids = pd.read_csv(
    "../data/gene_ids.tsv",
    sep="\t",
    header=0,
    names=["ensg", "enst", "symbol"],
)

# Keep only the part of each ENSG / ENST identifier before the first "."
ids["ensg"] = ids["ensg"].str.split(".").str[0]
ids["enst"] = ids["enst"].str.split(".").str[0]

In [None]:
# Inner-join the gene identifiers onto the DNMs via their shared columns
rr = pd.merge(rr, ids)

# Drop exact duplicate rows (dup ENSTs on sex chromosomes)
rr = rr.drop_duplicates()

In [None]:
# Left-join the OMIM phenotypes onto the DNM transcripts.
# NOTE(review): `omim_simple` is not defined in any cell of this notebook.
# Confirm which table it should be before running this cell.
rr = pd.merge(rr, omim_simple, how="left")

### Save the annotated and curated DNMs to output

In [None]:
# Columns to keep, in the order they are written to the output file
cols = [
    "chr", "pos", "ref", "alt",
    "source", "id", "csq",
    "symbol", "ensg", "enst",
    "nmd", "constraint", "omim",
]

# Restrict to the selected columns (raises KeyError if any are missing)
rr = rr.loc[:, cols]

# Write as TSV without the row index
rr.to_csv("../outputs/dnms_annotated.tsv", sep="\t", index=False)