# NMD annotations
This script labels genomic positions with an NMD annotation. The annotations include:
1) Start-proximal (<150nt from translation start site)
2) Long exons (>400nt upstream of the splice donor site)
3) Last exon
4) 50nt rule (within the most 3' 50nt of the penultimate exon)

There are several approaches I could take.
First, I could annotate every position in the exome with these criteria.
Alternatively, I could construct a bed file of different regions.

Generally, I would favour the per-site annotations for maximum flexibility.
So for each position, I will need to annotate:
1) The distance from the translation start site
2) The distance upstream from the splice donor site
3) Whether it is the last exon
4) Whether it is the last 50nt of the penultimate exon

## Import modules

In [1]:
# Install gtfparse (imported below) from the bioconda channel; -y skips the confirmation prompt
conda install -c bioconda gtfparse -y

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [2]:
%%bash
# Fetch the GENCODE v39 annotation GTF into ../data/ using the dx command-line client
dx download -f -o ../data/ data/gencode.v39.annotation.gtf

In [3]:
# Import the relevant modules
import numpy as np
import pandas as pd
import gtfparse

In [4]:
def get_gencode_gtf(path):
    """Parse the GENCODE .gtf file at ``path`` with gtfparse and return the result."""
    return gtfparse.read_gtf(path)

In [5]:
def get_canonical_cds(gtf):
    """Return the CDS features of Ensembl_canonical protein-coding transcripts.

    ``gtf`` is a table with (at least) the columns ``feature``, ``tag``,
    ``gene_type``, ``exon_number``, ``transcript_id``, ``exon_id``, ``start``
    and ``end``.

    The returned table has one row per CDS feature and, in addition to the
    input columns, carries:

    * ``exon_number`` cast to int
    * ``exon_count``  - highest exon number seen in the transcript
    * ``exon_start`` / ``exon_end`` - bounds of the exon holding the CDS
      (these can differ from the CDS ``start`` / ``end``)
    * ``cds_number``  - rank of the CDS feature within its transcript,
      ordered by exon number
    """
    # Keep exon and CDS records of canonical transcripts in protein-coding genes
    is_exon_or_cds = gtf["feature"].isin(["exon", "CDS"])
    is_canonical = gtf["tag"].str.contains("Ensembl_canonical")
    is_protein_coding = gtf["gene_type"] == "protein_coding"
    features = gtf[is_exon_or_cds & is_canonical & is_protein_coding].copy()

    # Exon count per transcript; computed while the exon records are still
    # present so that exons without any CDS are counted too
    features["exon_number"] = features["exon_number"].astype(int)
    features["exon_count"] = (
        features.groupby("transcript_id")["exon_number"].transform("max")
    )

    # Lookup table of exon bounds, one row per distinct (id, start, end)
    exon_bounds = (
        features.loc[features["feature"] == "exon", ["exon_id", "start", "end"]]
        .rename(columns={"start": "exon_start", "end": "exon_end"})
        .drop_duplicates()
    )

    # Keep the CDS records and attach the bounds of their exon. The merge uses
    # the columns shared by both tables (expected to be exon_id only).
    coding = features[features["feature"] == "CDS"]
    coding = coding.merge(exon_bounds, how="left")
    coding = coding.drop_duplicates(["transcript_id", "exon_id", "start", "end"])

    # Number the CDS features within each canonical transcript
    coding["cds_number"] = (
        coding.groupby("transcript_id")["exon_number"].rank().astype(int)
    )
    return coding

In [6]:
if __name__ == "__main__":
    # Load the GENCODE annotation into memory
    gencode_path = "../data/gencode.v39.annotation.gtf"
    gtf = get_gencode_gtf(gencode_path)

    # Canonical CDS features, restricted to the columns needed downstream
    keep_columns = [
        "seqname",
        "start",
        "end",
        "exon_start",
        "exon_end",
        "strand",
        "transcript_id",
        "exon_id",
        "exon_count",
        "exon_number",
        "cds_number",
    ]
    cds = get_canonical_cds(gtf)[keep_columns]

    # Keep only the part of the transcript ID before the first "."
    cds["transcript_id"] = cds["transcript_id"].str.split(".").str[0]
    cds = cds.set_index(["seqname", "transcript_id", "exon_id"])

    # Expand each CDS feature to one row per position: list every coordinate
    # from start to end (inclusive), then explode the lists into rows
    cds["pos"] = cds.apply(
        lambda row: list(range(row["start"], row["end"] + 1)), axis=1
    )
    cds = cds.explode("pos")

    # Number of CDS positions per transcript, and per exon
    cds["cds_len"] = cds.groupby(level="transcript_id")["pos"].transform("count")
    cds["exon_len"] = cds.groupby(level="exon_id")["pos"].transform("count")



  chunk_iterator = pd.read_csv(


  chunk_iterator = pd.read_csv(
INFO:root:Extracted GTF attributes: ['gene_id', 'gene_type', 'gene_name', 'level', 'hgnc_id', 'havana_gene', 'transcript_id', 'transcript_type', 'transcript_name', 'transcript_support_level', 'tag', 'havana_transcript', 'exon_number', 'exon_id', 'ont', 'protein_id', 'ccdsid']


## NMD annotations for + transcripts 

In [7]:
# Work on an independent copy of the + strand positions
on_plus_strand = cds["strand"] == "+"
fwd = cds[on_plus_strand].copy()

### Start proximal (distance from start codon)

In [8]:
# Rank CDS positions by ascending coordinate within each transcript, so the
# lowest coordinate of a + strand transcript gets start_distance 1
fwd_positions = fwd.groupby(level="transcript_id")["pos"]
fwd["start_distance"] = fwd_positions.rank(ascending=True).astype(int)

### Long exons (distance upstream from splice junction)

In [9]:
# Number of bases from each position to the end of its exon, counting the
# position itself (the last base of the exon has distance 1)
fwd["splice_donor_distance"] = fwd["exon_end"] - fwd["pos"] + 1

# Positions in the last exon have no downstream splice donor site,
# so the distance is left undefined (NaN) for them
fwd_in_last_exon = fwd["exon_number"] == fwd["exon_count"]
fwd.loc[fwd_in_last_exon, "splice_donor_distance"] = np.nan

### Last exon

In [10]:
# Flag (1/0) positions whose exon is the final exon of the transcript
fwd_is_final_exon = fwd["exon_number"] == fwd["exon_count"]
fwd["last_exon"] = np.where(fwd_is_final_exon, 1, 0)

### 50nt rule

In [11]:
# Flag (1/0) positions in the penultimate exon that lie within the final
# 50 nt of that exon (distance to exon_end, counting the position itself)
fwd_in_penultimate_exon = fwd["exon_number"] == fwd["exon_count"] - 1
fwd_within_fifty_nt = (fwd["exon_end"] - fwd["pos"]) + 1 <= 50
fwd["fifty_nt_rule"] = np.where(fwd_in_penultimate_exon & fwd_within_fifty_nt, 1, 0)

## NMD annotations for - transcripts 

In [15]:
# Work on an independent copy of the - strand positions
on_minus_strand = cds["strand"] == "-"
rev = cds[on_minus_strand].copy()

### Start proximal (distance from start codon)

In [16]:
# Rank CDS positions by descending coordinate within each transcript, so the
# highest coordinate of a - strand transcript gets start_distance 1
rev_positions = rev.groupby(level="transcript_id")["pos"]
rev["start_distance"] = rev_positions.rank(ascending=False).astype(int)

### Long exons (distance upstream from splice junction)

In [17]:
# On the - strand the distance is measured to the exon's lowest coordinate
# (exon_start), counting the position itself
rev["splice_donor_distance"] = rev["pos"] - rev["exon_start"] + 1

# Positions in the last exon (i.e. contiguous with the 3' UTR) have no
# downstream splice donor site, so the distance is left undefined (NaN)
rev_in_last_exon = rev["exon_number"] == rev["exon_count"]
rev.loc[rev_in_last_exon, "splice_donor_distance"] = np.nan

### Last exon

In [18]:
# Flag (1/0) positions whose exon is the final exon of the transcript
rev_is_final_exon = rev["exon_number"] == rev["exon_count"]
rev["last_exon"] = np.where(rev_is_final_exon, 1, 0)

### 50nt rule

In [19]:
# Flag (1/0) positions in the penultimate exon that lie within 50 nt of the
# exon's lowest coordinate (exon_start), counting the position itself
rev_in_penultimate_exon = rev["exon_number"] == rev["exon_count"] - 1
rev_within_fifty_nt = (rev["pos"] - rev["exon_start"]) + 1 <= 50
rev["fifty_nt_rule"] = np.where(rev_in_penultimate_exon & rev_within_fifty_nt, 1, 0)

## Merge fwd and rev annotations

In [20]:
# Stack the + and - strand tables, then move the index levels back into
# ordinary columns so the rows get a fresh 0..n-1 index
df = pd.concat([fwd, rev]).reset_index()

## Unify NMD annotations

In [21]:
# Describe sites with overlapping NMD annotations 
a = pd.Series(np.where(df["start_distance"] <= 150, "start_proximal,", ""))
b = pd.Series(np.where(df["splice_donor_distance"] > 400, "long_exon,", ""))
c = pd.Series(np.where(df["fifty_nt_rule"] == 1, "fifty_nt,", ""))
d = pd.Series(np.where(df["last_exon"] == 1, "last_exon,", ""))

df["nmd"] = pd.Series(["".join([w,x,y,z]) for w,x,y,z in zip(a,b,c,d)])

# Sites with no NMD-escape annotation are NMD targets
df["nmd"] = df["nmd"].replace("", "nmd_target")

# Create a definitive NMD annotation:
## "last_exon" and "fifty_nt" annotations are merged into "distal_nmd"
## Overlapping annotations are removed, with this priority: 
## "start_proximal" > "distal_nmd" > "long_exon"
df["nmd_definitive"] = df["nmd"].copy()
df.loc[df["nmd_definitive"].str.contains("start_proximal"), "nmd_definitive"] = "start_proximal"
df.loc[(df["nmd_definitive"].str.contains("fifty_nt")) | (df["nmd_definitive"].str.contains("last_exon")), "nmd_definitive"] = "distal_nmd"
df["nmd_definitive"] = df["nmd_definitive"].replace("long_exon,", "long_exon")

## Drop unnecessary columns

In [22]:
# Keep only the columns written to the output, and rename seqname to chr
output_columns = ["seqname", "pos", "transcript_id", "nmd", "nmd_definitive"]
df = df[output_columns].rename(columns={"seqname": "chr"})

## Save to output

In [25]:
# Write the per-position annotations as a tab-separated file, without the row index
df.to_csv("../outputs/nmd_annotations.tsv", sep="\t", index=False)
# Remove any previous copy from the project's outputs/ folder, then upload the new file
! dx rm -f outputs/nmd_annotations.tsv
! dx upload --destination outputs/ ../outputs/nmd_annotations.tsv

ID                    file-GQ9V140J7yjKj42X7JBjkzY3
Class                 file
Project               project-GKK5xq0J7yj8yZZ863Jgg51x
Folder                /outputs
Name                  nmd_annotations.tsv
State                 closing
Visibility            visible
Types                 -
Properties            -
Tags                  -
Outgoing links        -
Created               Thu Mar 16 12:50:24 2023
Created by            alexander.blakes
 via the job          job-GQ9Kqy8J7yjPxgjvxb9q489Q
Last modified         Thu Mar 16 12:50:31 2023
Media type            
archivalState         "live"
cloudAccount          "cloudaccount-dnanexus"
