# NMD annotations
This script labels genomic positions with an NMD annotation. The annotations include:
1) Start-proximal (<150nt from translation start site)
2) Long exons (>400nt upstream of the splice donor site)
3) Last exon
4) 50nt rule (within the 3'-most 50nt of the penultimate exon)

There are several approaches I could take.
First, I could annotate every position in the exome with these criteria.
Alternatively, I could construct a bed file of different regions.

Generally, I would favour the per-site annotations for maximum flexibility.
So I will need to annotate:
1) The distance from the translation start site
2) The distance upstream from the splice donor site
3) Whether it is the last exon
4) Whether it is the last 50nt of the penultimate exon

## Import modules

In [None]:
conda install -c bioconda gtfparse -y

In [None]:
%%bash
dx download -o ../data/ data/gencode.v39.annotation.gtf

In [5]:
# Import the relevant modules
import numpy as np
import pandas as pd
import gtfparse

In [8]:
def get_gencode_gtf(path):
    """Load the GENCODE .gtf file at ``path`` using ``gtfparse.read_gtf``.

    Returns the parsed table exactly as ``gtfparse.read_gtf`` produces it.
    """
    return gtfparse.read_gtf(path)

In [9]:
def get_canonical_cds(gtf):
    """Return the CDS rows of Ensembl_canonical protein-coding transcripts.

    The input table must provide the columns ``feature``, ``tag``,
    ``gene_type``, ``transcript_id`` and ``exon_number``. The result is a
    copy (the input is not modified) in which ``exon_number`` is cast to
    int and a new ``cds_number`` column holds the rank of each row's
    ``exon_number`` within its transcript.
    """
    # Build the three row filters separately, then combine them.
    is_cds = gtf["feature"] == "CDS"
    is_canonical = gtf["tag"].str.contains("Ensembl_canonical")
    is_protein_coding = gtf["gene_type"] == "protein_coding"
    selected = gtf[is_cds & is_canonical & is_protein_coding].copy()

    # Number the CDS features within each transcript by ranking their
    # (integer) exon numbers.
    selected["exon_number"] = selected["exon_number"].astype(int)
    exon_numbers_by_transcript = selected.groupby("transcript_id")["exon_number"]
    selected["cds_number"] = exon_numbers_by_transcript.rank().astype(int)
    return selected

In [7]:
if __name__ == "__main__":
    # Read GTF data
    gencode_path = "../data/gencode.v39.annotation.gtf"
    gtf = get_gencode_gtf(gencode_path)

    # Per-feature table: one row per CDS interval of each canonical
    # transcript, with every GTF column (including gene_id) still present.
    cds_features = get_canonical_cds(gtf)

    # Per-site table: one row per coding position, for the NMD annotations.
    cds = cds_features[
        [
            "seqname",
            "start",
            "end",
            "strand",
            "transcript_id",
            "exon_id",
            "exon_number",
            "cds_number",
        ]
    ]
    cds = cds.set_index(["seqname", "transcript_id", "exon_id"])
    # GTF coordinates are 1-based and inclusive, hence the "+ 1" on the end.
    cds["pos"] = cds.apply(lambda x: list(range(x["start"], x["end"] + 1)), axis=1)
    cds = cds.explode("pos")
    # Total number of coding positions in the transcript, repeated on every
    # row. transform() returns a result aligned to the frame's own index;
    # a plain count() is indexed by transcript_id only and cannot be
    # assigned row-by-row onto the three-level index.
    cds["cds_len"] = cds.groupby(level="transcript_id")["pos"].transform("count")

    # Convert .gtf data to .bed format
    # The BED export uses the per-feature table: the id fields listed below
    # must be available (gene_id is not kept in the per-site table, and
    # transcript_id / exon_id live in its index).
    # NOTE(review): gtf_to_bed and write_bed are not defined in the visible
    # part of this notebook; this assumes gtf_to_bed reads the id fields as
    # ordinary columns -- confirm against its definition.
    bed_ids = ["gene_id", "transcript_id", "exon_id", "cds_number"]
    cds_bed = gtf_to_bed(cds_features, bed_ids)

    # Write to output
    gencode_version = "v39"
    feature = "cds"
    out_path_chr = f"../outputs/gencode_{gencode_version}_canonical_{feature}_chr.bed"

    write_bed(cds_bed, out_path_chr, "chr")



  chunk_iterator = pd.read_csv(


  chunk_iterator = pd.read_csv(
INFO:root:Extracted GTF attributes: ['gene_id', 'gene_type', 'gene_name', 'level', 'hgnc_id', 'havana_gene', 'transcript_id', 'transcript_type', 'transcript_name', 'transcript_support_level', 'tag', 'havana_transcript', 'exon_number', 'exon_id', 'ont', 'protein_id', 'ccdsid']


Unnamed: 0,seqname,start,end,id,score,strand
61,chr1,65564,65573,"ENSG00000186092.7,ENST00000641515.2,ENSE000038...",.,+
64,chr1,69036,70005,"ENSG00000186092.7,ENST00000641515.2,ENSE000038...",.,+
322,chr1,450742,451678,"ENSG00000284733.2,ENST00000426406.4,ENSE000023...",.,-
449,chr1,685718,686654,"ENSG00000284662.2,ENST00000332831.5,ENSE000023...",.,-
943,chr1,924431,924948,"ENSG00000187634.13,ENST00000616016.5,ENSE00001...",.,+
...,...,...,...,...,...,...
3240772,chrY,57209531,57209733,"ENSG00000182484.15_PAR_Y,ENST00000359512.8_PAR...",.,+
3240774,chrY,57209821,57209980,"ENSG00000182484.15_PAR_Y,ENST00000359512.8_PAR...",.,+
3240776,chrY,57210639,57210792,"ENSG00000182484.15_PAR_Y,ENST00000359512.8_PAR...",.,+
3240778,chrY,57211551,57211620,"ENSG00000182484.15_PAR_Y,ENST00000359512.8_PAR...",.,+
