# Get all annotated splice junctions based on GTF File (maxEnt.py)

This notebook walks through the function that extracts all annotated splice junctions from a GTF file.

In [1]:
#!usr/bin/python3
import pandas as pd
import numpy as np
import subprocess
from sys import argv
import time
import os
import itertools

### Inputs and initial setup of the function

The input to this function is the GTF file loaded as a pandas DataFrame. We first collect all the transcripts in the GTF and then, for each of them, find all the exons associated with it.

In [2]:
def all_spl_junctions(df):
    """Collect every annotated splice junction implied by the exons of a GTF table.

    Parameters
    ----------
    df : pandas.DataFrame
        GTF records, one row per feature. The columns read are
        ``transcript_id``, ``feature``, ``seqname``, ``start``, ``end`` and
        ``strand``. ``start``/``end`` must support arithmetic (assumed to be
        integers -- confirm against whatever parser produced ``df``).
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per unique junction with columns
        ``['seqname', 'start', 'end', 'strand']``, where ``start`` is the
        previous exon's end + 1 and ``end`` is the next exon's start - 1.
        ``seqname`` and ``strand`` are cast to ``str``. Rows are sorted by
        (seqname, start, end, strand) so the output is deterministic. The
        frame is empty (same columns) when no junction exists.

    Notes
    -----
    Progress is printed to stdout every 10000 transcripts (elapsed CPU time
    and a counter), starting with the first transcript.
    """
    # Keep only exon rows that belong to a transcript. Filtering on the
    # feature *before* grouping means transcripts without any exon rows are
    # simply never visited (they used to raise IndexError further down).
    is_transcript_exon = df['transcript_id'].notna() & (df['feature'] == 'exon')
    exons = df.loc[is_transcript_exon,
                   ['transcript_id', 'seqname', 'start', 'end', 'strand']]

    # Group the exons per transcript in a single pass instead of rescanning
    # the whole table once per transcript.
    exons_by_transcript = {}
    for tid, seqname, exon_start, exon_end, strand in exons.itertuples(index=False, name=None):
        exons_by_transcript.setdefault(tid, []).append(
            [seqname, exon_start, exon_end, strand])

    junctions = set()
    start = time.process_time()
    n_transcripts = len(exons_by_transcript)

    # For each transcript, order the exons based on start site. After
    # ordering, we measure the distance between consecutive exons to make
    # sure it is greater than 1 (there can't be a splice junction if the
    # exons overlap or are right next to each other). We then take the
    # coordinates in between each pair of exons and define that as the
    # splice junction.
    for ind, exon_list in enumerate(exons_by_transcript.values()):
        exon_list.sort()
        merged = [exon_list[0]]
        for exon in exon_list[1:]:
            if exon[1] - merged[-1][2] <= 1:
                # Overlapping or abutting exon: extend the current block.
                # max() keeps the far end when the new exon is nested inside
                # the block already being built.
                merged[-1][2] = max(merged[-1][2], exon[2])
            else:
                merged.append(exon)

        # seqname and strand are taken from the first exon of the transcript.
        seqname, strand = merged[0][0], merged[0][3]
        for upstream, downstream in zip(merged, merged[1:]):
            junctions.add((seqname, upstream[2] + 1, downstream[1] - 1, strand))

        if ind % 10000 == 0:
            print(time.process_time() - start, '%i of %i' % (ind, n_transcripts))

    # Output all the junctions as a dataframe. The set is sorted first so the
    # row order does not depend on hash ordering.
    ordered = sorted(junctions, key=lambda j: (str(j[0]), j[1], j[2], str(j[3])))
    result = pd.DataFrame(ordered, columns=['seqname', 'start', 'end', 'strand'])
    # astype returns a new frame; the converted copy must be the one returned.
    convert_type = {'seqname': str, 'strand': str}
    return result.astype(convert_type)