In [19]:
# 2018-03-29
# A. Pendleton
# Removal of duplicate PASA transcripts that have been processed through Transdecoder

In [2]:
#this uses iPython magic to make plots appear inline
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import subprocess
import sys
import numpy as np
import matplotlib.patches as patches
import gzip
import fileinput
import glob
from scipy import stats
import re
from matplotlib_venn import venn3, venn3_circles
from collections import OrderedDict


def count_lines(f):
    lineCount = 0
    with open(f, 'r') as f:
        for line in f:
            lineCount += 1
        return lineCount
def runCMD(cmd):
    val = subprocess.Popen(cmd, shell=True).wait()
    if val == 0:
        pass
    else:
        print ('command failed')
        print (cmd)
        sys.exit(1)
# TO REMOVE TOP AND RIGHT AXIS OF PLOTS
def simpleaxis(ax):
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()

In [42]:
###INPUT INFORMATION
inDir = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/transdecoder/results/'
bed_inFile = inDir + 'pasa-lite_ZoeyTrinity.pasa_assembled_alignments.fa.transdecoder.genome.bed'


# Read in transcript ID data into dictionary

In [49]:
geneDict = {}
processed = []
transcript_count = 0

for line in open(bed_inFile, 'r'):
    line=line.rstrip().split()
    if 'track' in line[0]: #skip header
        continue
    #Read in information on gene
    chrom = line[0]

    #Keep track of how many have been processed
    transcript_count += 1
    """if transcript_count > 10:
        break"""
    geneID = line[3].split(';')[1]
    fullID = line[3]
    exonCount = int(line[9])
    exonLengths = line[10]
    #If geneID not already in dictionary, then add
    if geneID not in geneDict.keys():
        geneDict[geneID] = []
    geneDict[geneID].append([fullID, exonCount, exonLengths])
    
print('%i genes added to dictionary' % len(geneDict.keys()))
print('%i transcripts processed' % transcript_count)

61202 genes added to dictionary
198604 transcripts processed


In [65]:
largest_geneDict = {}
largest_noUTR_geneDict = {}

for LINE in open(bed_inFile, 'r'):
    line=LINE.rstrip().split()
    if 'track' in line[0]: #skip header
        continue
    #Read in information on gene
    chrom = line[0]

    #Keep track of how many have been processed
    transcript_count += 1
    geneID = line[3].split(';')[1]
    fullID = line[3]
    exonCount = int(line[9])
    
    if 'chr3.g2424' not in geneID:
        #print(length,exonCount,exonLengths,fullID)
        continue
    ##GET GENE LENGTH BY ADDING UP EXONS
    if ',' in line[10]:
        exonLengths = line[10].split(',')
        length = 0
        for l in exonLengths:
            length+=int(l)
    else:
        length = int(line[10])
        
    #GET UTR LENGTHS
    UTR5 = int(line[6])-int(line[1])
    UTR3 = int(line[2])-int(line[7])
    
    UTRlength = UTR5+UTR3
    lengthNonUTR = length-UTRlength
    
    if geneID not in largest_geneDict.keys():
        largest_geneDict[geneID] = [length, line]
    if length > int(largest_geneDict[geneID][0]):
        largest_geneDict[geneID] = [length, line]
    if 'chr3.g2424' in geneID:
        print(length,UTR5,UTR3,UTRlength,lengthNonUTR,exonCount,fullID)

5072 2120 78875 80995 -75923 24 ID=Gene.268735::chr3.g2424.i9::g.268735::m.268735;chr3.g2424;ORF
4999 2120 124522 126642 -121643 24 ID=Gene.268733::chr3.g2424.i5::g.268733::m.268733;chr3.g2424;ORF
5257 2120 124522 126642 -121385 26 ID=Gene.268699::chr3.g2424.i3::g.268699::m.268699;chr3.g2424;ORF
5527 2120 124522 126642 -121115 26 ID=Gene.268705::chr3.g2424.i2::g.268705::m.268705;chr3.g2424;ORF
3655 766 78875 79641 -75986 25 ID=Gene.268725::chr3.g2424.i17::g.268725::m.268725;chr3.g2424;ORF
4892 2120 124522 126642 -121750 23 ID=Gene.268694::chr3.g2424.i7::g.268694::m.268694;chr3.g2424;ORF
5096 2120 124522 126642 -121546 24 ID=Gene.268720::chr3.g2424.i8::g.268720::m.268720;chr3.g2424;ORF
5770 3664 369 4033 1737 10 ID=Gene.268730::chr3.g2424.i20::g.268730::m.268730;chr3.g2424;ORF
5126 2120 78875 80995 -75869 25 ID=Gene.268737::chr3.g2424.i10::g.268737::m.268737;chr3.g2424;ORF
4081 766 125358 126124 -122043 26 ID=Gene.268715::chr3.g2424.i18::g.268715::m.268715;chr3.g2424;ORF
5054 2120 12535

In [51]:
print('%i of the largest genes added' % len(largest_geneDict.keys()))
print('Reduced from %i transcripts' % transcript_count)

61202 of the largest genes added
Reduced from 397208 transcripts


In [52]:
#WRITE TO OUTFILE
outFile = open(inDir + '../ReducedGeneFiles_NoRedudant/' + 'ReducedTranscript_SingleLongestGene_PASA_Transdecoder.bed','w')

for gene in largest_geneDict.keys():
    outFile.write('\t'.join(map(str,largest_geneDict[gene][1]))+'\n')
outFile.close()

In [55]:
for gene in largest_geneDict.keys():
    if 'chr3.g2424' in gene:
        print(largest_geneDict[gene])

[399, ['chr3', '71030992', '71031391', 'ID=Gene.256627::chr3.g24240.i1::g.256627::m.256627;chr3.g24240;ORF', '0', '-', '71031023', '71031389', '0', '1', '399', '0']]
[7628, ['chr3', '7971229', '8006070', 'ID=Gene.268707::chr3.g2424.i19::g.268707::m.268707;chr3.g2424;ORF', '0', '-', '7974893', '8003408', '0', '9', '3876,54,100,116,201,211,73,117,2880', '0,9325,11636,16678,18794,19417,20556,22603,31961']]
