In [6]:
# 2018-05-01
# A. Pendleton
# Removal of duplicate PASA transcripts that have been processed through Transdecoder

In [7]:
#this uses iPython magic to make plots appear inline
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import subprocess
import sys
import numpy as np
import matplotlib.patches as patches
import gzip
import fileinput
import glob
from scipy import stats
import re
from matplotlib_venn import venn3, venn3_circles
from collections import OrderedDict


def count_lines(f):
    lineCount = 0
    with open(f, 'r') as f:
        for line in f:
            lineCount += 1
        return lineCount
def runCMD(cmd):
    val = subprocess.Popen(cmd, shell=True).wait()
    if val == 0:
        pass
    else:
        print ('command failed')
        print (cmd)
        sys.exit(1)
# TO REMOVE TOP AND RIGHT AXIS OF PLOTS
def simpleaxis(ax):
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()

In [8]:
###INPUT INFORMATION
inDir = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/transdecoder/results/'
bed_inFile = inDir + 'pasa-lite_ZoeyTrinity.pasa_assembled_alignments.fa.transdecoder.genome.bed'


# Read in transcript ID data into dictionary

In [89]:
geneDict = {}
fullDict = {}
processed = []
transcript_count, dupes = 0, 0

for line in open(bed_inFile, 'r'):
    line=line.rstrip().split()
    if 'track' in line[0]: #skip header
        continue
    #Read in information on gene
    chrom = line[0]

    #Keep track of how many have been processed
    transcript_count += 1
    """# For script testing:
    if transcript_count > 10:
        break"""
    geneID = line[3].split(';')[1]
    ID = line[3].replace('ID=','').split(';')[0]

    exonCount = int(line[9])
    exonLengths = line[10]
    #If geneID not already in dictionary, then add
    if geneID not in geneDict.keys():
        geneDict[geneID] = []
    geneDict[geneID].append([fullID, exonCount, exonLengths])
    if ID in fullDict.keys():
        dupes+=1
        continue
    fullDict[ID] = [line]
print('%i genes added to dictionary' % len(geneDict.keys()))
print('%i transcripts processed' % transcript_count)
print('%i genes added to FULL Dictionary where keys are the full length ID' % len(fullDict.keys()))
print('dupes = ', dupes)

61202 genes added to dictionary
198604 transcripts processed
129903 genes added to FULL Dictionary where keys are the full length ID
dupes =  68701


In [105]:
peptideFile = inDir + 'pasa-lite_ZoeyTrinity.pasa_assembled_alignments.fa.transdecoder.pep'
scoreDict = {}

for line in open(peptideFile,'r'):
    if '>' not in line: #skip those not a gene identifier
        continue
    if 'TCONS' in line: #not mapped
        continue
    if 'complete' not in line: #only want the complete gene models
        continue
    fullID = line.rstrip().split(' ')[0].replace('>','')
    line = line.rstrip().split('::')
    #print(line)

    #geneID = line[1].rsplit('.',1)
    geneID = line[1].split('.')[0] + '.' + line[1].split('.')[1]

    transcriptID = line[4]
    score = float(line[5].split(',')[1].split(' ')[0].split('=')[1])
    
    if geneID not in scoreDict.keys():
        scoreDict[geneID] = ['',0,'']#['highestscoring_transcriptID','highestScore']
    if score > scoreDict[geneID][1]:
        scoreDict[geneID] = [transcriptID,score,fullID]
    

print('%i transcripts with highest score added to the dictionary' % len(scoreDict.keys()))

30170 transcripts with highest score added to the dictionary


In [106]:
#Make file with just the transcript IDs that have the highest score
outfile = inDir + 'HighestScoringTranscripts_IDs.txt'
outFile = open(outfile,'w')

for key in scoreDict.keys():
    outFile.write('%s\n' % scoreDict[key][2])
outFile.close()

#Make BED file with just the transcript IDs that have the highest score
outfile = inDir + 'HighestScoringTranscripts_IDs.bed'
outFile = open(outfile,'w')
missing=[]
for key in scoreDict.keys():
    if scoreDict[key][2] not in fullDict.keys():
        missing.append(scoreDict[key][2])
        continue
    info = fullDict[scoreDict[key][2]]
    for i in info:
        outFile.write('\t'.join(i) + '\n')
        continue
outFile.close()


In [102]:
print(len(missing))

2099


In [103]:
print(missing[0:4])

['Gene.191260::chr21.g6285.i1::g.191260::m.191260', 'Gene.299138::chr37.g2475.i1::g.299138::m.299138', 'Gene.95864::chr15.g6429.i1::g.95864::m.95864', 'Gene.18733::chr1.g916.i1::g.18733::m.18733']


In [65]:
largest_geneDict = {}
largest_noUTR_geneDict = {}
transcript_count = 0

for LINE in open(bed_inFile, 'r'):
    line=LINE.rstrip().split()
    if 'track' in line[0]: #skip header
        continue
    #Read in information on gene
    chrom = line[0]
    #Keep track of how many have been processed
    transcript_count += 1
    geneID = line[3].split(';')[1]
    fullID = line[3]
    exonCount = int(line[9])
    
    print(line)
    print(chrom,geneID,fullID,exonCount)
    print(transcript_count)
    print(scoreDict[geneID])
    break

['CTG-1059', '27474', '28054', 'ID=Gene.15::CTG-1059.g15.i1::g.15::m.15;CTG-1059.g15;ORF', '0', '+', '27475', '27787', '0', '1', '580', '0']
CTG-1059 CTG-1059.g15 ID=Gene.15::CTG-1059.g15.i1::g.15::m.15;CTG-1059.g15;ORF 1
1
['CTG-1059.g15.i1', 20.04]


In [60]:
largest_geneDict = {}
largest_noUTR_geneDict = {}

for LINE in open(bed_inFile, 'r'):
    line=LINE.rstrip().split()
    print(line)
    break
    if 'track' in line[0]: #skip header
        continue
    #Read in information on gene
    chrom = line[0]

    #Keep track of how many have been processed
    transcript_count += 1
    geneID = line[3].split(';')[1]
    fullID = line[3]
    exonCount = int(line[9])
    
    ##GET GENE LENGTH BY ADDING UP EXONS
    if ',' in line[10]:
        exonLengths = line[10].split(',')
        length = 0
        for l in exonLengths:
            length+=int(l)
    else:
        length = int(line[10])
        
    #GET UTR LENGTHS
    UTR5 = int(line[6])-int(line[1])
    UTR3 = int(line[2])-int(line[7])
    
    UTRlength = UTR5+UTR3
    lengthNonUTR = length-UTRlength
    
    if geneID not in largest_geneDict.keys():
        largest_geneDict[geneID] = [length, line]
    if length > int(largest_geneDict[geneID][0]):
        largest_geneDict[geneID] = [length, line]
    if 'chr3.g2424' in geneID:
        print(length,UTR5,UTR3,UTRlength,lengthNonUTR,exonCount,fullID)

['track', "name='pasa-lite_ZoeyTrinity.pasa_assembled_alignments.fa.transdecoder.genome.gff3'"]


In [51]:
print('%i of the largest genes added' % len(largest_geneDict.keys()))
print('Reduced from %i transcripts' % transcript_count)

61202 of the largest genes added
Reduced from 397208 transcripts


In [52]:
#WRITE TO OUTFILE
outFile = open(inDir + '../ReducedGeneFiles_NoRedudant/' + 'ReducedTranscript_SingleLongestGene_PASA_Transdecoder.bed','w')

for gene in largest_geneDict.keys():
    outFile.write('\t'.join(map(str,largest_geneDict[gene][1]))+'\n')
outFile.close()

In [55]:
for gene in largest_geneDict.keys():
    if 'chr3.g2424' in gene:
        print(largest_geneDict[gene])

[399, ['chr3', '71030992', '71031391', 'ID=Gene.256627::chr3.g24240.i1::g.256627::m.256627;chr3.g24240;ORF', '0', '-', '71031023', '71031389', '0', '1', '399', '0']]
[7628, ['chr3', '7971229', '8006070', 'ID=Gene.268707::chr3.g2424.i19::g.268707::m.268707;chr3.g2424;ORF', '0', '-', '7974893', '8003408', '0', '9', '3876,54,100,116,201,211,73,117,2880', '0,9325,11636,16678,18794,19417,20556,22603,31961']]
