In [6]:
# 2018-05-01
# A. Pendleton
# Removal of duplicate PASA transcripts that have been processed through Transdecoder

In [7]:
#this uses iPython magic to make plots appear inline
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import subprocess
import sys
import numpy as np
import matplotlib.patches as patches
import gzip
import fileinput
import glob
from scipy import stats
import re
from matplotlib_venn import venn3, venn3_circles
from collections import OrderedDict


def count_lines(f):
    """Return the number of lines in the text file at path ``f``.

    The file is opened in text mode and iterated line by line; an empty
    file yields 0.
    """
    with open(f, 'r') as handle:
        return sum(1 for _ in handle)
def runCMD(cmd):
    """Run ``cmd`` through the shell and wait for it to finish.

    Returns None when the command exits with status 0. On any other exit
    status, prints a failure notice followed by the command and calls
    ``sys.exit(1)``.
    """
    exit_status = subprocess.Popen(cmd, shell=True).wait()
    if exit_status != 0:
        print ('command failed')
        print (cmd)
        sys.exit(1)
# TO REMOVE TOP AND RIGHT AXIS OF PLOTS
def simpleaxis(ax):
    """Hide the top and right spines of ``ax``.

    Also calls ``tick_bottom()`` on the x-axis and ``tick_left()`` on the
    y-axis of the same axes object.
    """
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    x_axis = ax.get_xaxis()
    x_axis.tick_bottom()
    y_axis = ax.get_yaxis()
    y_axis.tick_left()

In [8]:
### INPUT INFORMATION
# Directory that the input files are read from (and, further down, where the
# output BED file is written).
inDir = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/transdecoder/results/'
# Genome-coordinate BED file that is parsed in the next code cell.
bed_inFile = ''.join([
    inDir,
    'pasa-lite_ZoeyTrinity.pasa_assembled_alignments.fa.transdecoder.genome.bed',
])


# Read in transcript ID data into dictionary

### This step saves the coordinates and other information for each transcript into a dictionary, so that the records for only the highest-scoring gene models (determined in the next cell) can be pulled out later.

In [89]:
# Parse the BED file into two lookup tables:
#   geneDict: geneID -> list of [ID, exonCount, exonLengths], one entry per BED row
#   fullDict: ID     -> [list of BED fields] for the FIRST row seen with that ID
# Rows whose ID is already in fullDict are counted in `dupes` and not stored again.
geneDict = {}
fullDict = {}
processed = []
transcript_count, dupes = 0, 0

# `with` guarantees the file handle is closed even if a row fails to parse.
with open(bed_inFile, 'r') as bedFile:
    for line in bedFile:
        line = line.rstrip().split()
        if not line: #skip blank lines (would otherwise raise IndexError below)
            continue
        if 'track' in line[0]: #skip header
            continue
        #Read in information on gene
        chrom = line[0]

        #Keep track of how many have been processed
        transcript_count += 1

        # Column 4 is split on ';': the first piece (minus 'ID=') is the
        # transcript-level ID, the second piece is used as the gene key.
        geneID = line[3].split(';')[1]
        ID = line[3].replace('ID=','').split(';')[0]

        exonCount = int(line[9])
        exonLengths = line[10]

        #If geneID not already in dictionary, then add
        if geneID not in geneDict:
            geneDict[geneID] = []
        # Store this row's own ID. The original appended `fullID`, a name that
        # is only assigned in a later cell, so a fresh kernel raised NameError
        # and a stale kernel recorded an unrelated ID for every row.
        geneDict[geneID].append([ID, exonCount, exonLengths])

        if ID in fullDict:
            dupes += 1
            continue
        fullDict[ID] = [line]

print('%i genes added to dictionary' % len(geneDict.keys()))
print('%i transcripts processed' % transcript_count)
print('%i genes added to FULL Dictionary where keys are the full length ID' % len(fullDict.keys()))
print('dupes = ', dupes)

61202 genes added to dictionary
198604 transcripts processed
129903 genes added to FULL Dictionary where keys are the full length ID
dupes =  68701


### This step goes through the transdecoder peptide file and looks through each gene model for the model with the highest score. If a higher score is found, it replaces the gene model that previously had the highest score. 

#### Importantly, this step also skips peptides that TransDecoder did not deem 'complete' gene models (e.g. the 5' or 3' partial models).

In [105]:
# Scan the peptide FASTA headers and keep, for each gene, the complete model
# with the highest score.
#   scoreDict: geneID -> [transcriptID, score, fullID] of the best model so far
peptideFile = inDir + 'pasa-lite_ZoeyTrinity.pasa_assembled_alignments.fa.transdecoder.pep'
scoreDict = {}

# `with` guarantees the file handle is closed even if a header fails to parse.
with open(peptideFile,'r') as pepFile:
    for line in pepFile:
        if '>' not in line: #skip those not a gene identifier
            continue
        if 'TCONS' in line: #not mapped
            continue
        if 'complete' not in line: #only want the complete gene models
            continue
        # First whitespace-delimited token of the header, without the '>'.
        # This is the value later looked up in fullDict.
        fullID = line.rstrip().split(' ')[0].replace('>','')
        line = line.rstrip().split('::')

        # Gene key = first two dot-separated pieces of the second '::' field.
        idParts = line[1].split('.')
        geneID = idParts[0] + '.' + idParts[1]

        transcriptID = line[4]
        # Takes the second comma-separated item of the sixth '::' field, up to
        # the first space, and reads the number after '='.
        score = float(line[5].split(',')[1].split(' ')[0].split('=')[1])

        # Always record the first model seen for a gene, then replace it only
        # when a strictly higher score turns up. The original seeded each gene
        # with ['', 0, ''], so a gene whose scores were all <= 0 kept that empty
        # placeholder, was still counted below, and later showed up as a
        # "missing" empty ID.
        if geneID not in scoreDict or score > scoreDict[geneID][1]:
            scoreDict[geneID] = [transcriptID, score, fullID]


print('%i transcripts with highest score added to the dictionary' % len(scoreDict.keys()))

30170 transcripts with highest score added to the dictionary


## Generate output BED file with the non-redundant gene models

In [108]:
#Make BED file with just the transcript IDs that have the highest score
#   missing: best-model IDs from scoreDict that have no record in fullDict
#   added:   number of BED rows written
outfile = inDir + 'HighestScoringTranscripts_IDs.bed'
missing=[]
added = 0
# `with` flushes and closes the output file even if a write fails part-way.
with open(outfile,'w') as outFile:
    for key in scoreDict:
        bestID = scoreDict[key][2]
        if bestID not in fullDict:
            missing.append(bestID)
            continue
        info = fullDict[bestID]
        # Each stored record is a list of BED fields; write it back tab-delimited.
        for i in info:
            outFile.write('\t'.join(i) + '\n')
            added+=1


In [109]:
# Report how many BED rows were written and how many transcripts were read.
for template, value in (
    ('%i of the largest genes added', added),
    ('Reduced from %i transcripts', transcript_count),
):
    print(template % value)

28889 of the largest genes added
Reduced from 198604 transcripts
