In [1]:
# 2018-09-18
# A. Pendleton
# Generating UCSC tracks for:
#     - Raw Trinity gene models
#     - Transdecoder processed gene models
#     - Reduced gene models based on Transdecoder scores
#     - Kallisto tracks with expression bar charts

In [2]:
#this uses iPython magic to make plots appear inline
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import subprocess
import sys
import numpy as np
import matplotlib.patches as patches
import gzip
import fileinput
import glob
from scipy import stats
import re
from matplotlib_venn import venn3, venn3_circles
from collections import OrderedDict


def count_lines(f):
    """Return the number of lines in the text file at path ``f``.

    An empty file yields 0; a final line without a trailing newline
    still counts as one line.
    """
    with open(f, 'r') as handle:
        # Iterating the handle yields one item per line, so summing 1s counts them.
        return sum(1 for _ in handle)
def runCMD(cmd):
    """Run ``cmd`` through the shell and abort if it exits non-zero.

    The command string is handed to the shell as-is (``shell=True``), so
    pipes and redirections inside ``cmd`` work. Returns None on success.
    On a non-zero exit status it prints 'command failed' followed by the
    command, then calls ``sys.exit(1)`` (which raises SystemExit).
    """
    exitStatus = subprocess.Popen(cmd, shell=True).wait()
    if exitStatus != 0:
        print ('command failed')
        print (cmd)
        sys.exit(1)
# Strip the top and right borders from a plot
def simpleaxis(ax):
    """Hide the top and right spines of ``ax`` and put ticks on the bottom/left only.

    ``ax`` is modified in place; nothing is returned.
    """
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()

In [3]:
def write_pbs_file(wkDir,cmdsFile,jobName, mem, jobCount):
    """Write a PBS array-job submission script alongside a commands file.

    The script is written to the path obtained by replacing '.cmds' with
    '.pbs' in ``cmdsFile``. Each array task runs ``run-by-id-log.pl`` with
    the commands file, a log path and ``$PBS_ARRAYID``.

    Parameters
    ----------
    wkDir : str
        Working directory for the job. If it contains '/scripts/', that
        component is removed so the job runs from the parent directory
        and logs go to ``<parent>/logs/``.
    cmdsFile : str
        Path to the commands file; must contain '.cmds'.
    jobName : str
        Value for ``#PBS -N``.
    mem : int
        Per-process memory in gigabytes (``#PBS -l pmem=<mem>G``).
    jobCount : int or str
        Upper bound of the array range (``#PBS -t 1-<jobCount>``).

    Raises
    ------
    ValueError
        If ``cmdsFile`` does not contain '.cmds'. Without this check the
        output path would equal ``cmdsFile`` and the commands file would
        be overwritten by the PBS script.
    """
    if '.cmds' not in cmdsFile:
        raise ValueError("cmdsFile must contain '.cmds' so the .pbs path differs from it: %s" % cmdsFile)

    # The paths below are built by plain concatenation, so make sure the
    # directory ends with a separator before appending 'logs/'.
    if wkDir and not wkDir.endswith('/'):
        wkDir = wkDir + '/'
    # Replace with '/' (not '') so the parent directory keeps its trailing
    # separator; otherwise '<parent>/scripts/' + 'logs/' became '<parent>logs/'.
    if '/scripts/' in wkDir:
        wkDir = wkDir.replace('/scripts/','/')

    pbsPath = cmdsFile.replace('.cmds','.pbs')
    # Report the path itself rather than the repr of the file object.
    print('Writing pbs file: %s'% pbsPath)

    # NOTE(review): the log name always ends in 'BLAT_commands.logs' regardless
    # of jobName -- confirm this is intended for non-BLAT jobs.
    logPath = cmdsFile.replace('/scripts/','/logs/')

    # Context manager guarantees the handle is closed even if a write fails.
    with open(pbsPath,'w') as pbsFile:
        pbsFile.write('#!/bin/bash\n')
        pbsFile.write('#PBS -S /bin/bash\n')
        pbsFile.write('#PBS -V\n')
        pbsFile.write('#PBS -M ampend@med.umich.edu\n')
        pbsFile.write('#PBS -j oe\n')
        pbsFile.write('#PBS -N %s\n' % jobName)
        pbsFile.write('#PBS -o %s\n' % (wkDir + 'logs/'))
        pbsFile.write('#PBS -l pmem=%iG\n' % mem)
        pbsFile.write('#PBS -l nodes=1:ppn=1,qos=flux,walltime=100:00:00\n')
        # Lines starting with '##PBS' are written out as disabled alternatives.
        pbsFile.write('##PBS -A medbsm_flux\n')
        pbsFile.write('##PBS -q flux\n')
        pbsFile.write('#PBS -A jmkidd_fluxod\n')
        pbsFile.write('#PBS -q fluxod\n')
        pbsFile.write('#PBS -t 1-%s\n' % jobCount)
        pbsFile.write('cd %s\n' % (wkDir))
        pbsFile.write('/home/ampend/links/kidd-lab/jmkidd-projects/scripts/perlUtils/run-by-id-log.pl %s %sBLAT_commands.logs $PBS_ARRAYID' % (cmdsFile,logPath))

In [4]:
# Destination directory that finished .bb track files are copied into
# (see the `cp` commands in the cells below); also holds the final gene BED.
trackDir = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/UCSC_Tracks/Genes/'

# Chromosome sizes file, passed as the second argument to every bedToBigBed call.
# Presumably for the Zoey 2.3 assembly, going by the path -- confirm.
zoeyChromSizeFile = '/home/ampend/links/kidd-lab/genomes/zoey/assemblies/2.3/ref/zoey.2.3.chrom.sizes'

# Raw Trinity Outputs

### Trinity genome-guided transcripts were aligned to Zoey 2.3 with BLAT to obtain genomic coordinates, and the alignments were filtered to keep only the top hit for each predicted transcript. The cell below builds the track for these genome-guided BLAT top hits:

In [None]:
#sorted bed file
trinFile = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/Trinity_RNA-Seq_Analysis/trinity_alignments/zoey-2.3/AllLibraries_trinity/BLAT/Merged_BLAT_results/Total_TopHitsOnly_BLAT_zoey-2.3_Zoey_Trinity_sorted.12.bed'

# bedToBigBed expects three arguments: input BED, chrom.sizes, output bigBed.
# The format string previously received only the chrom.sizes file and raised
# TypeError. The output is named <bed>.bb, matching the other cells in this
# notebook. The command is only built here, not executed.
cmd = 'bedToBigBed %s %s %s' % (trinFile, zoeyChromSizeFile, trinFile + '.bb')




# PASA GTF 

#The PASA GTF File is here: 
'/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/version3_intron500k/PASA_Processing/pasa-lite_assemblyAssembler_results/pasa-lite_ZoeyTrinity.pasa_assembled_alignments.gtf'

#It was converted to a BED file in Zoey genome coordinates by TransDecoder's first step:
/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/version3_intron500k/transdecoder/results/pasa-lite_ZoeyTrinity.pasa_assembled_alignments.fa.transdecoder.genome.bed

In [None]:
transDir = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/version3_intron500k/transdecoder/'

# The genome-coordinate BED is in the results/ subdirectory of transDir, as in
# the path quoted above this cell and the paths used by the later cells.
transBed = transDir + 'results/' + 'pasa-lite_ZoeyTrinity.pasa_assembled_alignments.fa.transdecoder.genome.bed'

In [None]:
transDir = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/version3_intron500k/transdecoder/'

peptideFile = transDir + 'pasa-lite_ZoeyTrinity.pasa_assembled_alignments.fa.transdecoder.pep'
# geneID -> [transcriptID, score, fullID] for the highest-scoring complete
# model seen so far for that gene.
scoreDict = {}

# Open with a context manager so the handle is closed once parsing finishes
# (the bare open() in the for-statement never closed it).
with open(peptideFile,'r') as pepHandle:
    for line in pepHandle:
        if '>' not in line: #skip those not a gene identifier
            continue
        # Disabled filter kept for reference:
        #   if 'TCONS' in line: #not mapped
        #       continue
        if 'complete' not in line: #only want the complete gene models
            continue
        # First space-delimited token of the header, with '>' removed.
        fullID = line.rstrip().split(' ')[0].replace('>','')
        # Header is split on '::'; fields 1, 4 and 5 are read below.
        # NOTE(review): assumes at least six '::'-separated fields -- confirm
        # against the TransDecoder version that produced this file.
        fields = line.rstrip().split('::')

        # Gene ID is the first two dot-separated pieces of field 1.
        geneParts = fields[1].split('.')
        geneID = geneParts[0] + '.' + geneParts[1]

        transcriptID = fields[4]
        # Take the text between the first and second commas of field 5, cut it
        # at the first space, and read the number after '=' (e.g. 'score=12.3').
        score = float(fields[5].split(',')[1].split(' ')[0].split('=')[1])

        if geneID not in scoreDict:
            scoreDict[geneID] = ['',0,'']#['highestscoring_transcriptID','highestScore']
        # NOTE(review): a gene whose scores are all <= 0 keeps the empty
        # placeholder entry and is still counted below -- confirm acceptable.
        if score > scoreDict[geneID][1]:
            scoreDict[geneID] = [transcriptID,score,fullID]


print('%i transcripts with highest score added to the dictionary' % len(scoreDict))

In [47]:
transDir = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/version3_intron500k/transdecoder/'
transBed = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/version3_intron500k/transdecoder/results/pasa-lite_ZoeyTrinity.pasa_assembled_alignments.fa.transdecoder.genome.bed'

sortedBed = transBed + '.sorted'

# NOTE(review): the output recorded for this cell shows bedSort failing. The
# "Transdecoder Outputs" cell strips a header line (grep -v gff3) before
# sorting, which is presumably what bedSort rejected here -- confirm, and
# consider retiring this cell in favour of that one.

#Sort transbed
cmd = 'bedSort %s %s' % (transBed,sortedBed)
print(cmd)
runCMD(cmd)

#Make bigBed
# Build the bigBed from the *sorted* file and name the output <bed>.bb;
# the earlier form passed the unsorted BED and produced a '.genome.bb.bb' name.
cmd = 'bedToBigBed %s %s %s' % (sortedBed,zoeyChromSizeFile,transBed + '.bb')
print(cmd)
runCMD(cmd)


bedSort /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/version3_intron500k/transdecoder/results/pasa-lite_ZoeyTrinity.pasa_assembled_alignments.fa.transdecoder.genome.bed /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/version3_intron500k/transdecoder/results/pasa-lite_ZoeyTrinity.pasa_assembled_alignments.fa.transdecoder.genome.bed.sorted
command failed
bedSort /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/version3_intron500k/transdecoder/results/pasa-lite_ZoeyTrinity.pasa_assembled_alignments.fa.transdecoder.genome.bed /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/version3_intron500k/transdecoder/results/pasa-lite_ZoeyTrinity.pasa_assembled_alignments.fa.transdecoder.genome.bed.sorted


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


# Transdecoder Outputs

In [7]:
transDir = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/version3_intron500k/transdecoder/results/'

transBed = transDir + 'pasa-lite_ZoeyTrinity.pasa_assembled_alignments.fa.transdecoder.genome.bed'
transSortedBed = transBed + '.sorted'
transBigBed = transBed + '.bb'

# Step 1: de-duplicate the lines, drop any line mentioning gff3, strip the
# 'ID=' prefixes, and write the result to a scratch file named BED in transDir.
cmd = 'cat %s | sort | uniq | grep -v gff3 | sed \'s/ID=//g\' > %sBED' % (transBed,transDir)
print(cmd)
runCMD(cmd)

# Step 2: sort the scratch file with bedSort into <bed>.sorted.
cmd = 'bedSort %sBED %s' % (transDir,transSortedBed)
print(cmd)
runCMD(cmd)

# Step 3: the scratch file is no longer needed.
cmd = 'rm %sBED' % transDir
runCMD(cmd)
print(cmd)

# Step 4: convert the sorted BED to bigBed.
cmd = 'bedToBigBed %s %s %s' % (transSortedBed,zoeyChromSizeFile,transBigBed)
runCMD(cmd)
print(cmd)

# Step 5: copy the bigBed into the tracks directory.
cmd = 'cp %s %s' % (transBigBed,trackDir)
runCMD(cmd)
print(cmd)

cat /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/version3_intron500k/transdecoder/results/pasa-lite_ZoeyTrinity.pasa_assembled_alignments.fa.transdecoder.genome.bed | sort | uniq | grep -v gff3 | sed 's/ID=//g' > /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/version3_intron500k/transdecoder/results/BED
bedSort /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/version3_intron500k/transdecoder/results/BED /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/version3_intron500k/transdecoder/results/pasa-lite_ZoeyTrinity.pasa_assembled_alignments.fa.transdecoder.genome.bed.sorted
rm /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/version3_intron500k/transdecoder/results/BED
bedToBigBed /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/version3_intron500k/transdecoder/results/pasa-lite_ZoeyTrinity.pasa_assembled_alignments.fa.transdecoder.genome.bed.sor

# Reduced Transdecoder Output

In [11]:
transRedDir = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/version3_intron500k/NonRedundant_NoRMIntersect_FilteredGeneSet/'

transBed = transRedDir + 'TotalSet_NoRMSingleExons_AllMultiExons.bed'
reducedSortedBed = transBed + '.sorted'
reducedBigBed = transBed + '.bb'

# Step 1: strip the 'ID=' prefixes. sed -i edits the BED file in place.
cmd = 'sed -i \'s/ID=//g\' %s' % (transBed)
print(cmd)
runCMD(cmd)

# Step 2: sort with bedSort into <bed>.sorted.
cmd = 'bedSort %s %s' % (transBed,reducedSortedBed)
print(cmd)
runCMD(cmd)

# Step 3: convert the sorted BED to bigBed.
cmd = 'bedToBigBed %s %s %s' % (reducedSortedBed,zoeyChromSizeFile,reducedBigBed)
runCMD(cmd)
print(cmd)

# Step 4: copy the bigBed into the tracks directory.
cmd = 'cp %s %s' % (reducedBigBed,trackDir)
runCMD(cmd)
print(cmd)

sed -i 's/ID=//g' /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/version3_intron500k/NonRedundant_NoRMIntersect_FilteredGeneSet/TotalSet_NoRMSingleExons_AllMultiExons.bed
bedSort /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/version3_intron500k/NonRedundant_NoRMIntersect_FilteredGeneSet/TotalSet_NoRMSingleExons_AllMultiExons.bed /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/version3_intron500k/NonRedundant_NoRMIntersect_FilteredGeneSet/TotalSet_NoRMSingleExons_AllMultiExons.bed.sorted
bedToBigBed /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/version3_intron500k/NonRedundant_NoRMIntersect_FilteredGeneSet/TotalSet_NoRMSingleExons_AllMultiExons.bed.sorted /home/ampend/links/kidd-lab/genomes/zoey/assemblies/2.3/ref/zoey.2.3.chrom.sizes /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/version3_intron500k/NonRedundant_NoRMIntersect_FilteredGeneSet/TotalSet_NoRMSingleE

# Make Final Genes

In [16]:

finalBed = trackDir + 'TEMP_FinalDeNovoGenes.bed'
finalSortedBed = finalBed + '.sorted'
finalBigBed = finalBed + '.bb'

# Step 1: sort the final gene BED with bedSort into <bed>.sorted.
cmd = 'bedSort %s %s' % (finalBed,finalSortedBed)
print(cmd)
runCMD(cmd)

# Step 2: convert the sorted BED to bigBed. It is written next to the input,
# which is already inside trackDir, so no copy step follows.
cmd = 'bedToBigBed %s %s %s' % (finalSortedBed,zoeyChromSizeFile,finalBigBed)
runCMD(cmd)
print(cmd)



bedSort /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/UCSC_Tracks/Genes/TEMP_FinalDeNovoGenes.bed /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/UCSC_Tracks/Genes/TEMP_FinalDeNovoGenes.bed.sorted
bedToBigBed /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/UCSC_Tracks/Genes/TEMP_FinalDeNovoGenes.bed.sorted /home/ampend/links/kidd-lab/genomes/zoey/assemblies/2.3/ref/zoey.2.3.chrom.sizes /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/UCSC_Tracks/Genes/TEMP_FinalDeNovoGenes.bed.bb
