In [4]:
# 2018-07-12
# A. Pendleton
# Removal of duplicate PASA transcripts that have been processed through Transdecoder
#    and generating the resulting FASTA file with only the transcripts that passed

In [5]:
#this uses iPython magic to make plots appear inline
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import subprocess
import sys
import numpy as np
import matplotlib.patches as patches
import gzip
import fileinput
import glob
from scipy import stats
import re
from matplotlib_venn import venn3, venn3_circles
from collections import OrderedDict


def count_lines(f):
    lineCount = 0
    with open(f, 'r') as f:
        for line in f:
            lineCount += 1
        return lineCount
def runCMD(cmd):
    val = subprocess.Popen(cmd, shell=True).wait()
    if val == 0:
        pass
    else:
        print ('command failed')
        print (cmd)
        sys.exit(1)
# TO REMOVE TOP AND RIGHT AXIS OF PLOTS
def simpleaxis(ax):
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()

In [6]:
###INPUT INFORMATION
inDir = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/transdecoder/version_2/results/'
bed_inFile = inDir + 'pasa-lite_ZoeyTrinity.pasa_assembled_alignments.fa.transdecoder.genome.bed'


# Read in transcript ID data into dictionary

### This step simply saves the coordinates and other information into a dictionary that can later be pulled out only for the gene models with the highest score (determined one cell down)

In [7]:
geneDict = {}
fullDict = {}
processed = []
transcript_count, dupes = 0, 0

for LINE in open(bed_inFile, 'r'):
    line=LINE.rstrip().split()
    if 'track' in line[0]: #skip header
        continue
    #Read in information on gene
    chrom = line[0]

    #Keep track of how many have been processed
    transcript_count += 1
    fullID = line[3]
    geneID = line[3].split(';')[1]
    ID = line[3].replace('ID=','').split(';')[0]

    exonCount = int(line[9])
    exonLengths = line[10]
    #If geneID not already in dictionary, then add
    if geneID not in geneDict.keys():
        geneDict[geneID] = []
    geneDict[geneID].append([fullID, exonCount, exonLengths])
    
    if 'chr20.g27627' in LINE:
        print(line)
    
    if ID in fullDict.keys():
        dupes+=1
        continue
    fullDict[ID] = [line]
print('%i genes added to dictionary' % len(geneDict.keys()))
print('%i transcripts processed' % transcript_count)
print('%i genes added to FULL Dictionary where keys are the full length ID' % len(fullDict.keys()))
print('dupes = ', dupes)

62049 genes added to dictionary
199821 transcripts processed
130935 genes added to FULL Dictionary where keys are the full length ID
dupes =  68886


### This step goes through the transdecoder peptide file and looks through each gene model for the model with the highest score. If a higher score is found, it replaces the gene model that previously had the highest score. 

#### Importantly, this step also skips over peptides that were not deemed as 'complete' gene models by transdecoder (e.g. skips over the 5' or 3' partial models). 

In [8]:
peptideFile = inDir + 'pasa-lite_ZoeyTrinity.pasa_assembled_alignments.fa.transdecoder.pep'
scoreDict = {}

for line in open(peptideFile,'r'):
    if '>' not in line: #skip those not a gene identifier
        continue
    if 'TCONS' in line: #not mapped
        continue
    if 'complete' not in line: #only want the complete gene models
        continue
    fullID = line.rstrip().split(' ')[0].replace('>','')
    line = line.rstrip().split('::')
    #print(line)

    #geneID = line[1].rsplit('.',1)
    geneID = line[1].split('.')[0] + '.' + line[1].split('.')[1]

    transcriptID = line[4]
    score = float(line[5].split(',')[1].split(' ')[0].split('=')[1])
    
    if geneID not in scoreDict.keys():
        scoreDict[geneID] = ['',0,'']#['highestscoring_transcriptID','highestScore']
    if score > scoreDict[geneID][1]:
        scoreDict[geneID] = [transcriptID,score,fullID]
    

print('%i transcripts with highest score added to the dictionary' % len(scoreDict.keys()))

30492 transcripts with highest score added to the dictionary


## Generate output BED file with the non-redundant gene models

In [37]:
#Make BED file with just the transcript IDs that have the highest score
print(inDir)
outfile = inDir + 'HighestScoringTranscripts_IDs.bed'
outFile = open(outfile,'w')
print(outfile)

outDir = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/transdecoder/version_2/ReducedGeneFiles_NoRedudant/IntersectWithRepeatElements/'
cmd = 'mkdir -p %s' % outDir #conditionally make this output directory
runCMD(cmd)

singleExonFile = open(outDir + 'singleExons_highestscoringtranscripts.bed','w')
nonSingleExonFile = open(outDir + '../multiExons_highestscoringtranscripts.bed','w')
nonSingleExonIDFile = open(outDir + '../multiExons_highestscoringtranscripts_IDs.txt','w')

missing=[]
added,singleExon = 0, 0
for key in scoreDict.keys():
    if scoreDict[key][2] not in fullDict.keys():
        missing.append(scoreDict[key][2])
        continue
    info = fullDict[scoreDict[key][2]]
    for i in info:
        outFile.write('\t'.join(i) + '\n')
        added+=1
        exonCount = int(i[9])
        if exonCount == 1:
            singleExonFile.write('\t'.join(i) + '\n')
            singleExon += 1
        else:
            nonSingleExonFile.write('\t'.join(i) + '\n')
            nonSingleExonIDFile.write('%s\n' % i[3].split(';')[0].replace('ID=',''))
outFile.close()
singleExonFile.close()
nonSingleExonFile.close()
nonSingleExonIDFile.close()

#STATS
print('%i of the highest scoring genes added' % added)
print('\t%i of which are single exons' % singleExon)
print('Reduced from %i transcripts' % len(fullDict.keys()))

/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/transdecoder/version_2/results/
/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/transdecoder/version_2/results/HighestScoringTranscripts_IDs.bed
29192 of the highest scoring genes added
	14923 of which are single exons
Reduced from 130935 transcripts


### Intersect the single exons with LINEs -- to eliminate those that correspond



In [14]:
outDir = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/transdecoder/version_2/ReducedGeneFiles_NoRedudant/IntersectWithRepeatElements/'
singleExonFile = outDir + 'singleExons_highestscoringtranscripts.bed'
zoeyL1File = '/home/ampend/links/kidd-lab/genomes/zoey/assemblies/2.3/rm-tracks/zoey2.3.rmsk.LINE.bed'
zoeyLINESINELTRFile = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/transdecoder/ReducedGeneFiles_NoRedudant/zoey2.3.rmsk.LINE_SINE_LTR.bed'

#Write command to do intersect
cmd = 'bedtools intersect -wo -a %s -b %s > %sINTERSECT_singleExonsHighestScoring_with_ZoeyLINE_SINE_LTR.txt' % (singleExonFile,zoeyLINESINELTRFile,outDir)
print(cmd)
runCMD(cmd)


bedtools intersect -wo -a /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/transdecoder/version_2/ReducedGeneFiles_NoRedudant/IntersectWithRepeatElements/singleExons_highestscoringtranscripts.bed -b /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/transdecoder/ReducedGeneFiles_NoRedudant/zoey2.3.rmsk.LINE_SINE_LTR.bed > /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/transdecoder/version_2/ReducedGeneFiles_NoRedudant/IntersectWithRepeatElements/INTERSECT_singleExonsHighestScoring_with_ZoeyLINE_SINE_LTR.txt


In [32]:
intersectFile = outDir + 'INTERSECT_singleExonsHighestScoring_with_ZoeyLINE_SINE_LTR.txt'
print('Parsing intersect file:\n',intersectFile)
intDict = {}

totalIntersectCount, noExonIntersectCount = 0, 0 #for keeping track 

for line in open(intersectFile,'r'):
    #FIRST SET INTERSECT == FALSE
    intersect = False
    totalIntersectCount += 1
    
    line = line.rstrip() #removing extraneous whitespace characters
    line = line.split('\t') #delimiting "columns" in the file based on tabs		
    winID = line[3]
    linelength = len(line)

    #Save gene info to dictionary if first time seeing it
    gene = line[3] #GENE
    geneStart, geneEnd = int(line[1]), int(line[2])
    exonStart, exonEnd = int(line[6]), int(line[7])
    lengthOfGene = geneEnd - geneStart
    lengthOfSingleExon = exonEnd - exonStart
    if gene not in intDict.keys():
        #0 = gene, #1 = length of gene, #2 = length of single exon
        #3 = all IDs of intersecting elements
        #4 = all lengths of intersecting elements
        #5 = all lengths of intersecting elements with EXONS only
        #6 = sum of all intersecting bp (set to zero)
        #7 = sum of all intersecting bp WITH EXONS only (set to zero)
        #8 = Prop of all GENE intersection (set to zero)
        #9 = proportion of all EXON intersection (sum of intersecting bp / length of single exon) (set to zero)
        #10 all the line information from the intersect file to use later to write out pass/fail exons
        #11 whether the gene passes/fails (determined in step below - default = Fail)
        intDict[gene] = [gene, lengthOfGene, lengthOfSingleExon, [], [], [], 0, 0, 0, 0, line[0:11], 'Fail']
     
    #DETERMINE IF RM ELEMENT INTERSECTS WITH EXON AT ALL
    RMstart = int(line[13])
    RMend = int(line[14])
    
    """ #Illustration of below intersect with Exon (E) and Repeat element (R)
    RRRRRRRRR
         EEEEEEEEEE
    """
    if RMstart <= exonStart and exonStart <= RMend and exonStart <= RMend:
        lengthOfIntersectWithExon = RMend - exonStart 
        intersect = True
    
    """ #Illustration of below intersect with Exon (E) and Repeat element (R)
    EEEEEEEEEE
        RRRRRRRRR
    """
    if exonStart <= RMstart and exonEnd <= RMend and RMstart <= exonEnd:
        lengthOfIntersectWithExon = exonEnd - RMstart  
        intersect = True
    
    """ #Illustration of below intersect with Exon (E) and Repeat element (R)
    EEEEEEEEEEEEEEEEEEEEE
           RRRRRRRRRR
    """
    if exonStart <= RMstart and exonEnd >= RMend:
        lengthOfIntersectWithExon = RMend - RMstart 
        intersect = True
    
    """ #Illustration of below intersect with Exon (E) and Repeat element (R)
    RRRRRRRRRRRRRRRRRRRRRRRRR
           EEEEEEEEEE
    """    
    if RMstart <= exonStart and RMend >= exonEnd:
        lengthOfIntersectWithExon = exonEnd - exonStart 
        intersect = True
    
    #Only want to add up the intersecting data for RMs intersecting EXONS
    if intersect is False:
        noExonIntersectCount+=1
        lengthOfIntersectWithExon = 0
        #continue
    
    intersectingElement = str(line[linelength-14])
    lengthOfGeneIntersect = int(line[linelength-1]) #WITH WHOLE GENE

    
    #propOfGeneIntersecting = float(lengthOfIntersect/lengthOfGene)
    propOfGeneIntersecting = float(lengthOfIntersectWithExon/lengthOfSingleExon)
    
    #Add info to dictionary
    intDict[gene][3].append(intersectingElement)
    intDict[gene][4].append(lengthOfGeneIntersect)
    intDict[gene][5].append(lengthOfIntersectWithExon)
print('%i of the %i total intersects do not intersect with the exons' % (noExonIntersectCount,totalIntersectCount))



Parsing intersect file:
 /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/transdecoder/version_2/ReducedGeneFiles_NoRedudant/IntersectWithRepeatElements/INTERSECT_singleExonsHighestScoring_with_ZoeyLINE_SINE_LTR.txt
20038 of the 26225 total intersects do not intersect with the exons


### Now go through each threshold of intersect proportion of alignment and generate outfiles for the transcripts that pass/fail at each threshold

In [16]:
#INTERSECT THRESHOLD
thresholds = [0.0001, 0.05, 0.1, 0.3333, 0.5, 0.6666, 0.75]
#threshold = 0.5

for threshold in thresholds:
    passIDs, failIDs = [], []
    passFile = open(outDir + 'SingleExon_PassRMIntersect_%fthreshold.txt' % threshold,'w')
    failFile = open(outDir + 'SingleExon_FailRMIntersect_%fthreshold.txt' % threshold,'w')
    count = 0

    for gene in intDict.keys():
        #Calculate how much of exon is covered by RMs
        GENEintersectSum = np.sum(intDict[gene][4])
        EXONintersectSum = np.sum(intDict[gene][5])
        intDict[gene][6] = GENEintersectSum
        intDict[gene][7] = EXONintersectSum
        #Now calculate what this sum is as a proportion of the exon length
        propOfExonIntersecting = float(EXONintersectSum/intDict[gene][2])
        intDict[gene][9] = propOfExonIntersecting
        #What would this be compared to with proportion of gene covered?
        propOfGeneIntersecting = float(GENEintersectSum/intDict[gene][1])
        intDict[gene][8] = propOfGeneIntersecting

        if propOfGeneIntersecting < 0.5 and propOfExonIntersecting > 0.5:
            count +=1
            coord = intDict[gene][10][0]+':'+str(intDict[gene][10][1])+'-'+str(intDict[gene][10][2])
            """print(coord)
            print(gene)
            print('gene length', intDict[gene][1])
            print('exon length', intDict[gene][2])
            print('gene intersect sum',GENEintersectSum)
            print('exon intersect sum',EXONintersectSum)
            print(propOfGeneIntersecting,propOfExonIntersecting,'\n')"""

        #Check if greater than threshold
        if propOfExonIntersecting < threshold:
            passFile.write('\t'.join(intDict[gene][10]) + '\n')
            passIDs.append(gene)
            intDict[gene][11] = 'Pass'
        else:
            failIDs.append(gene)
            failFile.write('\t'.join(intDict[gene][10]) + '\n')
    passFile.close()
    failFile.close()
    
    print('## Threshold = %f' % threshold)
    print('%i of the %i single exon genes intersect LINEs/SINEs/LTRs with more than %f of their length' % (len(set(failIDs)),singleExon,threshold))
    print('%i of the %i single exon genes PASS intersect thresholds\n' % (len(set(passIDs)),singleExon))


## Threshold = 0.000100
4878 of the 14923 single exon genes intersect LINEs/SINEs/LTRs with more than 0.000100 of their length
5044 of the 14923 single exon genes PASS intersect thresholds

## Threshold = 0.050000
4590 of the 14923 single exon genes intersect LINEs/SINEs/LTRs with more than 0.050000 of their length
5332 of the 14923 single exon genes PASS intersect thresholds

## Threshold = 0.100000
4301 of the 14923 single exon genes intersect LINEs/SINEs/LTRs with more than 0.100000 of their length
5621 of the 14923 single exon genes PASS intersect thresholds

## Threshold = 0.333300
3015 of the 14923 single exon genes intersect LINEs/SINEs/LTRs with more than 0.333300 of their length
6907 of the 14923 single exon genes PASS intersect thresholds

## Threshold = 0.500000
2272 of the 14923 single exon genes intersect LINEs/SINEs/LTRs with more than 0.500000 of their length
7650 of the 14923 single exon genes PASS intersect thresholds

## Threshold = 0.666600
1771 of the 14923 single e

### Now generate two FASTA files for the threshold you want to go with that contains 1) those transcripts that PASS at that cutoff 2) those transcripts that FAIL at that cutoff

#### Make ID list of only those single exon genes that pass at cutoff

In [41]:
threshold = 0.1 #******

inFile = outDir + 'SingleExon_PassRMIntersect_%fthreshold.txt' % threshold
outFile = outDir + 'SingleExon_PassRMIntersect_%fthreshold_IDs.txt' % threshold

cmd = 'cut -f 4 %s > %s' % (inFile,outFile)
print(cmd)
runCMD(cmd)

lineCount = count_lines(outFile)
print('\n%i single exons pass this cutoff and had their IDs written out' % lineCount)

cut -f 4 /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/transdecoder/version_2/ReducedGeneFiles_NoRedudant/IntersectWithRepeatElements/SingleExon_PassRMIntersect_0.100000threshold.txt > /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/transdecoder/version_2/ReducedGeneFiles_NoRedudant/IntersectWithRepeatElements/SingleExon_PassRMIntersect_0.100000threshold_IDs.txt

5621 single exons pass this cutoff and had their IDs written out


In [49]:
#INTERSECT THRESHOLD
threshold = 0.1 #*******

passfileIDs = outDir + 'SingleExon_PassRMIntersect_%fthreshold_IDs.txt' % threshold
passFileIDs = open(passfileIDs, 'w')
count = 0

failfileIDs = outDir + 'SingleExon_FailRMIntersect_%fthreshold_IDs.txt' % threshold
failFileIDs = open(failfileIDs, 'w')

for gene in intDict.keys():
    ID = gene.split(';')[0] #TO MATCH THE FASTA FILE WE WANT TO EXTRACT FROM

    #Calculate how much of exon is covered by RMs
    GENEintersectSum = np.sum(intDict[gene][4])
    EXONintersectSum = np.sum(intDict[gene][5])
    intDict[gene][6] = GENEintersectSum
    intDict[gene][7] = EXONintersectSum
    #Now calculate what this sum is as a proportion of the exon length
    propOfExonIntersecting = float(EXONintersectSum/intDict[gene][2])
    intDict[gene][9] = propOfExonIntersecting
    #What would this be compared to with proportion of gene covered?
    propOfGeneIntersecting = float(GENEintersectSum/intDict[gene][1])
    intDict[gene][8] = propOfGeneIntersecting

    #Check if greater than threshold
    if propOfExonIntersecting < threshold:
        passFileIDs.write('%s\n' % ID.replace('ID=',''))
        count+=1        
    else:
        failFileIDs.write('%s\n' % ID.replace('ID=',''))
passFileIDs.close()
failFileIDs.close()
print('Wrote out IDs for %i passing single exon transcripts' % count)

### Copy to new directory for only the passing genes
nonRedundDir = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/NonRedundant_NoRMIntersect_FilteredGeneSet/version_2/'
cmd = 'mkdir -p %s' % nonRedundDir #conditionally make output directory
print(cmd)
runCMD(cmd)


print('#Copying to new directory')
cmd = 'cp %s %s' % (passfileIDs,nonRedundDir)
runCMD(cmd)
print(cmd)


Wrote out IDs for 5621 passing single exon transcripts
mkdir -p /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/NonRedundant_NoRMIntersect_FilteredGeneSet/version_2/
#Copying to new directory
cp /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/transdecoder/version_2/ReducedGeneFiles_NoRedudant/IntersectWithRepeatElements/SingleExon_PassRMIntersect_0.100000threshold_IDs.txt /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/NonRedundant_NoRMIntersect_FilteredGeneSet/version_2/


#### Copying the multi exon file to same directory

In [43]:
print('#Copying to new directory')
multiExon = outDir + '../multiExons_highestscoringtranscripts_IDs.txt'

cmd = 'cp %s %s' % (multiExon,nonRedundDir)
runCMD(cmd)
print(cmd)



#Copying to new directory
cp /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/transdecoder/version_2/ReducedGeneFiles_NoRedudant/IntersectWithRepeatElements/../multiExons_highestscoringtranscripts_IDs.txt /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/NonRedundant_NoRMIntersect_FilteredGeneSet/version_2/


### Now need to get all the transcript IDs (multi and single exon) genes that pass the cut off --- can use FGREP from the total highest scoring transcript file to get the file transcript list, using the *FAILING* genes as what to fgrep -v (all but == -v)

In [50]:
#Use fgrep to get the all IDs that are NOT in the failing RM threshold file

cmd = 'fgrep -v -f %sSingleExon_FailRMIntersect_%fthreshold_IDs.txt %sHighestScoringTranscripts_IDs.bed | cut -f 4 > %sTotalSet_NoRMSingleExons_AllMultiExons_IDs.txt' % (outDir,threshold,inDir,nonRedundDir)
print(cmd)
runCMD(cmd)

lineCount = count_lines('%sTotalSet_NoRMSingleExons_AllMultiExons_IDs.txt' % nonRedundDir)
print('%i total genes remain after eliminating single exons with too much RM' % lineCount)

fgrep -v -f /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/transdecoder/version_2/ReducedGeneFiles_NoRedudant/IntersectWithRepeatElements/SingleExon_FailRMIntersect_0.100000threshold_IDs.txt /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/transdecoder/version_2/results/HighestScoringTranscripts_IDs.bed | cut -f 4 > /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/NonRedundant_NoRMIntersect_FilteredGeneSet/version_2/TotalSet_NoRMSingleExons_AllMultiExons_IDs.txt
24891 total genes remain after eliminating single exons with too much RM


## Index the total fasta file from transdecoder so you can extract by ID

In [51]:
transDDir = '/home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/transdecoder/version_2/results/'
pepFasta = transDDir + 'pasa-lite_ZoeyTrinity.pasa_assembled_alignments.fa.transdecoder.pep'
cdsFasta = transDDir + 'pasa-lite_ZoeyTrinity.pasa_assembled_alignments.fa.transdecoder.cds'

#indexing the peptide fasta file
cmd = 'samtools faidx %s' % pepFasta
runCMD(cmd)
print(cmd)

#indexing the CDS fasta file
cmd = 'samtools faidx %s' % cdsFasta
runCMD(cmd)
print(cmd)

samtools faidx /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/transdecoder/version_2/results/pasa-lite_ZoeyTrinity.pasa_assembled_alignments.fa.transdecoder.pep
samtools faidx /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/transdecoder/version_2/results/pasa-lite_ZoeyTrinity.pasa_assembled_alignments.fa.transdecoder.cds


### Getting the FASTA 

In [54]:
#using the ID file generated above to extract from the fasta file
#__Example Usage__
# xargs samtools faidx test.fa < names.txt

##PEPTIDE FASTA
IDFile = nonRedundDir + 'TotalSet_NoRMSingleExons_AllMultiExons_IDs.txt'
outFasta = IDFile.replace('_IDs.txt','_pep.fa')

cmd = 'xargs samtools faidx %s < %s > %s' % (pepFasta,IDFile,outFasta)
print(cmd)
#runCMD(cmd)

cmd = 'samtools faidx %s' % outFasta
print(cmd)
#runCMD(cmd)

##CDS FASTA
outFasta = IDFile.replace('_IDs.txt','_cds.fa')

cmd = 'xargs samtools faidx %s < %s > %s' % (cdsFasta,IDFile,outFasta)
print(cmd)
#runCMD(cmd)

cmd = 'samtools faidx %s' % outFasta
print(cmd)
#runCMD(cmd)

xargs samtools faidx /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/transdecoder/version_2/results/pasa-lite_ZoeyTrinity.pasa_assembled_alignments.fa.transdecoder.pep < /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/NonRedundant_NoRMIntersect_FilteredGeneSet/version_2/TotalSet_NoRMSingleExons_AllMultiExons_IDs.txt > /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/NonRedundant_NoRMIntersect_FilteredGeneSet/version_2/TotalSet_NoRMSingleExons_AllMultiExons_pep.fa
samtools faidx /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/NonRedundant_NoRMIntersect_FilteredGeneSet/version_2/TotalSet_NoRMSingleExons_AllMultiExons_pep.fa
xargs samtools faidx /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/transdecoder/version_2/results/pasa-lite_ZoeyTrinity.pasa_assembled_alignments.fa.transdecoder.cds < /home/ampend/links/kidd-lab/ampend-projects/Zoey_Genome_Project/rna-seq/NonRedund

## How many genes are there in the final set?

In [55]:
lineCount = count_lines(IDFile)
print('%i transcripts in the final, nonredundant, nonRM transcript set' % lineCount)

24891 transcripts in the final, nonredundant, nonRM transcript set


In [212]:
count = 0
for gene in intDict.keys():
    if intDict[gene][8] != 'Fail':
        if intDict[gene][6] > 0.25:
            coord = intDict[gene][7][0]+':'+str(intDict[gene][7][1])+'-'+str(intDict[gene][7][2])
            print(gene,'\n',coord,'\n','BP overlap =', intDict[gene][5], '\nProp Overlap of Exon with RM =', intDict[gene][6],'\n')
            count+=1
            if count > 10:
                break

ID=Gene.148105::chr2.g22285.i1::g.148105::m.148105;chr2.g22285;ORF 
 chr2:56821343-56824318 
 BP overlap = 106 
Prop Overlap of Exon with RM = 0.2994350282485876 

ID=Gene.333527::chr5.g24367.i1::g.333527::m.333527;chr5.g24367;ORF 
 chr5:56932344-56933172 
 BP overlap = 132 
Prop Overlap of Exon with RM = 0.42718446601941745 

ID=Gene.417427::chr9.g22752.i1::g.417427::m.417427;chr9.g22752;ORF 
 chr9:40426435-40427940 
 BP overlap = 216 
Prop Overlap of Exon with RM = 0.47058823529411764 

ID=Gene.362173::chr6.g12517.i1::g.362173::m.362173;chr6.g12517;ORF 
 chr6:22521552-22522615 
 BP overlap = 178 
Prop Overlap of Exon with RM = 0.4529262086513995 

ID=Gene.192525::chr21.g3854.i1::g.192525::m.192525;chr21.g3854;ORF 
 chr21:20571489-20572755 
 BP overlap = 163 
Prop Overlap of Exon with RM = 0.4054726368159204 

ID=Gene.162622::chr20.g9661.i1::g.162622::m.162622;chr20.g9661;ORF 
 chr20:25324980-25325645 
 BP overlap = 151 
Prop Overlap of Exon with RM = 0.4793650793650794 

ID=Gene.1382