## Select best dammit annotations for a trinity transcriptome. Create tx2gene maps for use with tximport; aggregate contigs with identical "best" annotations

In [1]:
import pandas as pd
# requires dammit env (mine-->source activate dammit), or dammit installed in main env
# if using an env: if you haven't done so yet, first install an ipy kernel in dammit env:
# ipython kernel install --user --name dammit
# then start new notebook using "dammit" kernel, or for existing nb, select "switch kernel" in the Kernel menu
# voila! :D
from dammit.fileio.gff3 import GFF3Parser

#### Link Dammit and Trinity names
Use Dammit Namemap to create Dammit_Name : Trinity Transcript/Gene map

In [2]:
# Dammit's namemap CSV: column 'original' holds the (long) Trinity name of
# each contig and column 'renamed' holds the name dammit assigned to it.
namemap = "dammit_w_nr/Trinity.fasta.dammit.namemap.csv"
trin2dammit = pd.read_csv(namemap)

# create trinity gene: trans map while we're at it
# Transcript ID = text before the first space of the original name;
# gene ID = transcript ID with the last '_i...' (isoform) suffix removed.
# `n` is passed by keyword: it is keyword-only for str.split / str.rsplit in
# pandas >= 2.0, where the positional form raises a TypeError.
trin2dammit['Trinity_transcript'] = trin2dammit['original'].str.split(' ', n=1).str[0]
trin2dammit['Trinity_gene'] = trin2dammit['Trinity_transcript'].str.rsplit('_i', n=1).str[0]

# drop 'original' (long) trinity name; rename dammit name column
# (index=str also converts the row labels to strings, exactly as before)
trin2dammit = (
    trin2dammit
    .rename(index=str, columns={"renamed": "Dammit_transcript"})
    .drop(columns=['original'])
)

#### Parse Dammit GFF3
Use Dammit GFF3 to read in annotations, and link with Trinity Names

In [3]:
# Path to the annotation GFF3 produced by dammit.
annots_gff3_file = "dammit_w_nr/Trinity.fasta.dammit.gff3"

# Read the whole GFF3 in one go; the result is merged with the namemap below.
annots = GFF3Parser(
    filename=annots_gff3_file,
).read()
#print(annots.shape)
#annots.head()

  dtype=dict(self.columns)):
  dtype=dict(self.columns)):


#### Merge annotation gff3 with dammit-trinity namemap

In [4]:
# Outer join of the name map (key: Dammit_transcript) with the annotations
# (key: seqid), so rows from either side without a partner are kept.
annotsTrin = trin2dammit.merge(
    annots,
    how='outer',
    left_on="Dammit_transcript",
    right_on="seqid",
)
#print(annotsTrin.shape)
#annotsTrin.head()

In [5]:
# Count transcripts / annotated transcripts and sanity-check the merge:
# the merged table must contain exactly as many distinct dammit transcript
# names as the name map does.
numTranscripts = trin2dammit['Dammit_transcript'].nunique()
numWithAnnots = annots['seqid'].nunique()
numNoAnnots = numTranscripts - numWithAnnots
numMergedTranscripts = annotsTrin['Dammit_transcript'].nunique()

if numMergedTranscripts == numTranscripts:
    print('Of {} total transcripts:'.format(numTranscripts))
    print('    {} transcripts have at least one annotation'.format(numWithAnnots))
    print('    {} transcripts have no annotations'.format(numNoAnnots))
else:
    print('something went wrong during merge')

Of 150663 total transcripts:
    70012 transcripts have at least one annotation
    80651 transcripts have no annotations


#### Select annotations with best e-values

Create an annotation table (written as a tab-separated file) containing, for each transcript, 1. the best e-value hit and 2. the best Obimac hit.

In [6]:
# Preview the first rows of the merged name map + annotation table.
annotsTrin.head()

Unnamed: 0,Dammit_transcript,Trinity_transcript,Trinity_gene,seqid,source,type,start,end,score,strand,...,ID,Name,Note,Parent,Target,accuracy,bitscore,database,env_coords,trunc
0,Transcript_0,TRINITY_DN49110_c0_g1_i1,TRINITY_DN49110_c0_g1,,,,,,,,...,,,,,,,,,,
1,Transcript_1,TRINITY_DN49156_c0_g1_i1,TRINITY_DN49156_c0_g1,,,,,,,,...,,,,,,,,,,
2,Transcript_2,TRINITY_DN49121_c0_g1_i1,TRINITY_DN49121_c0_g1,,,,,,,,...,,,,,,,,,,
3,Transcript_3,TRINITY_DN49116_c0_g1_i1,TRINITY_DN49116_c0_g1,Transcript_3,transdecoder,CDS,173.0,482.0,,+,...,cds.Transcript_3.p2,,,Transcript_3.p2,,,,,,
4,Transcript_3,TRINITY_DN49116_c0_g1_i1,TRINITY_DN49116_c0_g1,Transcript_3,transdecoder,exon,0.0,482.0,,+,...,Transcript_3.p2.exon1,,,Transcript_3.p2,,,,,,


In [7]:
# --- Best Obimac hit per transcript ---
# Sort by transcript, then by ascending 'score' (treated as the e-value here),
# keep hits with score < 1e-05, and keep only the first (lowest-score) row of
# each transcript.
Obimac = annotsTrin[annotsTrin['database'] == "Obimac_refseq_protein.fa.gz"]
Obimac = (
    Obimac
    .sort_values(by=['Trinity_transcript', 'score'], ascending=True)
    .query('score < 1e-05')
    .drop_duplicates(subset='Trinity_transcript')
    [['Trinity_transcript', 'Name', 'score', 'start', 'end']]
)
# Prefix the hit columns so they stay distinguishable after merging.
Obimac = Obimac.rename(
    index=str,
    columns={'Name': 'Obimac_Name', 'score': 'Obimac_score', 'start': 'Obimac_start', 'end': 'Obimac_end'},
)
#print(Obimac.shape)
#Obimac.head()

# --- Best hit per transcript that is *not* an Obimac hit (same selection rule) ---
noObimac = annotsTrin[annotsTrin['database'] != 'Obimac_refseq_protein.fa.gz']
bestEval = (
    noObimac
    .sort_values(by=['Trinity_transcript', 'score'], ascending=True)
    .query('score < 1e-05')
    .drop_duplicates(subset='Trinity_transcript')
)
print(bestEval.shape)
#bestEval.head()

# --- Combine: one row per annotated transcript ---
# Outer join, so a transcript that only has an Obimac hit is kept as well.
bestAnnots = pd.merge(bestEval, Obimac, how='outer', left_on="Trinity_transcript", right_on="Trinity_transcript")
print(bestAnnots.shape)
# TSV of the best non-Obimac hit plus the best Obimac hit (Obimac_* columns)
bestAnnots.to_csv("dammit_w_nr/dammit_bestEvalpertranscript_onlyannots.tsv", index=False, sep='\t')

# --- Add the unannotated transcripts back in ---
# Trinity_gene / Dammit_transcript are NaN in bestAnnots for rows that came
# only from the Obimac table, so drop both columns and take the real values
# from trin2dammit through the merge instead.
bestAnnots = bestAnnots.drop(columns=['Trinity_gene', "Dammit_transcript"])
bestAnnotFull = trin2dammit.merge(bestAnnots, how='outer', on=["Trinity_transcript"])
print(bestAnnotFull.shape)
bestAnnotFull.to_csv("dammit_w_nr/dammit_bestEvalperTranscript.tsv", index=False, sep='\t')


(54896, 22)
(55001, 26)
(150663, 26)


In [8]:
# Preview the last rows of the per-transcript table (outer merge with trin2dammit).
bestAnnotFull.tail()

Unnamed: 0,Dammit_transcript,Trinity_transcript,Trinity_gene,seqid,source,type,start,end,score,strand,...,Target,accuracy,bitscore,database,env_coords,trunc,Obimac_Name,Obimac_score,Obimac_start,Obimac_end
150658,Transcript_150658,TRINITY_DN46499_c0_g1_i1,TRINITY_DN46499_c0_g1,,,,,,,,...,,,,,,,,,,
150659,Transcript_150659,TRINITY_DN46470_c0_g1_i1,TRINITY_DN46470_c0_g1,,,,,,,,...,,,,,,,,,,
150660,Transcript_150660,TRINITY_DN46465_c0_g1_i1,TRINITY_DN46465_c0_g1,Transcript_150660,LAST,translated_nucleotide_match,1.0,247.0,1.5e-32,+,...,UniRef90_A0CRS7 2 85 +,,,uniref90,,,,,,
150661,Transcript_150661,TRINITY_DN46482_c0_g1_i1,TRINITY_DN46482_c0_g1,Transcript_150661,LAST,translated_nucleotide_match,2.0,101.0,2.9e-17,+,...,UniRef90_Q05973 1489 1522 +,,,uniref90,,,gi|961088763|ref|XP_014771226.1| PREDICTED: so...,1.9e-13,1.0,33.0
150662,Transcript_150662,TRINITY_DN46441_c0_g1_i1,TRINITY_DN46441_c0_g1,,,,,,,,...,,,,,,,,,,


The above keeps all identified Trinity transcripts and reports, for each one, the best annotation (determined by e-value) as well as the best Obimac annotation (also determined by e-value).

However, there are undoubtedly some duplicate annotations, as there are very likely not that many genes in D. opalescens. For differential expression, we're primarily interested in functional changes, so spreading the expression of one gene across several contigs that share the exact same best annotation is not desirable. Can we collapse contigs by best annotation?

#### Create Tx2Gene files
Create Transcript to Gene (tx2gene) files that assign transcripts to their annotated gene names,
rather than just the Trinity gene name. If we want to keep unannotated transcripts, we can retain 
Trinity gene name for unannotated transcripts. These can be used with tximport to aggregate transcript-level
quantification to gene-level counts for differential expression analysis.

In [9]:
# tx2gene from the best non-Obimac hit: (Trinity transcript, Name) pairs,
# leaving out rows where either value is missing.
#bestAnnots['Name'].nunique() # just checking that there are duplicate 'Name' column values. There are.
bestAnnot_tx2gene = bestAnnots.loc[:, ['Trinity_transcript', 'Name']].dropna()
bestAnnot_tx2gene.to_csv('dammit_w_nr/bestEval_tx2gene.txt', index=False, sep='\t')

# Same thing, but with the best Obimac annotation as the gene name.
bestObimac_tx2gene = bestAnnots.loc[:, ['Trinity_transcript', 'Obimac_Name']].dropna()
bestObimac_tx2gene.to_csv('dammit_w_nr/bestObimac_tx2gene.txt', index=False, sep='\t')

Let's make a version that includes unannotated transcripts. Use Trinity gene info as gene name

In [10]:
# Rebuild the full per-transcript table (easier for testing).
bestAnnotFull = bestAnnots.merge(trin2dammit, how='outer', on=["Trinity_transcript"])
print(bestAnnotFull.shape)
#bestAnnotFull.head()

# How many transcripts have a name from the best-e-value hit alone?
numAnnots = len(bestAnnotFull['Name'].dropna())
print('# annotations using only best Evalue: ' + str(numAnnots))

# Fill the gaps in 'Name' with the Obimac annotation, then count again.
bestAnnotFull['Name'] = bestAnnotFull['Name'].fillna(value=bestAnnotFull['Obimac_Name'])
numAnnots_wOb = len(bestAnnotFull['Name'].dropna())
print('# annotations using best Evalue AND best Obimac Evalue: ' + str(numAnnots_wOb))

# tx2gene restricted to transcripts that now have a name.
bestEandObimac_tx2gene = bestAnnotFull.loc[:, ['Trinity_transcript', 'Name']].dropna()
bestEandObimac_tx2gene.to_csv('dammit_w_nr/bestEval_Obimac_tx2gene.txt', index=False, sep='\t')

(150663, 26)
# annotations using only best Evalue: 54896
# annotations using best Evalue AND best Obimac Evalue: 55001


In [11]:
# Any transcript still without a 'Name' falls back to its Trinity gene name,
# so every transcript ends up in this tx2gene table.
bestAnnotFull['Name'] = bestAnnotFull['Name'].fillna(value=bestAnnotFull['Trinity_gene'])
bestAnnotFull_tx2gene = bestAnnotFull[['Trinity_transcript', 'Name']]
print(bestAnnotFull_tx2gene.shape)
bestAnnotFull_tx2gene.to_csv(
    'dammit_w_nr/bestEvalFull_tx2gene.txt',
    index=False,
    sep='\t',
)
bestAnnotFull.head()

(150663, 2)


Unnamed: 0,Trinity_transcript,seqid,source,type,start,end,score,strand,phase,Dbxref,...,bitscore,database,env_coords,trunc,Obimac_Name,Obimac_score,Obimac_start,Obimac_end,Dammit_transcript,Trinity_gene
0,TRINITY_DN10002_c0_g1_i1,Transcript_55709,LAST,translated_nucleotide_match,1.0,400.0,2.9999999999999997e-109,-,,,...,,uniref90,,,,,,,Transcript_55709,TRINITY_DN10002_c0_g1
1,TRINITY_DN10005_c0_g1_i1,Transcript_55708,LAST,translated_nucleotide_match,7.0,160.0,2.6e-14,+,,,...,,uniref90,,,,,,,Transcript_55708,TRINITY_DN10005_c0_g1
2,TRINITY_DN10017_c0_g1_i1,Transcript_55702,HMMER,protein_hmm_match,141.0,345.0,9.9e-16,,,"""Pfam:PF00085.16""",...,,,127 345,,,,,,Transcript_55702,TRINITY_DN10017_c0_g1
3,TRINITY_DN10022_c0_g1_i1,Transcript_55699,LAST,translated_nucleotide_match,97.0,544.0,2.2999999999999997e-57,-,,,...,,uniref90,,,,,,,Transcript_55699,TRINITY_DN10022_c0_g1
4,TRINITY_DN10022_c0_g1_i2,Transcript_55700,LAST,translated_nucleotide_match,97.0,187.0,1.8e-11,-,,,...,,uniref90,,,gi|961088520|ref|XP_014771140.1| PREDICTED: po...,1.4e-11,32.0,62.0,Transcript_55700,TRINITY_DN10022_c0_g1


In [12]:
#---> WORKING HERE NOW
# Idea: collapse annotations using a single "best" annotation database --> I think OrthoDB?
# Alternatively, we could check whether ANY of a contig's annotations are the same as another contig's.
# Collapsing on any match would mean collapsing on the "Name" column rather than on score... can try that!
# Assuming all of the hits have decent e-values, this could work.
# What about the best e-value hit in OrthoDB (or any other db -- we just want to choose a primary one for collapsing purposes)?
# ... what we really want is to collapse whenever ANY annotations are the same.

In [13]:
# Gene-name priority built in this cell:
#   1. best OrthoDB hit, 2. best non-Obimac hit, 3. best Obimac hit, 4. Trinity gene name.
# "Best" = lowest 'score' (treated as the e-value) among hits with score < 1e-05.

# best OrthoDB hit (all annotation columns kept; its 'Name' is the primary name)
ortho = annotsTrin[annotsTrin['database'] == 'OrthoDB']
bestOrtho = ortho.sort_values(by=['Trinity_transcript', 'score'], ascending=True).query('score < 1e-05').drop_duplicates(subset='Trinity_transcript')
numAnnotsOrtho = len(list(bestOrtho.Name.dropna()))
print('# annotations using only best Ortho: ' + str(numAnnotsOrtho))

# remake bestEval (for better testing)
noObimac = annotsTrin[annotsTrin['database'] != 'Obimac_refseq_protein.fa.gz']
bestEval = noObimac.sort_values(by=['Trinity_transcript', 'score'], ascending=True).query('score < 1e-05').drop_duplicates(subset='Trinity_transcript')
# subset by columns, and prefix them so they don't collide with the OrthoDB columns
bestEval = bestEval[['Trinity_transcript','Name', 'score', 'start', 'end']]
bestEval = bestEval.rename(index=str, columns={'Name': 'bEval_Name', 'score': 'bEval_score', 'start': 'bEval_start', 'end':'bEval_end'})
#merge
bestOrthoEv = pd.merge(bestEval, bestOrtho, how='outer', on="Trinity_transcript")
numAnnotsOrthoEv = len(list(bestOrthoEv.Trinity_transcript.dropna()))
print('# annotations using only best Ortho + best Eval: ' + str(numAnnotsOrthoEv))

# Build the renamed best-Obimac table under a cell-local name instead of
# reusing the global `Obimac`: later cells rebind `Obimac` to the un-renamed,
# full-column table, and re-running this cell after them would produce
# suffixed columns and fail on the drop() below.
obimacHits = annotsTrin[annotsTrin['database'] == "Obimac_refseq_protein.fa.gz"]
obimacHits = obimacHits.sort_values(by=['Trinity_transcript', 'score'], ascending=True).query('score < 1e-05').drop_duplicates(subset='Trinity_transcript')[['Trinity_transcript','Name', 'score', 'start', 'end']]
obimacHits = obimacHits.rename(index=str, columns={'Name': 'Obimac_Name', 'score': 'Obimac_score', 'start': 'Obimac_start', 'end':'Obimac_end'})

#merge with Obimac hits
bestOrthoEvOb = pd.merge(bestOrthoEv, obimacHits, how='outer', on="Trinity_transcript")
numAnnots_wEv_Ob = len(list(bestOrthoEvOb.Trinity_transcript.dropna()))
print('# annotations using best OrthoDB, Evalue, best Obimac Evalue: ' + str(numAnnots_wEv_Ob))

# ok, now add in the unannotated hits
# Trinity_gene / Dammit_transcript are NaN for rows without an OrthoDB hit,
# so remove them first and take the real values from trin2dammit in the merge
bestOrthoEvOb = bestOrthoEvOb.drop(columns=['Trinity_gene', "Dammit_transcript"])
bestOrthoEvFull = pd.merge(trin2dammit, bestOrthoEvOb, how='outer', on=["Trinity_transcript"])
print(bestOrthoEvFull.shape)
bestOrthoEvFull.to_csv("dammit_w_nr/dammit_bestOrtho_Ev_Ob_perTranscript.tsv", index=False, sep='\t')


# To generate the appropriate tx2gene files:
# fill NA's in the 'Name' column (OrthoDB name) with 1. best Eval, 2. Obimac, 3. Trinity gene name
bestOrthoEvFull['Name'] = bestOrthoEvFull['Name'].fillna(value=bestOrthoEvFull['bEval_Name'])
bestOrthoEvFull['Name'] = bestOrthoEvFull['Name'].fillna(value=bestOrthoEvFull['Obimac_Name'])
bestOrthoEvFull['Name'] = bestOrthoEvFull['Name'].fillna(value=bestOrthoEvFull['Trinity_gene'])
# now just take the Transcript, Name cols:
bestOrthoEvFull_tx2gene = bestOrthoEvFull[['Trinity_transcript', 'Name']]
bestOrthoEvFull_tx2gene.to_csv('dammit_w_nr/bestOrtho_EvOb_tx2gene.txt', index=False, sep = '\t')

# annotations using only best Ortho: 41250
# annotations using only best Ortho + best Eval: 54896
# annotations using best OrthoDB, Evalue, best Obimac Evalue: 55001
(150663, 30)


In [14]:
# Preview the last rows of the OrthoDB-first tx2gene table.
bestOrthoEvFull_tx2gene.tail()


Unnamed: 0,Trinity_transcript,Name
150658,TRINITY_DN46499_c0_g1_i1,TRINITY_DN46499_c0_g1
150659,TRINITY_DN46470_c0_g1_i1,TRINITY_DN46470_c0_g1
150660,TRINITY_DN46465_c0_g1_i1,RL17_YARLI
150661,TRINITY_DN46482_c0_g1_i1,UniRef90_Q05973
150662,TRINITY_DN46441_c0_g1_i1,TRINITY_DN46441_c0_g1


In [15]:
# Let's try Ob annotation 1st, then best Eval, then Trinity Gene name.

# Best Obimac hit per transcript. All annotation columns are kept and nothing
# is renamed, because the Obimac 'Name' is the primary gene name this time.
Obimac = annotsTrin[annotsTrin['database'] == "Obimac_refseq_protein.fa.gz"]
Obimac = (
    Obimac
    .sort_values(by=['Trinity_transcript', 'score'], ascending=True)
    .query('score < 1e-05')
    .drop_duplicates(subset='Trinity_transcript')
)
numAnnotsOb = len(Obimac['Trinity_transcript'].dropna())
print('# annotations using only best Obimac: ' + str(numAnnotsOb))

# Best non-Obimac hit per transcript, rebuilt here (for better testing) and
# reduced to a few bEval_* columns.
noObimac = annotsTrin[annotsTrin['database'] != 'Obimac_refseq_protein.fa.gz']
bestEval = (
    noObimac
    .sort_values(by=['Trinity_transcript', 'score'], ascending=True)
    .query('score < 1e-05')
    .drop_duplicates(subset='Trinity_transcript')
    [['Trinity_transcript', 'Name', 'score', 'start', 'end']]
)
bestEval = bestEval.rename(
    index=str,
    columns={'Name': 'bEval_Name', 'score': 'bEval_score', 'start': 'bEval_start', 'end': 'bEval_end'},
)

# Outer merge: transcripts with an Obimac hit, a non-Obimac hit, or both.
bestObEv = Obimac.merge(bestEval, how='outer', on="Trinity_transcript")
numAnnotsObEv = len(bestObEv['Trinity_transcript'].dropna())
print('# annotations using only best Obimac + best Eval: ' + str(numAnnotsObEv))

# Add the unannotated transcripts: drop the partly-NaN gene / dammit name
# columns first, then take them from trin2dammit through the merge.
bestObEv = bestObEv.drop(columns=['Trinity_gene', "Dammit_transcript"])
bestObEvFull = trin2dammit.merge(bestObEv, how='outer', on=["Trinity_transcript"])
print(bestObEvFull.shape)
bestObEvFull.to_csv("dammit_w_nr/dammit_bestObEvannot.tsv", index=False, sep='\t')

# tx2gene: 'Name' (Obimac) falls back to 1. the best non-Obimac hit, then
# 2. the Trinity gene name.
bestObEvFull['Name'] = bestObEvFull['Name'].fillna(value=bestObEvFull['bEval_Name'])
bestObEvFull['Name'] = bestObEvFull['Name'].fillna(value=bestObEvFull['Trinity_gene'])
# now just take the Transcript, Name cols:
bestObEvFull_tx2gene = bestObEvFull[['Trinity_transcript', 'Name']]
bestObEvFull_tx2gene.to_csv('dammit_w_nr/bestObEv_tx2gene.txt', index=False, sep='\t')




# annotations using only best Obimac: 21254
# annotations using only best Obimac + best Eval: 55001
(150663, 26)


In [16]:
# That worked better! Let's try: Ob -> Uniref90 --> bestEval --> Trinity Gene
# "Best" = lowest 'score' (treated as the e-value) among hits with score < 1e-05.

# best Obimac hit (all columns kept, not renamed: its 'Name' is the primary gene name)
Obimac = annotsTrin[annotsTrin['database'] == "Obimac_refseq_protein.fa.gz"]
Obimac = Obimac.sort_values(by=['Trinity_transcript', 'score'], ascending=True).query('score < 1e-05').drop_duplicates(subset='Trinity_transcript')
numAnnotsOb = len(list(Obimac.Trinity_transcript.dropna()))
print('# annotations using only best Obimac: ' + str(numAnnotsOb))

# best uniref90 hit, reduced to a few ur90_* columns
ur90 = annotsTrin[annotsTrin['database'] == "uniref90"]
ur90 = ur90.sort_values(by=['Trinity_transcript', 'score'], ascending=True).query('score < 1e-05').drop_duplicates(subset='Trinity_transcript')[['Trinity_transcript','Name', 'score', 'start', 'end']]
ur90 = ur90.rename(index=str, columns={'Name': 'ur90_Name', 'score': 'ur90_score', 'start': 'ur90_start', 'end':'ur90_end'})

# best hit from any remaining database (neither Obimac nor uniref90)
noObimac = annotsTrin[annotsTrin['database'] != 'Obimac_refseq_protein.fa.gz']
noObUr90 = noObimac[noObimac['database']!= 'uniref90']
bestE = noObUr90.sort_values(by=['Trinity_transcript', 'score'], ascending=True).query('score < 1e-05').drop_duplicates(subset='Trinity_transcript')
# subset by columns
bestE = bestE[['Trinity_transcript','Name', 'score', 'start', 'end']]
bestE = bestE.rename(index=str, columns={'Name': 'bEval_Name', 'score': 'bEval_score', 'start': 'bEval_start', 'end':'bEval_end'})


#ok, now we have Ob, ur90, and bestE. merge.
bestOb90 = pd.merge(Obimac, ur90, how='outer', on="Trinity_transcript")
numAnnotsOb90 = len(list(bestOb90.Trinity_transcript.dropna()))
print('# annotations using only best Obimac + best ur90: ' + str(numAnnotsOb90))
bestOb90e =  pd.merge(bestOb90, bestE, how='outer', on="Trinity_transcript")
numAnnotsOb90e = len(list(bestOb90e.Trinity_transcript.dropna()))
print('# annotations using best Obimac + best ur90 + best Eval: ' + str(numAnnotsOb90e))

# tx2gene of only annotated contigs.
# 'Name' in bestOb90e comes from the Obimac hit alone, so apply the
# ur90 -> bestEval fallbacks before exporting; otherwise every transcript
# without an Obimac hit is written with an empty gene name. bestOb90e itself
# is left unmodified so the tables written further down are unchanged.
annotatedName = bestOb90e['Name'].fillna(bestOb90e['ur90_Name']).fillna(bestOb90e['bEval_Name'])
# rows that still lack a name are dropped, as in the other annotated-only tx2gene exports
bestOb90e_tx2gene = bestOb90e[['Trinity_transcript']].assign(Name=annotatedName).dropna()
bestOb90e_tx2gene.to_csv("dammit_w_nr/dammit_bestOb_ur90Ev_onlyAnnots_tx2gene.txt", index=False, sep='\t')

#ok, now add in the unannotated hits
# Trinity_gene / Dammit_transcript are NaN for rows without an Obimac hit,
# so remove them first and take the real values from trin2dammit in the merge
bestOb90e.drop(columns=['Trinity_gene', "Dammit_transcript"], inplace=True)
bestOb90eFull = pd.merge(trin2dammit, bestOb90e, how='outer', on=["Trinity_transcript"])
print(bestOb90eFull.shape)
bestOb90eFull.to_csv("dammit_w_nr/dammit_bestOb_ur90Evannot.tsv", index=False, sep='\t')


# To generate the appropriate tx2gene files:
# fill NA's in the 'Name' column (Obimac name) with 1. uniref90, 2. best remaining Eval, 3. Trinity gene name
bestOb90eFull['Name'] = bestOb90eFull['Name'].fillna(value=bestOb90eFull['ur90_Name'])
bestOb90eFull['Name'] = bestOb90eFull['Name'].fillna(value=bestOb90eFull['bEval_Name'])
bestOb90eFull['Name'] = bestOb90eFull['Name'].fillna(value=bestOb90eFull['Trinity_gene'])
# now just take the Transcript, Name cols:
bestOb90eFull_tx2gene = bestOb90eFull[['Trinity_transcript', 'Name']]
bestOb90eFull_tx2gene.to_csv('dammit_w_nr/bestOb_ur90Ev_tx2gene.txt', index=False, sep = '\t')


bestOb90e.columns

# annotations using only best Obimac: 21254
# annotations using only best Obimac + best ur90: 51974
# annotations using best Obimac + best ur90 + best Eval: 55001
(150663, 30)


Index(['Trinity_transcript', 'seqid', 'source', 'type', 'start', 'end',
       'score', 'strand', 'phase', 'Dbxref', 'ID', 'Name', 'Note', 'Parent',
       'Target', 'accuracy', 'bitscore', 'database', 'env_coords', 'trunc',
       'ur90_Name', 'ur90_score', 'ur90_start', 'ur90_end', 'bEval_Name',
       'bEval_score', 'bEval_start', 'bEval_end'],
      dtype='object')

In [17]:
#Not used: Aggregate by identical best eval hits:
# Collapse using best Eval hit:
#bestAnnots.groupby('Name',as_index=False)[['Dammit_transcript']].aggregate(lambda x: list(x))
#Obimac.groupby('Name',as_index=False)[['seqid']].aggregate(lambda x: list(x))
# this groups by Name, but is it necessary for what I want? I just need a transcript \t gene table, where transcript
# is the trinity transcript name, and 'gene' is the 'Name' value.