# formerly intersect_counts_with_blast_9-27-2016
- Intersects the filtered assembly with the blast results to annotate contigs with high scoring matches
- Clusters contigs according to their blast result.

In [41]:
import pandas as pd
import numpy as np
import os
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC

In [2]:
# Root data directory; the hit tables and export files below are addressed relative to it.
wd = '/home/bay001/projects/kes_20160307/data/'
# Column names applied to the header-less tabular hit files read below.
# NOTE(review): this looks like the default 12-column BLAST tabular layout (-outfmt 6) —
# confirm against the blastx/diamond commands that produced the files.
blast_head = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch',
              'gapopen', 'qstart', 'qend', 'sstart', 'send',
              'evalue', 'bitscore']

### Load BLAST results from blastx to chicken ensembl protein
- Keep only hits with an e-value below the threshold (set at 1e-10)
- Sort by e-value (ascending)
- Remove duplicate contigs (contigs which map to multiple proteins, only take the lowest e-value)

In [3]:
# Hits with an e-value at or above this threshold are discarded.
evalue_threshold = 1e-10
blast = pd.read_table(os.path.join(wd,'blast/chicken.blastx'),names=blast_head)
print("number of starting hits: {}".format(blast.shape[0]))
# .copy() gives an independent frame, so the later steps never operate on a slice of the raw table.
blast = blast[blast['evalue'] < evalue_threshold].copy()
print("number of filtered for e-value hits: {}".format(blast.shape[0]))
# Sort by evalue (lower is better). A stable sort is required: many hits tie (e.g. evalue 0.0)
# and the default quicksort would pick an arbitrary one of them as the "top hit" below.
# With mergesort, ties keep their order from the input file.
# NOTE(review): presumably the file lists each query's hits best-first — confirm.
blast = blast.sort_values(by='evalue', kind='mergesort')
# keep only the top hit per query
blast = blast.drop_duplicates('qseqid', keep='first')
print("number of filtered hits for de-duplicated contigs: {}".format(blast.shape[0]))
blast = blast.set_index('qseqid')
blast.head()

number of starting hits: 331175
number of filtered for e-value hits: 83725
number of filtered hits for de-duplicated contigs: 55116


Unnamed: 0_level_0,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
qseqid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
unmapped-49-contig_list_contig_207806-0,ENSGALG00000028975,84.913,517,74,2,1549,2,375,888,0.0,853.0
unmapped-49-contig_list_contig_48921-0,ENSGALG00000002638,92.483,439,33,0,6438,5122,1,439,0.0,815.0
unmapped-49-contig_list_contig_33476-0,ENSGALG00000002798,98.047,717,14,0,3247,1097,16,732,0.0,1429.0
unmapped-49-contig_list_contig_67081-0,ENSGALG00000005123,99.714,350,1,0,2639,1590,1,350,0.0,697.0
EC-4AK111_TAGCTT_R1_(paired)_contig_5821-0,ENSGALG00000006425,94.286,315,10,2,54,992,3,311,0.0,559.0


In [4]:
def get_start(row):
    """ Returns the smaller of the row's qstart/qend, i.e. the 1-based start of the hit on the contig """
    qstart, qend = row['qstart'], row['qend']
    if qstart < qend:
        return qstart
    return qend
def get_end(row):
    """ Returns the larger of the row's qstart/qend, i.e. the closed (inclusive) end of the hit on the contig """
    qstart, qend = row['qstart'], row['qend']
    if qend > qstart:
        return qend
    return qstart
def is_reversed(row):
    """ Flags hits whose reported query end precedes the query start: returns 1 for such reversed hits, otherwise 0 """
    return int(row['qend'] < row['qstart'])
def revcomp(seq):
    """ Returns the reverse complement of a nucleotide string, as a plain str """
    return str(Seq(seq).reverse_complement())
    
def get_blast_seq(row):
    """
    Returns the part of the contig sequence that BLAST matched.

    The 1-based, inclusive start/end of the row are converted to a Python slice;
    rows flagged reverse == 1 get the slice reverse-complemented.
    """
    matched = row['seq'][row['start']-1:row['end']]
    if row['reverse'] == 1:
        return revcomp(matched)
    return matched

In [5]:
# Keep only the alignment coordinates and the matched subject id of each top hit,
# then derive orientation-independent start/end plus a reverse flag, one row at a time.
contig2chicken = blast.loc[:,['length','qstart','qend','sstart','send','sseqid']]
for derived_column, row_function in (('start', get_start),
                                     ('end', get_end),
                                     ('reverse', is_reversed)):
    contig2chicken[derived_column] = contig2chicken.apply(row_function,axis=1)
contig2chicken.head()

Unnamed: 0_level_0,length,qstart,qend,sstart,send,sseqid,start,end,reverse
qseqid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
unmapped-49-contig_list_contig_207806-0,517,1549,2,375,888,ENSGALG00000028975,2,1549,1
unmapped-49-contig_list_contig_48921-0,439,6438,5122,1,439,ENSGALG00000002638,5122,6438,1
unmapped-49-contig_list_contig_33476-0,717,3247,1097,16,732,ENSGALG00000002798,1097,3247,1
unmapped-49-contig_list_contig_67081-0,350,2639,1590,1,350,ENSGALG00000005123,1590,2639,1
EC-4AK111_TAGCTT_R1_(paired)_contig_5821-0,315,54,992,3,311,ENSGALG00000006425,54,992,0


In [8]:
# build a header:seq dictionary

def get_full_seq(fasta_path="/home/bay001/projects/kes_20160307/data/kestrel5-reclustered.no-aerv.no-mtdna.no-vec.no-virus.no-bac.200.fasta"):
    """
    Reads a FASTA file into a {record id: sequence string} dictionary.

    Parameters
    ----------
    fasta_path : str
        FASTA file to read. Defaults to the filtered assembly used throughout this notebook.

    Returns
    -------
    dict
        Maps each record's id to its sequence as a plain str. If an id occurs
        more than once, the last record read wins.
    """
    seq_dict = {}
    # The context manager closes the handle even if parsing fails. Plain "r" is used
    # because the "rU" mode is deprecated and rejected by recent Python versions.
    with open(fasta_path, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            seq_dict[record.id] = str(record.seq)
    return seq_dict

seq_dictionary = get_full_seq()

In [9]:
# One row per contig: index = FASTA record id, single column 'seq' = full contig sequence.
# list(...) materialises the (id, seq) pairs: on Python 3 dict.items() is a view object,
# which some pandas versions refuse in the DataFrame constructor.
seq_df = pd.DataFrame(list(seq_dictionary.items()), columns=['id','seq'])
seq_df = seq_df.set_index('id')
seq_df.head()

Unnamed: 0_level_0,seq
id,Unnamed: 1_level_1
unmapped-49-contig_list_contig_164132-0,CTATGAACAACTGGACTACCCTTTTGGAGAACAGCAGGCATTGCCT...
unmapped-49-contig_list_contig_190596-0,CATCTGGAAGGCTTAAGAGCATTTGGAATGTATTATTCAGTTATAT...
unmapped-49-contig_list_contig_69758-0,CTGCCTTCACAAAGCTTCCTGACTGCGTCTGATTTGGGAGGGCGAA...
unmapped-49-contig_list_contig_98789-0,CTGATTGCCCGGTCGTACATCTAAAGCAACATCACCAGGCACAGTT...
unmapped-49-contig_list_contig_251301-0,GGATTTTAAAAGAATATTTCCAATCCACAAAAGCCCACTAAAGTTG...


In [10]:
# Attach the full contig sequence to every annotated contig (left join on the contig id index).
blast_with_seqs = contig2chicken.merge(seq_df, how='left', left_index=True, right_index=True)
blast_with_seqs.head()

Unnamed: 0_level_0,length,qstart,qend,sstart,send,sseqid,start,end,reverse,seq
qseqid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
unmapped-49-contig_list_contig_207806-0,517,1549,2,375,888,ENSGALG00000028975,2,1549,1,CTGGAGCCCCCCTGTCCTGCTCATTAACCTGGGGGGTCAGCACCGG...
unmapped-49-contig_list_contig_48921-0,439,6438,5122,1,439,ENSGALG00000002638,5122,6438,1,GGGGGAAATGCAACTGCCAGACCTTGTTCCTGAGGATGCATCTATT...
unmapped-49-contig_list_contig_33476-0,717,3247,1097,16,732,ENSGALG00000002798,1097,3247,1,AGCTGCGAGTAATGAGCCAGCCTGGGAAATCAAAAAGACAGACAGT...
unmapped-49-contig_list_contig_67081-0,350,2639,1590,1,350,ENSGALG00000005123,1590,2639,1,AAGTCATGAAGATAAACCAGTCAGTTTATGCTGTCTGCAAATATAA...
EC-4AK111_TAGCTT_R1_(paired)_contig_5821-0,315,54,992,3,311,ENSGALG00000006425,54,992,0,CGCCGCACCGCTCCGCCGGACGCGCCGCACCGCCGCAGGGGCGCAG...


In [11]:
# Row-wise: reduce each contig to the region BLAST aligned (reverse-complemented for
# rows with reverse == 1, see get_blast_seq) and store it in a new column.
blast_with_seqs['blast_seq_only'] = blast_with_seqs.apply(get_blast_seq,axis=1)

# Sort by sseqid to see whether contigs overlap, or can be patched together, within the same gene
- need to resolve areas where multiple contigs map to the same gene.
- traditionally we've clustered them using cd-hit-est
- need to subset each gene then.

In [17]:
# Order the table by matched gene id so contigs hitting the same gene sit next to each other.
blast_with_seqs = blast_with_seqs.sort_values('sseqid')
blast_with_seqs.head()

Unnamed: 0_level_0,length,qstart,qend,sstart,send,sseqid,start,end,reverse,seq,blast_seq_only
qseqid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
unmapped-49-contig_list_contig_81985-0,440,569,1888,66,505,ENSGALG00000000003,569,1888,0,AGTTAACACACGAGACTCTGTCCTGGTTAGTCACACTTTAATTCAA...,TTTTCTCTCTGTTATACAGAGGAGCCAATATACTGTTACACACCAC...
unmapped-49-contig_list_contig_48481-0,44,2267,2136,27,70,ENSGALG00000000004,2136,2267,1,GGGGGCTGTATCCAGCCCATGGGCCGTAGTTTGAGGACCCCTGAAT...,GCAGGTAACTTTTCTGAGCAAGCAGTTGAAAACTTTCCATCTTATA...
unmapped-49-contig_list_contig_35267-0,37,114,4,79,115,ENSGALG00000000004,4,114,1,CTAAAAGGCCTCTAAGGAATCAAAGTTTTTTTCTGGTCGAATATAT...,GAAACACACATTATCCATACCTTCAAAGAAGACTTTTATGGAGAAA...
unmapped-49-contig_list_contig_35266-0,94,283,2,27,120,ENSGALG00000000004,2,283,1,ATTGAATTGCTGAAATGAGCGCCTCTAAGGAATCAAAGTTTTTTTC...,ACCGCTAACTTTTCTGACCAAGTAGTTGAAAGCTTTCCGTCTGATA...
EC-4AK111_TAGCTT_R1_(paired)_contig_5796-0,81,3,245,76,156,ENSGALG00000000004,3,245,0,GGAAATCAGTGGAAACACACATTATCCATACCTTCAAAGAAGACTT...,AAATCAGTGGAAACACACATTATCCATACCTTCAAAGAAGACTTTT...


In [106]:
# Sequences like these two need to be clustered together: they hit the same gene and shouldn't be treated as separate.
# (.loc replaces the deprecated .ix indexer, which no longer exists in current pandas.)
print(blast_with_seqs.loc['unmapped-49-contig_list_contig_48481-0', 'blast_seq_only'])
print(blast_with_seqs.loc['unmapped-49-contig_list_contig_35266-0', 'blast_seq_only'])
# Number of unclustered de-duped contigs annotated (should be the same as above)
len(blast_with_seqs['sseqid'])

GCAGGTAACTTTTCTGAGCAAGCAGTTGAAAACTTTCCATCTTATATCTGTACTGGTATATACTATGGATGGTCCTGTGCTGGAAATGGAGATGTGCATAAAATGGTTTTGAACACAGGATGGAATCCTTTC
ACCGCTAACTTTTCTGACCAAGTAGTTGAAAGCTTTCCGTCTGATATCTCTACTGGTATATACTATGGATGGGCCTGTGTTGGAAATGGAGATGTGCATAAAATGGTTTTGAGCATAGGATGGAATCCTTTCTATAAGAATATTAAGAAATCAGTGGAAACACACATTATCCATACCTTCAAAGAAGACTTTTATGGAGAAATTCTTAGTATAGTCATAATTGGATATATTCGACCAGAAAAAAACTTTGATTCCTTAGAGGCGCTCATTTCAGCAATTCAA


55116

In [102]:
# Number of genes partially annotated
# Re-key the table by gene id (sseqid); the contig id moves into an ordinary 'qseqid' column.
X = blast_with_seqs.reset_index()
X = X.set_index(['sseqid'])
len(set(X.index))

12756

In [90]:
def group_by_geneid_and_cluster():
    """
    This function groups sequences by their gene id and runs cd-hit-est to cluster the groups once more.
    """
    unclustered_dir = '/home/bay001/projects/kes_20160307/data/unclustered_genes'
    clustered_dir = '/home/bay001/projects/kes_20160307/data/clustered_genes'
    x = 0  # just a counter that prints a dot every 100 entries (use tqdm for this!!)
    log = {} # capture the log file to look at maybe
    skip = False  # If True, don't run cd hit 
    for gene in list(set(X.index)):
        outfile = os.path.join(unclustered_dir,'{}.fasta'.format(gene))
        clustered_file = os.path.join(clustered_dir,'{}.clustered.fasta'.format(gene))
        dfx = X.ix[gene]
        if(type(dfx)==pd.core.series.Series): # if dfx is a series, we just have one sequence in gene.
            dfx = pd.DataFrame(X.ix[gene]).T
            skip = True
        records = [] # list of records per gene
        try:
            for col,row in dfx.iterrows():
                i = row['qseqid']
                j = row['blast_seq_only']
                record = SeqRecord(Seq(j,IUPAC.IUPACUnambiguousDNA),id=i,description="unclustered_contig_of_{}".format(gene))
                records.append(record)
            SeqIO.write(records,outfile,"fasta")
            if(skip!=True):
                capture = ! /projects/ps-yeolab3/bay001/software/cdhit/cd-hit-est -i $outfile -o $clustered_file -c 0.95
                log[gene] = capture
            else:
                ! mv $outfile $clustered_file
            x = x + 1
            if x%100 == 0:
                print('.'),
        except Exception as e:
            print(e)
            print(gene)
        
log = group_by_geneid_and_cluster()

. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .


# Concatenate all clustered fasta sequences to get the final gene annotations

In [100]:
# IPython shell capture: allgenes becomes a list-like with one entry per line printed by `ls`,
# i.e. one path per *.fasta file in the clustered directory.
# NOTE(review): if the glob matches nothing, the captured lines would presumably be ls's
# error text rather than an empty list — confirm before relying on len().
allgenes = ! ls /home/bay001/projects/kes_20160307/data/clustered_genes/*.fasta
len(allgenes)

12756

In [101]:
# Concatenate every per-gene clustered FASTA into one file, relabelling the record
# descriptions from 'unclustered' to 'clustered'.
full_clustered_output = '/home/bay001/projects/kes_20160307/data/clustered_genes/kestrel_blast_clustered_transcripts.fa'
records = []
for gene in allgenes:
    # Close each per-gene file as soon as it is parsed: there are thousands of them, and
    # leaving every handle open can exhaust the process's file-descriptor limit.
    # Plain "r" replaces "rU", which recent Python versions reject.
    with open(gene, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            new_record = SeqRecord(record.seq,record.id,description=record.description.replace('unclustered','clustered'))
            records.append(new_record)
# Returns (and, as the last expression, displays) the number of records written.
SeqIO.write(records,full_clustered_output,"fasta")

38289

In [None]:


# NOTE(review): this cell writes a file named "unclustered" into unclustered_genes/, yet it
# iterates `allgenes` (the *clustered* per-gene files) and relabels descriptions to
# 'clustered'. It looks like a copy of the previous cell that was only partly adapted —
# confirm which inputs/labels are intended. Inputs and labels are left as they were;
# only the file-handle leak is fixed.
full_clustered_output = '/home/bay001/projects/kes_20160307/data/unclustered_genes/kestrel_blast_unclustered_transcripts.fa'
records = []
for gene in allgenes:
    # Close each file right after parsing (plain "r" replaces "rU", rejected by recent Python).
    with open(gene, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            new_record = SeqRecord(record.seq,record.id,description=record.description.replace('unclustered','clustered'))
            records.append(new_record)
SeqIO.write(records,full_clustered_output,"fasta")

In [None]:
# Two export tables keyed by contig id: gene id + matched region, and gene id + full contig.
to_export = blast_with_seqs.loc[:, ['sseqid', 'blast_seq_only']]
to_export_full = blast_with_seqs.loc[:, ['sseqid', 'seq']]

In [None]:
# Dump the captured cd-hit-est output: for every gene, its id on a line of its own
# followed by the captured output lines.
# The context manager guarantees the file is flushed and closed even if a write fails.
with open('/home/bay001/projects/kes_20160307/data/clustered_genes/log.txt','w') as o:
    # .items() works on Python 2 and 3 (iteritems() exists only on Python 2)
    for gene, out in log.items():
        # newline added so the gene id no longer runs into the first output line
        o.write(gene + '\n')
        for line in out:
            o.write(line+'\n')

In [None]:
# Write both export tables as header-less, tab-separated files under the data directory.
for export_frame, export_name in ((to_export, 'blast_seq_only.txt'),
                                  (to_export_full, 'blast_seq_full.txt')):
    export_frame.to_csv(os.path.join(wd, export_name), sep="\t", header=None)

### Load BLAST results from blastx (actually used diamond for this analysis because it's faster) to uniref90
- Keep only hits with an e-value below the threshold (set at 1e-10)
- Sort by e-value (ascending)
- Remove duplicate contigs (contigs which map to multiple proteins, only take the lowest e-value)

In [None]:
# Same filtering as for the chicken hits: keep significant hits, then one top hit per contig.
uniref = pd.read_table(os.path.join(wd,'diamond/all.blast'),names=blast_head)
print("number of starting hits: {}".format(uniref.shape[0]))
# Reuse the threshold defined for the chicken BLAST hits so both filters cannot drift apart;
# .copy() gives an independent frame instead of a slice of the raw table.
uniref = uniref[uniref['evalue'] < evalue_threshold].copy()
print("number of filtered for e-value hits: {}".format(uniref.shape[0]))
# Stable sort: hits with equal e-values keep their order from the input file, so the
# hit kept per contig below is reproducible (the default quicksort orders ties arbitrarily).
uniref = uniref.sort_values(by='evalue', kind='mergesort')
uniref = uniref.drop_duplicates('qseqid', keep='first')
print("number of filtered hits for de-duplicated contigs: {}".format(uniref.shape[0]))
uniref.head()

### This is a silly converter since it's difficult to map existing sseqids to any database. So I need to re-map them to their RepID (gene name).
- grep '>' uniref90.fasta > uniref90.headers
- extract from the fasta header the RepID
- create translation table (UniRef90_* -> RepID)

In [None]:
# Load the FASTA header lines (produced by the grep described above) into a single 'uniref' column.
uniref_translation = pd.read_table('/home/bay001/projects/kes_20160307/data/uniref90.headers',names=['uniref'])
uniref_translation.head(2)

In [None]:
# Reduce the frame to a Series of header strings with the '>' characters removed.
# Note this removes every '>' in the header text, not only the leading FASTA marker.
uniref_translation = uniref_translation['uniref'].str.replace('>','')

In [None]:
# Two capture groups -> two columns: (0) the leading identifier token, (1) the RepID value
# at the very end of the header. Headers that do not match yield NaN in both columns.
# Raw string: '\w' and '\d' in a normal string literal are invalid escape sequences,
# which newer Python versions warn about.
uniref2gene = uniref_translation.str.extract(r'(^[\w\d-]+).+ RepID=([\w-]+)$')

In [None]:
# Move the row index into an ordinary column (in place), so the frame now carries it
# as an extra leading column next to the two extracted columns.
uniref2gene.reset_index(inplace=True)
uniref2gene.head()

In [None]:
# save this intermediate step because this takes a long time.
# Written tab-separated, without a header row and without the row index.
uniref2gene.to_csv('/home/bay001/projects/kes_20160307/data/uniref2gene.txt',sep='\t',header=None,index=None)

In [None]:
# Reload the saved translation table with integer column labels 0 and 1.
# NOTE(review): the file was saved after reset_index, so it presumably has three columns;
# with only two names given, pandas would then use the first file column as the row index,
# leaving 0 = UniRef id and 1 = RepID — confirm on the actual file.
uniref2gene = pd.read_table('/home/bay001/projects/kes_20160307/data/uniref2gene.txt',names=[0,1])
uniref2gene.head(2)

In [None]:
# make sure we capture all annotations, that the regex expression is correct. This list should be empty.
# Shows every row that has a missing value in any column, i.e. headers the regex failed to parse.
uniref2gene[uniref2gene.isnull().any(axis=1)]

In [None]:
# Left-join the translation table onto the hits: hit 'sseqid' against translation column 0.
uniref2blast = uniref.merge(uniref2gene, how='left', left_on='sseqid', right_on=0)
uniref2blast.head()

In [None]:
# contig id -> UniRef id lookup (index = qseqid, single column 'uniref')
contig2uniref = uniref2blast.loc[:,['qseqid','sseqid']]
contig2uniref = contig2uniref.set_index('qseqid')
contig2uniref = contig2uniref.rename(columns={'sseqid':'uniref'})
print(contig2uniref.shape)
# Sanity check: both printed shapes must be identical, i.e. every contig appears exactly once.
# The former bare drop_duplicates() call discarded its result, so the two shapes could never
# differ; it also compared only the 'uniref' column and ignored the contig id in the index.
print(contig2uniref[~contig2uniref.index.duplicated()].shape)
contig2uniref.head()

In [None]:
# more sanity check. Make sure this is mapping to just one
# Displays every merged row whose translation-table id (column 0) is this one UniRef entry.
uniref2blast[uniref2blast[0]=='UniRef90_Q9H3D4']

# Append the annotations to our read counts table.
- merge chicken ensembl blast hits
- merge uniref90 blast hits

In [None]:
# Load the read-counts table; its first column becomes the row index.
# NOTE(review): `counts` (the path to the counts file) is not defined in any cell visible in
# this notebook, so this cell fails on a fresh kernel — define it before running.
countsdf = pd.read_table(counts,index_col=0)
print(countsdf.shape[0])
countsdf.head(2)

In [None]:
# Annotate every contig in the counts table with its chicken Ensembl hit (NaN when there is none).
# Only the subject id is merged, under the column name 'ensembl' that the following cells
# expect. contig2chicken itself has no 'ensembl' column, and merging it whole would also drag
# the alignment coordinates into the counts table.
merged_ensembl = pd.merge(countsdf,
                          contig2chicken[['sseqid']].rename(columns={'sseqid':'ensembl'}),
                          how='left',left_index=True,right_index=True)
merged_ensembl.head(2)

In [None]:
# contig2chicken.ix['EC-4AK111_TAGCTT_R1_(paired)_contig_1003-0']
# Shape after dropping rows whose column values are all identical (the index is not compared).
contig2chicken.drop_duplicates().shape

In [None]:
# Basic statistics after annotating with the Ensembl BLAST hits.
# A contig counts as "missing" when any column of its row is null.
n_total = merged_ensembl.shape[0]
n_unannotated = merged_ensembl[merged_ensembl.isnull().any(axis=1)].shape[0]
print("number of total contigs: {}".format(n_total))
print("number of annotated contigs: {}".format(n_total - n_unannotated))
print("number of still missing annotated contigs: {}".format(n_unannotated))

In [None]:
# Add the UniRef hit of each contig next to the Ensembl annotation (left join on contig id).
merged_ensembl_uniref = merged_ensembl.merge(contig2uniref, how='left', left_index=True, right_index=True)
merged_ensembl_uniref.head(2)

In [None]:
# Final annotation per contig: the Ensembl id when there is one, otherwise the UniRef id.
# Assigned in one step instead of fillna(inplace=True) on a column selection: that chained
# in-place form acts on a temporary object under newer pandas and leaves the frame unchanged.
merged_ensembl_uniref['annotation'] = merged_ensembl_uniref['ensembl'].fillna(merged_ensembl_uniref['uniref'])
merged_ensembl_uniref.head()

In [None]:
# Print some basic stats after annotating with ensembl and uniref blast hits
print("number of total contigs: {}".format(merged_ensembl_uniref.shape[0]))
print("number of annotated contigs: {}".format(merged_ensembl_uniref.dropna(subset=['annotation']).shape[0]))
# Plain substrings instead of '(UniRef)' / '(ENSG)': the parentheses were regex capture
# groups, which match the same rows but make pandas emit a "match groups" warning.
# na=False counts un-annotated (NaN) contigs as non-matches, replacing the `== True` idiom.
print("number of uniref annotations not ensembl: {}".format(merged_ensembl_uniref['annotation'].str.contains('UniRef', na=False).sum()))
print("number of ensembl annotations: {}".format(merged_ensembl_uniref['annotation'].str.contains('ENSG', na=False).sum()))

In [None]:
# translate counts into an annotated counts table.
merged_ensembl_uniref.to_csv('/home/bay001/projects/kes_20160307/permanent_data/9-27-2016/annotations.txt',sep='\t')
new_counts_df = merged_ensembl_uniref.reset_index()
# Contigs without any annotation fall back to their own contig id.
# Assigned back explicitly: fillna(inplace=True) on a column selection acts on a temporary
# object under newer pandas and would leave the frame unchanged.
new_counts_df['annotation'] = new_counts_df['annotation'].fillna(new_counts_df['gene_id'])
# The id columns are now redundant with 'annotation'.
new_counts_df = new_counts_df.drop(['ensembl', 'uniref', 'gene_id'], axis=1)
# Move 'annotation' to the front, keeping the remaining columns in their order.
cols = new_counts_df.columns.tolist()
cols.insert(0,cols.pop(cols.index('annotation')))
new_counts_df = new_counts_df.reindex(columns = cols)
new_counts_df.to_csv('/home/bay001/projects/kes_20160307/permanent_data/9-27-2016/counts.RSEM.txt.annotated',sep='\t',
                    index=None)
new_counts_df.head()


In [None]:
# give the counts table gene names
# Export the UniRef ids (one per line) of all contigs that have one.
# header=False is explicit because the default for Series.to_csv differs between pandas
# versions; without it newer versions would write a 'uniref' header line into the id list.
merged_ensembl_uniref['uniref'].dropna().to_csv('/home/bay001/projects/kes_20160307/data/uniref_ids.txt',index=None,header=False)

In [None]:
# Load the Ensembl-id-to-gene-name table; the cells below use its 'Ensembl Gene ID' and
# 'Associated Gene Name' columns.
ensembl2name = pd.read_table('/home/bay001/projects/kes_20160307/permanent_data/9-27-2016/ensembl_to_genename.txt')
ensembl2name.head(2)

In [None]:
# Replace Ensembl ids by gene names where a name is known; every other annotation is kept as is.
new_counts_df2 = pd.merge(new_counts_df,ensembl2name,how='left',left_on='annotation',right_on='Ensembl Gene ID')
# Assigned back explicitly: fillna(inplace=True) on a column selection acts on a temporary
# object under newer pandas and would leave the frame unchanged.
new_counts_df2['Associated Gene Name'] = new_counts_df2['Associated Gene Name'].fillna(new_counts_df2['annotation'])
new_counts_df2 = new_counts_df2.drop(['Ensembl Gene ID', 'annotation'], axis=1)
# Move the gene-name column to the front, keeping the remaining columns in their order.
cols = new_counts_df2.columns.tolist()
cols.insert(0,cols.pop(cols.index('Associated Gene Name')))
new_counts_df2 = new_counts_df2.reindex(columns = cols)
# new_counts_df2.rename(columns={'Associated Gene Name':'annotation'},inplace=True)
new_counts_df2.to_csv('/home/bay001/projects/kes_20160307/permanent_data/9-27-2016/counts.RSEM.txt.gene-name.annotated',sep='\t',
                    index=None)
new_counts_df2.head()

# Create a new gene to trans map with the blast hits

In [None]:
# Every contig of the counts table with an empty 'gene', used as the lowest-priority fallback.
allcontigs = pd.DataFrame(countsdf.reset_index()['gene_id'])
allcontigs = allcontigs.rename(columns = {'gene_id':'qseqid'})
allcontigs['gene'] = np.nan

# Stack the three sources in priority order: chicken Ensembl hit, UniRef hit, no hit.
# Only the subject id of the chicken hits is taken and named 'gene'. contig2chicken has no
# 'ensembl' column, so the former rename was a no-op that left these rows without a gene
# and leaked the alignment columns into the map.
gene_to_trans = pd.concat([contig2chicken[['sseqid']].rename(columns={'sseqid':'gene'}),
                           contig2uniref.rename(columns={'uniref':'gene'}),
                           allcontigs.set_index('qseqid')])

# drop_duplicates keeps the first occurrence, i.e. the highest-priority source per contig.
gene_to_trans = gene_to_trans.reset_index().drop_duplicates(subset='qseqid')
gene_to_trans.head()

In [None]:
# Put the 'gene' column first; all other columns keep their relative order.
cols = gene_to_trans.columns.tolist()
gene_position = cols.index('gene')
cols = [cols[gene_position]] + cols[:gene_position] + cols[gene_position + 1:]
gene_to_trans = gene_to_trans.reindex(columns = cols)
gene_to_trans.head()

In [None]:
# Uh oh. Looks like the gene to trans map contains more transcripts than exist in the current assembly.
# Compare the number of rows in the map with the number of header lines of the assembly.
print(gene_to_trans.shape[0])
! wc -l /home/bay001/projects/kes_20160307/data/kestrel.headers

# Creating new gene-to-trans-map
##### There are more contigs in our map than exist in the filtered assembly. This is because I used the blast hits (which will include some filtered vectors/viruses/bacteria/etc.), so I'll need to re-make the gene-to-trans map without them.
- import headers from the assembly
- join assembly headers with existing gene-to-trans-map and remove the contigs that dropped out

In [None]:
# NOTE: this rebinds the notebook-level name X, which an earlier cell used for the
# gene-indexed BLAST table.
X = pd.read_table('/home/bay001/projects/kes_20160307/data/kestrel.headers',names=['headers'])
X['qseqid'] = X['headers'].str.replace('>','')
del X['headers']
# Left join from the assembly headers: contigs absent from the assembly drop out of the map.
Y = pd.merge(X,gene_to_trans,how='left', on='qseqid')
# Contigs without any annotation become their own gene.
# Assigned back explicitly: fillna(inplace=True) on a column selection acts on a temporary
# object under newer pandas and would leave the frame unchanged.
Y['gene'] = Y['gene'].fillna(Y['qseqid'])
# reorder gene-to-trans map
cols = Y.columns.tolist()
cols.insert(0,cols.pop(cols.index('gene')))
Y = Y.reindex(columns = cols)
Y.to_csv('/home/bay001/projects/kes_20160307/permanent_data/9-27-2016/gene_to_trans.map',
                    sep='\t',header=None,index=None)

In [None]:
Y.shape # the row count should now equal the number of contigs (transcripts) in the assembly headers

In [None]:
# How many distinct gene names are captured?
new_counts_df3 = new_counts_df.merge(ensembl2name, how='left', left_on='annotation', right_on='Ensembl Gene ID')
new_counts_df3['Associated Gene Name'].dropna().nunique()

# Create test counts matrix for deseq2 (while we wait for RSEM to finish)

In [None]:
# Test matrix: take the rows whose annotation already occurred earlier in the table,
# sort them by annotation and keep one row per annotation.
testdf = new_counts_df3[new_counts_df3['annotation'].duplicated()==True]
testdf = testdf.sort_values('annotation')
testdf = testdf.drop_duplicates('annotation')
# The gene-name lookup columns are not part of the counts matrix.
testdf = testdf.drop(['Ensembl Gene ID', 'Associated Gene Name'], axis=1)
testdf.to_csv('/home/bay001/projects/kes_20160307/data/counts.TEST.txt',sep='\t',index=None)

# load up new countsdf

In [None]:
# Load the new counts table; 'gene_id' stays an ordinary column (no index_col is given).
cdf = pd.read_table('/home/bay001/projects/kes_20160307/permanent_data/9-27-2016/counts.RSEM.txt')

In [None]:
# Shape of the subset of rows whose gene_id contains 'ENSG'.
cdf[cdf['gene_id'].str.contains('ENSG')].shape