In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
wd = '/home/bay001/projects/kes_20160307/data/'
blast_head = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch',
              'gapopen', 'qstart', 'qend', 'sstart', 'send',
              'evalue', 'bitscore']
counts = '/home/bay001/projects/kes_20160307/permanent_data/9-27-2016/counts.RSEM.txt'
! wc -l $counts

239838 /home/bay001/projects/kes_20160307/permanent_data/9-27-2016/counts.RSEM.txt


### Load BLAST results from blastx to chicken ensembl protein
- Filter out e-value less than threshold (set at 1e-10)
- Sort by e-value (ascending)
- Remove duplicate contigs (contigs which map to multiple proteins, only take the lowest e-value)

In [3]:
evalue_threshold = 1e-10
blast = pd.read_table(os.path.join(wd,'blast/chicken.blastx'),names=blast_head)
print("number of starting hits: {}".format(blast.shape[0]))
blast = blast[blast['evalue'] < evalue_threshold]
print("number of filtered for e-value hits: {}".format(blast.shape[0]))
blast.sort_values(by='evalue',inplace=True)
blast.drop_duplicates('qseqid',inplace=True, keep='first')
print("number of filtered hits for de-duplicated contigs: {}".format(blast.shape[0]))
blast.set_index('qseqid',inplace=True)
blast.head()

number of starting hits: 331175
number of filtered for e-value hits: 83725
number of filtered hits for de-duplicated contigs: 55116


Unnamed: 0_level_0,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
qseqid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
unmapped-49-contig_list_contig_207806-0,ENSGALG00000028975,84.913,517,74,2,1549,2,375,888,0.0,853.0
unmapped-49-contig_list_contig_48921-0,ENSGALG00000002638,92.483,439,33,0,6438,5122,1,439,0.0,815.0
unmapped-49-contig_list_contig_33476-0,ENSGALG00000002798,98.047,717,14,0,3247,1097,16,732,0.0,1429.0
unmapped-49-contig_list_contig_67081-0,ENSGALG00000005123,99.714,350,1,0,2639,1590,1,350,0.0,697.0
EC-4AK111_TAGCTT_R1_(paired)_contig_5821-0,ENSGALG00000006425,94.286,315,10,2,54,992,3,311,0.0,559.0


### Reformat BLAST hits into a translation table (contig -> ensembl id)

In [4]:
contig2chicken = blast.loc[:,['sseqid']]
contig2chicken.rename(columns={'sseqid':'ensembl'},inplace=True)
contig2chicken.head()

Unnamed: 0_level_0,ensembl
qseqid,Unnamed: 1_level_1
unmapped-49-contig_list_contig_207806-0,ENSGALG00000028975
unmapped-49-contig_list_contig_48921-0,ENSGALG00000002638
unmapped-49-contig_list_contig_33476-0,ENSGALG00000002798
unmapped-49-contig_list_contig_67081-0,ENSGALG00000005123
EC-4AK111_TAGCTT_R1_(paired)_contig_5821-0,ENSGALG00000006425


### Load BLAST results from blastx (actually used diamond for this analysis because it's faster) to uniref90
- Filter out e-value less than threshold (set at 1e-10)
- Sort by e-value (ascending)
- Remove duplicate contigs (contigs which map to multiple proteins, only take the lowest e-value)

In [5]:
uniref = pd.read_table(os.path.join(wd,'diamond/all.blast'),names=blast_head)
print("number of starting hits: {}".format(uniref.shape[0]))
uniref = uniref[uniref['evalue'] < 1e-10]
print("number of filtered for e-value hits: {}".format(uniref.shape[0]))
uniref.sort_values(by='evalue',inplace=True)
uniref.drop_duplicates('qseqid',inplace=True, keep='first')
print("number of filtered hits for de-duplicated contigs: {}".format(uniref.shape[0]))
uniref.head()

number of starting hits: 3001772
number of filtered for e-value hits: 1944271
number of filtered hits for de-duplicated contigs: 60670


Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
1363364,unmapped-49-contig_list_contig_64936-0,UniRef90_G1KX46,74.5,768,190,3,265,2565,1,763,0.0,1107.0
928192,unmapped-49-contig_list_contig_6250-0,UniRef90_A0A091G0Z8,92.1,1077,76,2,499,3714,1,1073,0.0,1980.7
75828,unmapped-49-contig_list_contig_29731-0,UniRef90_UPI0005226B71,82.5,1056,185,0,5095,1928,1233,2288,0.0,1697.2
1255090,EC-4AK111_TAGCTT_R1_(paired)_contig_6330-0,UniRef90_V8NBR3,78.4,828,177,2,2712,232,23,849,0.0,1323.1
1254783,unmapped-49-contig_list_contig_56574-0,UniRef90_UPI000642AD0D,71.4,767,199,6,2794,539,394,1155,0.0,1077.0


### This is a silly converter since it's difficult to map existing sseqids to any database. So I need to re-map them to their RepID (gene name).
- grep '>' uniref90.fasta > uniref90.headers
- extract from the fasta header the RepID
- create translation table (UniRef90_* -> RepID)

In [None]:
uniref_translation = pd.read_table('/home/bay001/projects/kes_20160307/data/uniref90.headers',names=['uniref'])
uniref_translation.head(2)

In [None]:
uniref_translation = uniref_translation['uniref'].str.replace('>','')

In [None]:
uniref2gene = uniref_translation.str.extract('(^[\w\d-]+).+ RepID=([\w-]+)$')

In [None]:
uniref2gene.reset_index(inplace=True)
uniref2gene.head()

In [None]:
# save this intermediate step because this takes a long time.
uniref2gene.to_csv('/home/bay001/projects/kes_20160307/data/uniref2gene.txt',sep='\t',header=None,index=None)

In [6]:
uniref2gene = pd.read_table('/home/bay001/projects/kes_20160307/data/uniref2gene.txt',names=[0,1])
uniref2gene.head(2)

Unnamed: 0,0,1
0,UniRef90_Q6GZX4,001R_FRG3G
1,UniRef90_Q6GZX3,002L_FRG3G


In [7]:
# make sure we capture all annotations, that the regex expression is correct. This list should be empty.
uniref2gene[uniref2gene.isnull().any(axis=1)]

Unnamed: 0,0,1


In [8]:
uniref2blast = pd.merge(uniref,uniref2gene,how='left',left_on='sseqid',right_on=0)
uniref2blast.head()

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,0,1
0,unmapped-49-contig_list_contig_64936-0,UniRef90_G1KX46,74.5,768,190,3,265,2565,1,763,0.0,1107.0,UniRef90_G1KX46,G1KX46_ANOCA
1,unmapped-49-contig_list_contig_6250-0,UniRef90_A0A091G0Z8,92.1,1077,76,2,499,3714,1,1073,0.0,1980.7,UniRef90_A0A091G0Z8,A0A091G0Z8_9AVES
2,unmapped-49-contig_list_contig_29731-0,UniRef90_UPI0005226B71,82.5,1056,185,0,5095,1928,1233,2288,0.0,1697.2,UniRef90_UPI0005226B71,UPI0005226B71
3,EC-4AK111_TAGCTT_R1_(paired)_contig_6330-0,UniRef90_V8NBR3,78.4,828,177,2,2712,232,23,849,0.0,1323.1,UniRef90_V8NBR3,V8NBR3_OPHHA
4,unmapped-49-contig_list_contig_56574-0,UniRef90_UPI000642AD0D,71.4,767,199,6,2794,539,394,1155,0.0,1077.0,UniRef90_UPI000642AD0D,UPI000642AD0D


In [9]:
contig2uniref = uniref2blast.loc[:,['qseqid','sseqid']]
contig2uniref.set_index('qseqid',inplace=True)
contig2uniref.rename(columns={'sseqid':'uniref'},inplace=True)
print(contig2uniref.shape)
contig2uniref.drop_duplicates() # Sanity check.
print(contig2uniref.shape)
contig2uniref.head()

(60670, 1)
(60670, 1)


Unnamed: 0_level_0,uniref
qseqid,Unnamed: 1_level_1
unmapped-49-contig_list_contig_64936-0,UniRef90_G1KX46
unmapped-49-contig_list_contig_6250-0,UniRef90_A0A091G0Z8
unmapped-49-contig_list_contig_29731-0,UniRef90_UPI0005226B71
EC-4AK111_TAGCTT_R1_(paired)_contig_6330-0,UniRef90_V8NBR3
unmapped-49-contig_list_contig_56574-0,UniRef90_UPI000642AD0D


In [10]:
uniref2blast[uniref2blast[0]=='UniRef90_Q9H3D4'] # more sanity check. Make sure this is mapping to just one

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,0,1
10,unmapped-49-contig_list_contig_124317-0,UniRef90_Q9H3D4,93.0,572,40,0,2143,428,109,680,0.0,1083.2,UniRef90_Q9H3D4,P63_HUMAN


# Append the annotations to our read counts table.
- merge chicken ensembl blast hits
- merge uniref90 blast hits

In [18]:
countsdf = pd.read_table(counts,index_col=0)
print(countsdf.shape[0])
countsdf.head(2)

239837


Unnamed: 0_level_0,EC-1AK228_CAGATC,EC-2AK546_ACTTGA,EC-3AK436_GATCAG,EC-4AK111_TAGCTT,EC-5AK453_GGCTAC,EC-6AK100_CTTGTA,EC-7AK501_AGTCAA,EC-8AK511_AGTTCC,EC-9AK123_ATGTCA,EC-10AK244_CCGTCC,EC-11AK330_GTCCGC,EC-12AK422_GTGAAA,EC-13AK430_GTGGCC,EC-14AK418_GTTTCG,EC-15AK219_CGTACG,EC-16AK327_GAGTGG,EC-17AK506_ACTGAT,EC-18AK105_ATTCCT
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
EC-4AK111_TAGCTT_R1_(paired)_contig_1003-0,858,916,863,2406,978,1103,805,919,801,998,960,788,774,1143,1523,1334,1335,971
EC-4AK111_TAGCTT_R1_(paired)_contig_10031-0,404,390,596,754,585,348,422,460,602,633,615,388,531,447,563,549,537,434


In [12]:
merged_ensembl = pd.merge(countsdf,contig2chicken,how='left',left_index=True,right_index=True)
merged_ensembl.head(2)

Unnamed: 0_level_0,EC-1AK228_CAGATC,EC-2AK546_ACTTGA,EC-3AK436_GATCAG,EC-4AK111_TAGCTT,EC-5AK453_GGCTAC,EC-6AK100_CTTGTA,EC-7AK501_AGTCAA,EC-8AK511_AGTTCC,EC-9AK123_ATGTCA,EC-10AK244_CCGTCC,EC-11AK330_GTCCGC,EC-12AK422_GTGAAA,EC-13AK430_GTGGCC,EC-14AK418_GTTTCG,EC-15AK219_CGTACG,EC-16AK327_GAGTGG,EC-17AK506_ACTGAT,EC-18AK105_ATTCCT,ensembl
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
EC-4AK111_TAGCTT_R1_(paired)_contig_1003-0,858,916,863,2406,978,1103,805,919,801,998,960,788,774,1143,1523,1334,1335,971,
EC-4AK111_TAGCTT_R1_(paired)_contig_10031-0,404,390,596,754,585,348,422,460,602,633,615,388,531,447,563,549,537,434,


In [22]:
# contig2chicken.ix['EC-4AK111_TAGCTT_R1_(paired)_contig_1003-0']
contig2chicken.drop_duplicates().shape

(12756, 1)

In [13]:
# Print some basic stats after annotating with ensembl blast hits
print("number of total contigs: {}".format(merged_ensembl.shape[0]))
print("number of annotated contigs: {}".format(merged_ensembl.shape[0] - 
                                               merged_ensembl[merged_ensembl.isnull().any(axis=1)].shape[0]))
print("number of still missing annotated contigs: {}".format(merged_ensembl[merged_ensembl.isnull().any(axis=1)].shape[0]))

number of total contigs: 239837
number of annotated contigs: 0
number of still missing annotated contigs: 239837


In [None]:
merged_ensembl_uniref = pd.merge(merged_ensembl,contig2uniref,how='left',left_index=True,right_index=True)
merged_ensembl_uniref.head(2)

In [None]:
merged_ensembl_uniref['annotation'] = merged_ensembl_uniref['ensembl']
merged_ensembl_uniref['annotation'].fillna(merged_ensembl_uniref['uniref'],inplace=True)
merged_ensembl_uniref.head()

In [None]:
# Print some basic stats after annotating with ensembl blast hits
print("number of total contigs: {}".format(merged_ensembl_uniref.shape[0]))
print("number of annotated contigs: {}".format(merged_ensembl_uniref.dropna(subset=['annotation']).shape[0]))
print("number of uniref annotations not ensembl: {}".format(merged_ensembl_uniref[merged_ensembl_uniref['annotation'].str.contains('(UniRef)')==True].shape[0]))
print("number of ensembl annotations: {}".format(merged_ensembl_uniref[merged_ensembl_uniref['annotation'].str.contains('(ENSG)')==True].shape[0]))

In [None]:
# translate counts into an annotated counts table.
merged_ensembl_uniref.to_csv('/home/bay001/projects/kes_20160307/permanent_data/9-27-2016/annotations.txt',sep='\t')
new_counts_df = merged_ensembl_uniref.reset_index()
new_counts_df['annotation'].fillna(new_counts_df['gene_id'],inplace=True)
del new_counts_df['ensembl']
del new_counts_df['uniref']
del new_counts_df['gene_id']
cols = new_counts_df.columns.tolist()
cols.insert(0,cols.pop(cols.index('annotation')))
new_counts_df = new_counts_df.reindex(columns = cols)
new_counts_df.to_csv('/home/bay001/projects/kes_20160307/permanent_data/9-27-2016/counts.RSEM.txt.annotated',sep='\t',
                    index=None)
new_counts_df.head()


In [None]:
# give the counts table gene names
merged_ensembl_uniref['uniref'].dropna().to_csv('/home/bay001/projects/kes_20160307/data/uniref_ids.txt',index=None)

In [None]:
ensembl2name = pd.read_table('/home/bay001/projects/kes_20160307/permanent_data/9-27-2016/ensembl_to_genename.txt')
ensembl2name.head(2)

In [None]:
new_counts_df2 = pd.merge(new_counts_df,ensembl2name,how='left',left_on='annotation',right_on='Ensembl Gene ID')
new_counts_df2['Associated Gene Name'].fillna(new_counts_df2['annotation'],inplace=True)
del new_counts_df2['Ensembl Gene ID']
del new_counts_df2['annotation']
cols = new_counts_df2.columns.tolist()
cols.insert(0,cols.pop(cols.index('Associated Gene Name')))
new_counts_df2 = new_counts_df2.reindex(columns = cols)
# new_counts_df2.rename(columns={'Associated Gene Name':'annotation'},inplace=True)
new_counts_df2.to_csv('/home/bay001/projects/kes_20160307/permanent_data/9-27-2016/counts.RSEM.txt.gene-name.annotated',sep='\t',
                    index=None)
new_counts_df2.head()

# Create a new gene to trans map with the blast hits

In [None]:
allcontigs = pd.DataFrame(countsdf.reset_index()['gene_id'])
allcontigs.rename(columns = {'gene_id':'qseqid'},inplace=True)
allcontigs['gene'] = np.nan

gene_to_trans = pd.concat([contig2chicken.rename(columns={'ensembl':'gene'}),
                           contig2uniref.rename(columns={'uniref':'gene'}),
                           allcontigs.set_index('qseqid')])

gene_to_trans = gene_to_trans.reset_index().drop_duplicates(subset='qseqid')
gene_to_trans.head()

In [None]:
# reorder gene-to-trans map
cols = gene_to_trans.columns.tolist()
cols.insert(0,cols.pop(cols.index('gene')))
gene_to_trans = gene_to_trans.reindex(columns = cols)
gene_to_trans.head()

In [None]:
# Uh oh. Looks like the gene to trans map contains more transcripts that don't exist in the current assembly.
print(gene_to_trans.shape[0])
! wc -l /home/bay001/projects/kes_20160307/data/kestrel.headers

# Creating new gene-to-trans-map
##### There are more contigs in our map than what exists in the filtered assembly. This is due to the fact that I used the blast hits (which will include some filtered vectors/viruses/bacteria/etc., so I'll need to re-make the trans-to-gene map without this. 
- import headers from the assembly
- join assembly headers with existing gene-to-trans-map and remove the contigs that dropped out

In [None]:
X = pd.read_table('/home/bay001/projects/kes_20160307/data/kestrel.headers',names=['headers'])
X['qseqid'] = X['headers'].str.replace('>','')
del X['headers']
Y = pd.merge(X,gene_to_trans,how='left', on='qseqid')
Y['gene'].fillna(Y['qseqid'],inplace=True)
# reorder gene-to-trans map
cols = Y.columns.tolist()
cols.insert(0,cols.pop(cols.index('gene')))
Y = Y.reindex(columns = cols)
Y.to_csv('/home/bay001/projects/kes_20160307/permanent_data/9-27-2016/gene_to_trans.map',
                    sep='\t',header=None,index=None)

In [None]:
Y.shape # now contains the exact number of contigs (transcripts) to corresponding genes. 

In [None]:
# How many gene names are captured?
new_counts_df3 = pd.merge(new_counts_df,ensembl2name,how='left',left_on='annotation',right_on='Ensembl Gene ID')
len(set(new_counts_df3['Associated Gene Name'].dropna()))

# Create test counts matrix for deseq2 (while we wait for RSEM to finish)

In [None]:
testdf = new_counts_df3[new_counts_df3['annotation'].duplicated()==True].sort_values('annotation').drop_duplicates('annotation')
del testdf['Ensembl Gene ID']
del testdf['Associated Gene Name']
testdf.to_csv('/home/bay001/projects/kes_20160307/data/counts.TEST.txt',sep='\t',index=None)

# load up new countsdf

In [23]:
cdf = pd.read_table('/home/bay001/projects/kes_20160307/permanent_data/9-27-2016/counts.RSEM.txt')

In [26]:
cdf[cdf['gene_id'].str.contains('ENSG')].shape

(12756, 19)