In [134]:
import re
import os
import numpy as np
import pandas as pd
import itertools
from __future__ import print_function, division
from IPython.display import display

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context("notebook", font_scale=1)
matplotlib.style.use('ggplot')
%matplotlib inline 

import warnings
warnings.filterwarnings('ignore')

In [366]:
# Expand the degenerate motif G[E/D]R[L/M]R[V/F]T into every concrete sequence.
# Fixed positions are single letters; variable positions are lists of alternatives.
m = ['G',['E','D'],'R',['L','M'],'R',['V','F'],'T']
mp = [position for position in m if isinstance(position, list)]  # variable positions only

# Treat each fixed letter as a one-element choice so a single product() call
# walks the whole pattern; the ordering matches product(mp[0], mp[1], mp[2]).
motifs = [
    ''.join(combo)
    for combo in itertools.product(
        *[position if isinstance(position, list) else [position] for position in m]
    )
]

# Write out a FASTA file of target sequences (each header line is the motif itself).
# Opening with 'w' truncates any previous file, so re-running the cell never
# appends duplicate records. The earlier shell version cleared
# 'pblast_targets.fasta' but appended to 'blast_targets.fasta'; one name is
# used for both steps here.
with open('pblast_targets.fasta', 'w') as fasta_out:
    for motif in motifs:
        fasta_out.write('>' + motif + '\n' + motif + '\n')

## Fetch pBLAST against Agrobacterium

In [75]:
# TODO: Get correct query against NCBIWWW to work instead of manually downloading the files
# from Bio.Blast import NCBIWWW 
# result_handle = NCBIWWW.qblast("blastp", "Microbial proteins from nr", motifs[0])
# blast_file = open("Blast_Results/"+motifs[0]+".xml", "w")  

# blast_file.write(result_handle.read()) 
# blast_file.close()# tidy up
# result_handle.close()

In [84]:
# Currently uses only the first 100 BLAST hits per motif, downloaded manually as
# FASTA files (the Blast_Results/<motif>.fasta paths read below) from:
# https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastSearch&PROG_DEFAULTS=on&PROG_DEF=blastp&BLAST_SPEC=MicrobialGenomes_1435057&DB_GROUP=AllMG

## Construct a dataframe of all the motifs

In [360]:
from Bio import SeqIO

# Build one row per BLAST hit across all motif result files.
# Rows are collected as plain lists and turned into a DataFrame once, instead of
# creating and concatenating a one-row DataFrame per sequence.
records = []

for motif in motifs:
    # `with` closes the handle even if parsing fails part-way through.
    with open('Blast_Results/'+motif+'.fasta') as fasta_handle:
        # No alphabet argument: only the plain sequence string is kept, and
        # Bio.Alphabet is not available in current Biopython releases.
        for fasta in SeqIO.parse(fasta_handle, 'fasta'):
            # Clean up the description: keep only the text after the record id.
            # If the id is unexpectedly absent, keep the description unchanged
            # rather than raising IndexError.
            pieces = fasta.description.split(fasta.id, 1)
            desc = pieces[1] if len(pieces) > 1 else fasta.description
            # str(seq) replaces the removed Seq.tostring().
            records.append([motif, fasta.id, desc, str(fasta.seq)])

df = pd.DataFrame(records, columns=['motif','name', 'desc', 'seq'])

df.sample(10)

Unnamed: 0,motif,name,desc,seq
134,GERLRFT,WP_038495509.1,MULTISPECIES: hypothetical protein [Rhizobium...,MNIIRSTLIASAAFAASASIAVSAPVLGLTGDKTLVMFDTNKPAVT...
738,GDRMRFT,WP_038496390.1,MULTISPECIES: transcriptional regulator [Rhiz...,MLPKDRARNASEQVSSAIFGAATGRGPWRDVCSSLTQAFPGSYAAL...
585,GDRLRFT,WP_003516797.1,MULTISPECIES: ABC transporter permease [Rhizo...,MLRYILKRILVMIPTLILISMLVFTIIELPPGDYFESYVAELRAMG...
130,GERLRFT,WP_038491607.1,MULTISPECIES: DNA repair protein RecN [Rhizob...,MLVQLSIRDIVLIERLDLGFEAGLSVLTGETGAGKSILLDSLSLAL...
573,GDRLRFT,WP_038491430.1,MULTISPECIES: ATPase AAA [Rhizobium/Agrobacte...,MSDDLFAPQVPVEVANRRPLADRLRPKTLAEVTGQPHLTGEEGVLR...
772,GDRMRFT,WP_003511718.1,MULTISPECIES: BA14K family protein [Rhizobium...,MMNFRTTSVATAVVLFLTSFTPSQAFQAPVPMAKPAISTENVVPVQ...
631,GDRMRVT,WP_038490565.1,MULTISPECIES: glycosyl transferase family 1 [...,MITITDLSARIAGRLLLDHASVTLPAGVKVGLVGRNGAGKSTLFRV...
263,GERMRVT,WP_038490464.1,MULTISPECIES: MOSC domain-containing protein ...,MRVTELNIYPLKSARGIVLSKSDVSAEGLPGDRRAMLTDPSGHFIT...
770,GDRMRFT,WP_003493096.1,MULTISPECIES: uracil phosphoribosyltransferas...,MDGVTVIEHPLVRHKLTIMRKKETSTAGFRRLLREISTLLCYEVTR...
133,GERLRFT,WP_051489688.1,hypothetical protein [Agrobacterium tumefacie...,MTLYLSNSWQVGTSYGEFFYGDRYGVVRDDVFQLGGSAYGGLGGVR...


In [361]:
def matchMotifs(m, seq, min_pos=.97):
    """Find full or truncated occurrences of a 7-letter motif near the end of ``seq``.

    The three central letters ``m[2:5]`` are required; the two letters on each
    side are individually optional, e.g. ``'GERMRFT'`` -> ``(G?E?)(RMR)(F?T?)``.

    Parameters
    ----------
    m : str
        Motif; positions 0..6 are read (shorter strings raise IndexError).
    seq : str
        Sequence to search.
    min_pos : float, optional
        A hit is kept only when ``start / len(seq)`` is strictly greater than
        this value. Defaults to 0.97.

    Returns
    -------
    list of (float, str)
        ``(relative_start, matched_text)`` for each kept hit, in left-to-right
        order. Always a real list, so callers can take ``len()`` of it.
    """
    # re.escape is a no-op for plain letters but keeps odd characters literal.
    head = re.escape(m[0]) + '?' + re.escape(m[1]) + '?'
    core = re.escape(m[2:5])
    tail = re.escape(m[5]) + '?' + re.escape(m[6]) + '?'
    regex = '(' + head + ')(' + core + ')(' + tail + ')'

    # float() forces true division even without `from __future__ import division`.
    seq_len = float(len(seq))
    hits = []
    for hit in re.finditer(regex, seq):
        rel_start = hit.start(0) / seq_len
        if rel_start > min_pos:
            hits.append((rel_start, hit.group()))
    return hits
    
# Attach the hits to every row. list() keeps this working whether matchMotifs
# hands back a list or a lazy iterator.
df['matches'] = df.apply(lambda x: list(matchMotifs(x.motif,x.seq)),axis=1)

# Keep only the rows with at least one hit. A boolean Series mask is used
# because a map() object is a one-shot iterator on Python 3 and cannot index a
# DataFrame. The non-inplace reset_index returns a fresh frame instead of
# mutating a slice of df.
has_hits = df['matches'].map(len) > 0
dfMotif = df[has_hits].reset_index(drop=True)

display(dfMotif.sample(5))

Unnamed: 0,motif,name,desc,seq,matches
5,GERLRFT,WP_038494935.1,MULTISPECIES: agrobactin synthetase subunit F...,MAADNSRMLARRIEQTFDNADEAPRSGSQPSQYEERVWFQQFQDPD...,"[(0.971608832808, RLRT)]"
1,GERLRVT,WP_038494935.1,MULTISPECIES: agrobactin synthetase subunit F...,MAADNSRMLARRIEQTFDNADEAPRSGSQPSQYEERVWFQQFQDPD...,"[(0.971608832808, RLRT)]"
14,GDRLRVT,WP_038494935.1,MULTISPECIES: agrobactin synthetase subunit F...,MAADNSRMLARRIEQTFDNADEAPRSGSQPSQYEERVWFQQFQDPD...,"[(0.970557308097, DRLRT)]"
3,GERLRVT,WP_038493383.1,hypothetical protein [Agrobacterium tumefacie...,MRTISGNEARALIESQLAAHGHGVFSVLAQYRRDNAVAAWHETIRA...,"[(0.978473581213, ERLR)]"
13,GERMRFT,WP_038494278.1,MULTISPECIES: glutamate synthase subunit alph...,MTNKLQADNVAATLTTDCPTAAAAAPVSAKGRFAGGLPEKQGLYDP...,"[(0.993674889311, ERMR)]"


In [367]:
def sortPos(x):
    """Return the hit with the largest relative position.

    Parameters
    ----------
    x : iterable of (position, text) tuples
        Candidate hits; compared on element 0.

    Returns
    -------
    tuple
        The whole ``(position, text)`` tuple with the greatest position. On
        ties the earliest such hit in ``x`` wins.

    Raises
    ------
    IndexError
        If ``x`` is empty.
    """
    hits = list(x)
    if not hits:
        raise IndexError('sortPos() needs at least one match')
    # One pass with max() instead of sorting everything to read element [0];
    # max() returns the first maximal item, matching a stable reverse sort.
    return max(hits, key=lambda hit: hit[0])

def sortSize(x):
    """Return the hit whose matched text is longest.

    Parameters
    ----------
    x : iterable of (position, text) tuples
        Candidate hits; compared on ``len`` of element 1.

    Returns
    -------
    tuple
        The whole ``(position, text)`` tuple with the longest text. On ties
        the earliest such hit in ``x`` wins.

    Raises
    ------
    IndexError
        If ``x`` is empty.
    """
    hits = list(x)
    if not hits:
        raise IndexError('sortSize() needs at least one match')
    # One pass with max() instead of sorting everything to read element [0];
    # max() returns the first maximal item, matching a stable reverse sort.
    return max(hits, key=lambda hit: len(hit[1]))

# Record, per row, the hit closest to the end of the sequence ('pos') and the
# hit with the longest matched text ('size'). Each cell holds the whole
# (position, text) tuple. List comprehensions are used because a map() object
# is a lazy iterator on Python 3 and cannot be stored as a column. assign()
# returns a new frame, so nothing is written into a slice of df.
dfMotif = dfMotif.assign(
    pos=[sortPos(hits) for hits in dfMotif.matches],
    size=[sortSize(hits) for hits in dfMotif.matches],
)

# sanity check
print('total:',len(df.matches))
print('filter:',len(dfMotif))

display(dfMotif)

total: 800
filter: 23


Unnamed: 0,motif,name,desc,seq,matches,pos,size
0,GERLRVT,WP_019566865.1,MULTISPECIES: sarcosine oxidase subunit alpha...,MRLKDGLIDRSKVLHFTFDGKNYQGHPGDTLASALLANDVRLMGRS...,"[(0.993839835729, GERLR)]","(0.993839835729, GERLR)","(0.993839835729, GERLR)"
1,GERLRVT,WP_038494935.1,MULTISPECIES: agrobactin synthetase subunit F...,MAADNSRMLARRIEQTFDNADEAPRSGSQPSQYEERVWFQQFQDPD...,"[(0.971608832808, RLRT)]","(0.971608832808, RLRT)","(0.971608832808, RLRT)"
2,GERLRVT,WP_038491802.1,MULTISPECIES: sodium:phosphate symporter [Rhi...,MQSTIVIVNLLGAVALLLFGLAQVKDGVTRAFGMKLRSVLATGTQS...,"[(0.983636363636, RLR)]","(0.983636363636, RLR)","(0.983636363636, RLR)"
3,GERLRVT,WP_038493383.1,hypothetical protein [Agrobacterium tumefacie...,MRTISGNEARALIESQLAAHGHGVFSVLAQYRRDNAVAAWHETIRA...,"[(0.978473581213, ERLR)]","(0.978473581213, ERLR)","(0.978473581213, ERLR)"
4,GERLRVT,AHK02717.1,hypothetical protein X971_2856 [Agrobacterium...,MDGAFVSNPGPTWLAVRQALAPYLEPSVVARFTRIMLYAGAMGVAF...,"[(0.975, ERLR)]","(0.975, ERLR)","(0.975, ERLR)"
5,GERLRFT,WP_038494935.1,MULTISPECIES: agrobactin synthetase subunit F...,MAADNSRMLARRIEQTFDNADEAPRSGSQPSQYEERVWFQQFQDPD...,"[(0.971608832808, RLRT)]","(0.971608832808, RLRT)","(0.971608832808, RLRT)"
6,GERLRFT,WP_019566865.1,MULTISPECIES: sarcosine oxidase subunit alpha...,MRLKDGLIDRSKVLHFTFDGKNYQGHPGDTLASALLANDVRLMGRS...,"[(0.993839835729, GERLR)]","(0.993839835729, GERLR)","(0.993839835729, GERLR)"
7,GERLRFT,WP_038491802.1,MULTISPECIES: sodium:phosphate symporter [Rhi...,MQSTIVIVNLLGAVALLLFGLAQVKDGVTRAFGMKLRSVLATGTQS...,"[(0.983636363636, RLR)]","(0.983636363636, RLR)","(0.983636363636, RLR)"
8,GERLRFT,WP_038493383.1,hypothetical protein [Agrobacterium tumefacie...,MRTISGNEARALIESQLAAHGHGVFSVLAQYRRDNAVAAWHETIRA...,"[(0.978473581213, ERLR)]","(0.978473581213, ERLR)","(0.978473581213, ERLR)"
9,GERLRFT,AHK02717.1,hypothetical protein X971_2856 [Agrobacterium...,MDGAFVSNPGPTWLAVRQALAPYLEPSVVARFTRIMLYAGAMGVAF...,"[(0.975, ERLR)]","(0.975, ERLR)","(0.975, ERLR)"


In [368]:
# Save the filtered motif hits; the row index is written as the first column.
dfMotif.to_csv(path_or_buf='pblast_matches.csv')