In [82]:
import re
import os
import numpy as np
import pandas as pd
import itertools

from IPython.display import display

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context("notebook", font_scale=1)
matplotlib.style.use('ggplot')
%matplotlib inline 

import warnings
warnings.filterwarnings('ignore')

In [83]:
# match this pattern G[E/D]R[L/M]R[V/F]T
m = ['G',['E','D'],'R',['L','M'],'R',['V','F'],'T']
mp = [['E','D'],['L','M'],['V','F']]

motifs = []
for i in itertools.product(mp[0],mp[1],mp[2]):
    motifs.append(''.join([m[0],i[0],m[2],i[1],m[4],i[2],m[6]]))

# write out a file of target sequences
!rm blast_targets.fasta 
!touch blast_targets.fasta 
for i,m in enumerate(motifs):
    ! echo '>target'{i} >> blast_targets.fasta 
    ! echo {m} >> blast_targets.fasta 

## Fetch pBLAST against Agrobacterium

In [75]:
# TODO: Get query against NCBIWWW to work instead of manually downloading the files
# from Bio.Blast import NCBIWWW 
# result_handle = NCBIWWW.qblast("blastp", "Microbial proteins from nr", motifs[0])
# blast_file = open("Blast_Results/"+motifs[0]+".xml", "w")  

# blast_file.write(result_handle.read()) 
# blast_file.close()# tidy up
# result_handle.close()

In [84]:
# Currently works based on first 100 results for each motif
# https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastSearch&PROG_DEFAULTS=on&PROG_DEF=blastp&BLAST_SPEC=MicrobialGenomes_1435057&DB_GROUP=AllMG

## Construct a dataframe of all the motifs

In [88]:
from Bio import SeqIO
from Bio.Alphabet import IUPAC

dataFrames = []

for motif in motifs:
    fasta_sequences = SeqIO.parse(open('Blast_Results/'+motif+'.fasta'),'fasta',alphabet=IUPAC.protein)
    for fasta in fasta_sequences:
        row = [motif,fasta.id, fasta.description, fasta.seq.tostring()]
        df = pd.DataFrame(data=[row],columns=['motif','name', 'desc', 'seq'])
        dataFrames.append(df)

df = pd.concat(dataFrames,ignore_index=True)
df.sample(10)

Unnamed: 0,motif,name,desc,seq
139,GERLRFT,WP_003516634.1,WP_003516634.1 MULTISPECIES: aconitate hydrata...,MPKSLDSFHCRSVLTVDGKDYVYFSLPKAEANGLKGVSKLPYSMKV...
722,GDRMRFT,AHK00242.1,AHK00242.1 DNA mismatch repair protein MutS [A...,MEQYIEIKANNPGSLLFYRMGDFYELFFDDAVEASRSLGITLTKRG...
657,GDRMRVT,AHK03178.1,AHK03178.1 maltose operon transcriptional repr...,MAENMKLKEFAEKVGLSPTTVSRALGGYPEVREETRQRVMDAALKY...
761,GDRMRFT,WP_038492348.1,WP_038492348.1 MULTISPECIES: polysaccharide bi...,MQSLIKPQNLTATLMGKTGDTLLWLLAALHVTAVAILFATLLSLGE...
265,GERMRVT,WP_038493908.1,WP_038493908.1 MULTISPECIES: peptide ABC trans...,MSFTLRLLRSFEGAAGAIILTLLAVTALAAPLLFPGDPLSIVGEPL...
292,GERMRVT,AHK01130.1,AHK01130.1 ATP-dependent protease La [Agrobact...,MKGNDMTNITSAASGGIYPVLPLRDIVVFPHMIVPLFVGREKSIRA...
399,GERMRFT,WP_003515403.1,WP_003515403.1 MULTISPECIES: 3-deoxy-7-phospho...,MAQNWTPGSWRQKPIQQVPEYPDAAALAATEATLATYPPLVFAGEA...
681,GDRMRVT,WP_003516002.1,WP_003516002.1 MULTISPECIES: phosphoribosylami...,MNRRRRIYEGKAKILYEGPEPGTLIQFFKDDATAFNKKKHEVIDGK...
628,GDRMRVT,WP_038497356.1,WP_038497356.1 MULTISPECIES: type I secretion ...,MPLSDMTTEQWPDPNAAGTDFASWSEALQYVARHYGVPFSPGGAQQ...
184,GERLRFT,WP_003513631.1,WP_003513631.1 MULTISPECIES: ribosome biogenes...,MSFTVAIVGRPNVGKSTLFNRLVGKKLALVDDTPGVTRDRRPGDAK...


In [112]:
# df.iloc[0].seq
# (G?E?RMRF?T?)
# /(G?E?RMRF?T?)+g/
# re.findall(regex, line)

for m in motifs:
    regex = '('+m[0]+'?'+m[1]+'?'+m[2:4]+m[5]+'?'+m[6]+'?)'
    matches = re.finditer(regex, df.iloc[0].seq)
    results = [(len(df.iloc[0].seq)-m.start(0),m.group(0)) for m in matches]
    print(results)

[(168, 'RLT'), (37, 'ERL')]
[(168, 'RLT'), (37, 'ERL')]
[]
[]
[(169, 'DRLT'), (36, 'RL')]
[(169, 'DRLT'), (36, 'RL')]
[]
[]
