In [20]:
from mmseqs_query import run_mmseqs2
import requests
from io import StringIO
from Bio import SeqIO
from pathlib import Path
import re

In [24]:
def get_query_sequence_uniprot(uniprot_name, results_dir, timeout=30):
  """Download a UniProtKB entry as FASTA, save it to disk, and return its sequence.

  Args:
    uniprot_name: Identifier inserted into the UniProt REST URL
      (e.g. 'ADRB2_HUMAN').
    results_dir: Directory (pathlib.Path or str) in which
      '<uniprot_name>.fasta' is written. This function does not create it.
    timeout: Seconds to wait for the UniProt server before giving up.

  Returns:
    The sequence of the downloaded record as a plain string.

  Raises:
    Exception: If the HTTP response status is not OK.
    requests.exceptions.RequestException: On connection failure or timeout.
  """
  url = f'https://rest.uniprot.org/uniprotkb/{uniprot_name}.fasta'
  # Without an explicit timeout, requests can wait forever on a stalled server.
  r = requests.get(url, timeout=timeout)
  if not r.ok:
    raise Exception(
      f'Error retrieving target sequence from Uniprot! (HTTP {r.status_code} for {url})'
    )
  # SeqIO.read expects the handle to contain exactly one record.
  seq_record = SeqIO.read(StringIO(r.text), "fasta")
  fasta_path = Path(results_dir).joinpath(f'{uniprot_name}.fasta')
  SeqIO.write(seq_record, fasta_path, format='fasta')
  return str(seq_record.seq)
  
def remove_inserts_a3m(query_name, raw_msa):
  """Turn a3m alignment lines into FASTA-style lines without insert characters.

  Args:
    query_name: Name written into the first header line (prefixed with '>').
    raw_msa: Iterable of a3m lines; line endings may be present or absent.

  Returns:
    A list of strings, one per input line, each terminated by a single '\\n'.
    The first line is replaced by '>' + query_name. Other header lines keep
    only the text before the first tab. On every other line, any character
    that is not an uppercase letter, '_', '.' or '-' is removed (this drops
    lowercase characters and the original line ending).
  """
  clean_msa = []
  for i, line in enumerate(raw_msa):
    if i == 0:
      # The first line is assumed to be the query header and is replaced wholesale.
      clean_msa.append('>' + query_name)
    elif line.startswith('>'):
      # Keep only the identifier before the first tab. Strip the line ending
      # too: a header with no tab-separated fields would otherwise keep its
      # own newline and produce a blank line once '\n' is appended below.
      name = line.split('\t')[0].rstrip('\r\n')
      clean_msa.append(name)
    else:
      # startswith() above also makes an empty string land here instead of
      # raising IndexError on line[0].
      aligned_seq_without_inserts = re.sub(r'[^A-Z_.-]', r'', line)
      clean_msa.append(aligned_seq_without_inserts)
  return [line + '\n' for line in clean_msa]

In [3]:
results_dir = Path('examples/ADRB2_HUMAN/results')
# exist_ok=True keeps this cell safe to re-run and avoids a check-then-create race.
results_dir.mkdir(parents=True, exist_ok=True)
# Saves the downloaded FASTA into results_dir and returns the sequence as a string.
query_sequence = get_query_sequence_uniprot('ADRB2_HUMAN', results_dir)

In [9]:
# Job label handed to run_mmseqs2; later cells reuse it to build the
# '<jobname>_env' file paths.
jobname = 'ADRB2_HUMAN'
msa_lines = run_mmseqs2(
    query_sequence,
    jobname,
    use_filter=True,
    use_env=True,
)

COMPLETE: 100%|██████████| 150/150 [elapsed: 00:03 remaining: 00:00]


In [13]:
# a3m alignment inside the '<jobname>_env' directory.
# NOTE(review): presumably written by the run_mmseqs2 call above — confirm.
msa_file = f'{jobname}_env/uniref.a3m'

In [14]:
# Load the a3m file as a list of lines, keeping each line ending.
with open(msa_file, 'r') as fid:
    msa_lines = list(fid)

In [25]:
# Reuse jobname instead of repeating the literal so the query header stays in
# sync with the '<jobname>_env' paths used by the surrounding cells.
clean_msa = remove_inserts_a3m(jobname, msa_lines)

In [27]:
# Persist the insert-free alignment next to the a3m file; every entry in
# clean_msa already ends with a newline, so the lines are simply concatenated.
with open(f'{jobname}_env/uniref.fasta', 'w') as fid:
    fid.write(''.join(clean_msa))