In [22]:
from Bio.Blast import NCBIXML 
from Bio.Blast import NCBIWWW 
from Bio import SeqIO

import re

In [23]:
def BLAST_prot (protein_name):
    """
    Função que realiza o BLASTp para a proteína fornecida a a partir do ficheiro fasta desta
    """
    protein = SeqIO.read(open(f'{protein_name}.fasta'), format="fasta") 
    result = NCBIWWW.qblast("blastp", "swissprot", protein.seq)
    with open(f'{protein_name}_protein.xml', "w") as save_file:
        save_file.write(result.read())
    save_file.close()
    result.close()

In [24]:
BLAST_prot('genes/ptsP')

ValueError: Error message from NCBI: Message ID#24 Error: Failed to read the Blast query: Nucleotide FASTA provided for protein sequence

In [10]:
BLAST_prot('genes/butyrylCoA')

In [11]:
def first_alignment (protein_name):
    result_handle = open(f'{protein_name}_protein.xml')
    blast_record = NCBIXML.read(result_handle)

    first = blast_record.alignments[0]
    print("FIRST ALIGNMENT: ", "\nAcession:" + first.accession, "\nHit id:" + first.hit_id)
    print("Definition: " + first.hit_def, "\nAlignment lenght: " , first.length, "\nNumber of HPSs: " , len(first.hsps))
    result_handle.close()

In [12]:
def fliter_blast(protein_name, e_value_threshold, percent_identity_threshold, coverage_threshold):
    """ 
    Função para filtrar os melhores alinhamentos com base no e-value, percent identity e coverage
    Output: Nº de hits do blast, informação sobre os 3 melhores alinhamentos e especificação das espécies a que as 3 proteínas homologas pertencem
    """
    result_handle = open(f'{protein_name}_protein.xml')
    blast_record = NCBIXML.read(result_handle)
    
    print("PARAMETERS: \nDatabase: ", blast_record.database)
    print("Matriz: " + blast_record.matrix)
    print("Gap penalties: " , blast_record.gap_penalties)
    print('\nHits: ', len(blast_record.alignments))
    
    top_alignments = []
    for alignment in blast_record.alignments:
        if 'synthetic construct' not in alignment.title.lower():
            alignment_info = {
                'title': alignment.title,
                'length': alignment.length,
                'e_value': float('inf'),
                'percent_identity': 0.0,
                'coverage': 0.0,
                'query': '',
                'match': '',
                'sbjct': ''       }
            for hsp in alignment.hsps:
                if hsp.expect < alignment_info['e_value'] and \
                 (hsp.identities / hsp.align_length) * 100 >= percent_identity_threshold and \
                 (hsp.align_length / alignment.length) * 100 >= coverage_threshold:

                    alignment_info['e_value'] = hsp.expect
                    alignment_info['percent_identity'] = (hsp.identities / hsp.align_length) * 100
                    alignment_info['coverage'] = (hsp.align_length / alignment.length) * 100
                    alignment_info['query'] = hsp.query[0:75] + '...'
                    alignment_info['match'] = hsp.match[0:75] + '...'
                    alignment_info['sbjct'] = hsp.sbjct[0:75] + '...'

            if alignment_info['e_value'] < e_value_threshold:
                top_alignments.append(alignment_info)
                if alignment_info['coverage'] < coverage_threshold:
                    top_alignments.append(alignment_info)
                    if alignment_info['percent_identity'] < percent_identity_threshold:
                        top_alignments.append(alignment_info)
    for i, alignment_info in enumerate(top_alignments[:4]):
        print(f'****Alignment {i+1}****')
        print('Sequence: ', alignment_info['title'])
        print('Length: ', alignment_info['length'])
        print('E-value: ', alignment_info['e_value'])
        print('Percent identity: {:.2f}%'.format(alignment_info['percent_identity']))
        print('Coverage: {:.2f}%'.format(alignment_info['coverage']))
        print(alignment_info['query'])
        print(alignment_info['match'])
        print(alignment_info['sbjct'], '\n')

        species_list = []

    for alignment_info in top_alignments[:4]:
        title = alignment_info['title']
        match = re.search(r"\[(.*?)\]", title)
        if match:
            species = match.group(1)
            species_list.append(species)
    print("\nOrganisms:")
    for species in species_list:
        print(species)
        
    result_handle.close()
    

# ptsP

In [17]:
first_alignment('genes/ptsP')

FIRST ALIGNMENT:  
Acession:Q9K8D3 
Hit id:sp|Q9K8D3.1|
Definition: RecName: Full=Phosphoenolpyruvate-protein phosphotransferase; AltName: Full=Phosphotransferase system, enzyme I [Halalkalibacterium halodurans C-125] 
Alignment lenght:  572 
Number of HPSs:  1


In [18]:
e_value_threshold = 0.001
percent_identity_threshold = 90
coverage_threshold = 90 
fliter_blast('genes/ptsP', e_value_threshold, percent_identity_threshold, coverage_threshold)

PARAMETERS: 
Database:  swissprot
Matriz: BLOSUM62
Gap penalties:  (11, 1)

Hits:  50

Organisms:


UnboundLocalError: local variable 'species_list' referenced before assignment