**Análise de Homologias por BLAST**

In [1]:
pip install biopython

Collecting biopython
  Using cached biopython-1.84-cp312-cp312-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting numpy (from biopython)
  Downloading numpy-2.2.1-cp312-cp312-macosx_14_0_arm64.whl.metadata (62 kB)
Using cached biopython-1.84-cp312-cp312-macosx_11_0_arm64.whl (2.7 MB)
Downloading numpy-2.2.1-cp312-cp312-macosx_14_0_arm64.whl (5.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.1/5.1 MB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy, biopython
Successfully installed biopython-1.84 numpy-2.2.1
Note: you may need to restart the kernel to use updated packages.


In [2]:

from Bio.Blast import NCBIXML 
from Bio.Blast import NCBIWWW 
from Bio import SeqIO

import re

In [3]:
def BLAST_prot (protein_name):
    """
    Função que realiza o BLASTp para a proteína fornecida a partir do ficheiro fasta desta
    """
    protein = SeqIO.read(open(f'{protein_name}.fasta'), format="fasta") 
    result = NCBIWWW.qblast("blastp", "swissprot", protein.seq)
    with open(f'{protein_name}.xml', "w") as save_file:
        save_file.write(result.read())
    save_file.close()
    result.close()

In [4]:
BLAST_prot('ptsp')

In [5]:
def first_alignment (protein_name):
    result_handle = open(f'{protein_name}.xml')
    blast_record = NCBIXML.read(result_handle)

    first = blast_record.alignments[0]
    print("FIRST ALIGNMENT: ", "\nAcession:" + first.accession, "\nHit id:" + first.hit_id)
    print("Definition: " + first.hit_def, "\nAlignment lenght: " , first.length, "\nNumber of HPSs: " , len(first.hsps))
    result_handle.close()

In [6]:
def fliter_blast(protein_name, e_value_threshold, percent_identity_threshold, coverage_threshold):
    """ 
    Função para filtrar os melhores alinhamentos com base no e-value, percent identity e coverage
    Output: Nº de hits do blast, informação sobre os 3 melhores alinhamentos e especificação das espécies a que as 3 proteínas homologas pertencem
    """
    result_handle = open(f'{protein_name}.xml')
    blast_record = NCBIXML.read(result_handle)
    
    print("PARAMETERS: \nDatabase: ", blast_record.database)
    print("Matriz: " + blast_record.matrix)
    print("Gap penalties: " , blast_record.gap_penalties)
    print('\nHits: ', len(blast_record.alignments))
    
    top_alignments = []
    for alignment in blast_record.alignments:
        if 'synthetic construct' not in alignment.title.lower():
            alignment_info = {
                'title': alignment.title,
                'length': alignment.length,
                'e_value': float('inf'),
                'percent_identity': 0.0,
                'coverage': 0.0,
                'query': '',
                'match': '',
                'sbjct': ''       }
            for hsp in alignment.hsps:
                if hsp.expect < alignment_info['e_value'] and \
                 (hsp.identities / hsp.align_length) * 100 >= percent_identity_threshold and \
                 (hsp.align_length / alignment.length) * 100 >= coverage_threshold:

                    alignment_info['e_value'] = hsp.expect
                    alignment_info['percent_identity'] = (hsp.identities / hsp.align_length) * 100
                    alignment_info['coverage'] = (hsp.align_length / alignment.length) * 100
                    alignment_info['query'] = hsp.query[0:75] + '...'
                    alignment_info['match'] = hsp.match[0:75] + '...'
                    alignment_info['sbjct'] = hsp.sbjct[0:75] + '...'

            if alignment_info['e_value'] < e_value_threshold:
                top_alignments.append(alignment_info)
                if alignment_info['coverage'] < coverage_threshold:
                    top_alignments.append(alignment_info)
                    if alignment_info['percent_identity'] < percent_identity_threshold:
                        top_alignments.append(alignment_info)
    for i, alignment_info in enumerate(top_alignments[:4]):
        print(f'****Alignment {i+1}****')
        print('Sequence: ', alignment_info['title'])
        print('Length: ', alignment_info['length'])
        print('E-value: ', alignment_info['e_value'])
        print('Percent identity: {:.2f}%'.format(alignment_info['percent_identity']))
        print('Coverage: {:.2f}%'.format(alignment_info['coverage']))
        print(alignment_info['query'])
        print(alignment_info['match'])
        print(alignment_info['sbjct'], '\n')

        species_list = []

    for alignment_info in top_alignments[:4]:
        title = alignment_info['title']
        match = re.search(r"\[(.*?)\]", title)
        if match:
            species = match.group(1)
            species_list.append(species)
    print("\nOrganisms:")
    for species in species_list:
        print(species)
        
    result_handle.close()

In [7]:
# 1º gene
first_alignment('ptsp')

FIRST ALIGNMENT:  
Acession:Q9K8D3 
Hit id:sp|Q9K8D3.1|
Definition: RecName: Full=Phosphoenolpyruvate-protein phosphotransferase; AltName: Full=Phosphotransferase system, enzyme I [Halalkalibacterium halodurans C-125] 
Alignment lenght:  572 
Number of HPSs:  1


In [9]:
def fliter_blast(result_handle, protein_name, e_value_threshold, percent_identity_threshold, coverage_threshold):
    species_list = []  # Inicializa a variável como uma lista vazia

    # Itera sobre o handle fornecido
    for line in result_handle:
        # Substitua pela lógica real para preencher species_list
        if "some_condition" in line:  # Substitua "some_condition" pela lógica correta
            species = line.strip()  # Ajuste conforme necessário
            species_list.append(species)
    
    print("Organisms:")
    for species in species_list:
        print(species)


In [15]:
e_value_threshold = 1.0
percent_identity_threshold = 50
coverage_threshold = 50


In [17]:
from Bio.Blast import NCBIXML

with open('ptsp.xml') as result_handle:
    blast_record = NCBIXML.read(result_handle)
    print("Número de alinhamentos:", len(blast_record.alignments))


Número de alinhamentos: 50


In [None]:
print("Número de alinhamentos encontrados:", len(blast_record.alignments))
for alignment in blast_record.alignments:
    print("Título do alinhamento:", alignment.title)
    for hsp in alignment.hsps:
        query_cover = (hsp.align_length / blast_record.query_letters) * 100
        print(f"HSP: E-value: {hsp.expect}, Identities: {hsp.identities}, "
              f"Align length: {hsp.align_length}, Query Cover: {query_cover:.2f}%")


Número de alinhamentos encontrados: 50
Título do alinhamento: sp|Q9K8D3.1| RecName: Full=Phosphoenolpyruvate-protein phosphotransferase; AltName: Full=Phosphotransferase system, enzyme I [Halalkalibacterium halodurans C-125]
HSP: E-value: 5.95966e-160, Identities: 228, Align length: 530, Query Cover: 96.89%
Título do alinhamento: sp|O83018.1| RecName: Full=Phosphoenolpyruvate-protein phosphotransferase; AltName: Full=Phosphotransferase system, enzyme I [Bacillus sp. S]
HSP: E-value: 4.75636e-154, Identities: 235, Align length: 530, Query Cover: 96.89%
Título do alinhamento: sp|P42014.1| RecName: Full=Phosphoenolpyruvate-protein phosphotransferase; AltName: Full=Phosphotransferase system, enzyme I [Geobacillus stearothermophilus]
HSP: E-value: 6.92156e-153, Identities: 235, Align length: 530, Query Cover: 96.89%
Título do alinhamento: sp|O69251.1| RecName: Full=Phosphoenolpyruvate-protein phosphotransferase; AltName: Full=Phosphotransferase system, enzyme I [Priestia megaterium]
HSP: E-