**Análise de Homologias por BLAST**

In [2]:
from Bio import SeqIO
from Bio.Blast import NCBIWWW, NCBIXML
import re

def blast_and_filter(gene_names, e_value_threshold=1e-5, percent_identity_threshold=50, coverage_threshold=50):
    """Run a remote BLASTp search per gene and save the hits that pass the filters.

    For every name in ``gene_names`` the sequence is read from
    ``genes/<name>.fasta``, searched against the ``swissprot`` database through
    the NCBI web service, and, for each hit, the first HSP that satisfies all
    three thresholds is written to ``genes/<name>_blast.fasta``.

    Args:
        gene_names: Iterable of gene names; each must have a single-record
            FASTA file at ``genes/<name>.fasta``. Missing files are reported
            and skipped.
        e_value_threshold: Maximum accepted E-value (inclusive).
        percent_identity_threshold: Minimum accepted percent identity,
            computed as identities / alignment length.
        coverage_threshold: Minimum accepted query coverage in percent,
            computed as the aligned query span / query length.

    Returns:
        None. Progress and per-HSP statistics are printed; results go to disk.
    """
    # Import explicitly rather than relying on Bio.SeqIO re-exporting SeqRecord.
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord

    for name_gene in gene_names:
        # Read the query sequence; a missing file skips this gene only.
        try:
            query_seq = SeqIO.read(f"genes/{name_gene}.fasta", "fasta")
        except FileNotFoundError:
            print(f"Arquivo não encontrado para {name_gene}. Pulando...")
            continue

        print(f"Iniciando busca BLAST para {name_gene}...")
        result_handle = NCBIWWW.qblast("blastp", "swissprot", query_seq.seq)
        print(f"Busca BLAST concluída para {name_gene}.")

        output_path = f"genes/{name_gene}_blast.fasta"

        # Parse and filter the results; always release the BLAST result handle.
        try:
            with open(output_path, "w") as output_handle:
                for blast_record in NCBIXML.parse(result_handle):
                    print(f"Número de alinhamentos encontrados para {name_gene}:", len(blast_record.alignments))
                    for alignment in blast_record.alignments:
                        print("Título do alinhamento:", alignment.title)
                        for hsp in alignment.hsps:
                            # Coverage is measured on the query span: align_length
                            # also counts gap columns and could exceed 100%.
                            query_span = abs(hsp.query_end - hsp.query_start) + 1
                            query_cover = (query_span / blast_record.query_letters) * 100
                            print(f"HSP: E-value: {hsp.expect}, Identities: {hsp.identities}, "
                                  f"Align length: {hsp.align_length}, Query Cover: {query_cover:.2f}%")

                            percent_identity = (hsp.identities / hsp.align_length) * 100
                            passes_filters = (
                                hsp.expect <= e_value_threshold
                                and percent_identity >= percent_identity_threshold
                                and query_cover >= coverage_threshold
                            )
                            if not passes_filters:
                                continue

                            # The species name is the bracketed part of the hit title, if present.
                            species_match = re.search(r"\[(.*?)\]", alignment.title)
                            species = species_match.group(1) if species_match else "Unknown species"

                            # hsp.sbjct is the aligned subject segment as a plain string
                            # (it may contain '-' gap characters); the FASTA writer
                            # needs a Seq object, so wrap it.
                            record = SeqRecord(
                                seq=Seq(hsp.sbjct),
                                id=alignment.accession,
                                description=f"E-value: {hsp.expect:.2e}, Identities: {hsp.identities}/{hsp.align_length}, "
                                            f"Query Cover: {query_cover:.2f}%, Percent Identity: {percent_identity:.2f}%, "
                                            f"Species: {species}"
                            )
                            SeqIO.write(record, output_handle, "fasta")
                            break  # Keep only the first HSP that passes, one per hit.
        finally:
            result_handle.close()

        print(f"Resultados filtrados do BLAST para {name_gene} foram salvos em '{output_path}'")



In [11]:
from Bio.Blast import NCBIXML 
from Bio.Blast import NCBIWWW 
from Bio import SeqIO

import re

def BLAST_prot(protein_name):
    """Run a remote BLASTp search for a protein and save the raw XML result.

    The query is read from ``genes/<protein_name>.fasta`` (single-record
    FASTA), searched against the ``swissprot`` database through the NCBI web
    service, and the XML response is written to
    ``genes/<protein_name>_protein.xml``.

    Args:
        protein_name: Base name of the FASTA file inside ``genes/``.

    Returns:
        None. The result is written to disk.
    """
    # Passing the path lets SeqIO open and close the file itself.
    protein = SeqIO.read(f'genes/{protein_name}.fasta', format="fasta")
    result = NCBIWWW.qblast("blastp", "swissprot", protein.seq)
    try:
        with open(f'genes/{protein_name}_protein.xml', "w") as save_file:
            save_file.write(result.read())
    finally:
        # Release the BLAST result handle even if writing the file fails.
        result.close()

def first_alignment(protein_name):
    """Print a short summary of the first hit in a saved BLAST XML result.

    Reads ``genes/<protein_name>_protein.xml`` and prints the accession, hit
    id, definition, length and number of HSPs of the first alignment.

    Args:
        protein_name: Base name used for the XML file inside ``genes/``.

    Returns:
        None. If the result contains no alignments, a notice is printed
        instead of the summary.
    """
    # The record is fully parsed inside the block, so the file can be closed right away.
    with open(f'genes/{protein_name}_protein.xml') as result_handle:
        blast_record = NCBIXML.read(result_handle)

    # A search with no hits would otherwise fail with IndexError below.
    if not blast_record.alignments:
        print(f"No alignments found for {protein_name}.")
        return

    first = blast_record.alignments[0]
    print("FIRST ALIGNMENT: ", "\nAcession:" + first.accession, "\nHit id:" + first.hit_id)
    print("Definition: " + first.hit_def, "\nAlignment lenght: " , first.length, "\nNumber of HPSs: " , len(first.hsps))



def filter_blast(protein_name, e_value_threshold = 0.001, percent_identity_threshold = 50, coverage_threshold = 50):
    """Filter and print the best alignments from a saved BLAST XML result.

    Reads ``genes/<protein_name>_protein.xml``, prints the search parameters,
    and for every hit whose title does not contain "synthetic construct" keeps
    the lowest E-value HSP that meets the identity and coverage thresholds.
    Hits whose selected HSP has an E-value below ``e_value_threshold`` are
    printed, followed by the organism names found in the titles of the first
    four retained hits.

    Args:
        protein_name: Base name used for the XML file inside ``genes/``.
        e_value_threshold: Exclusive upper bound on the E-value of the
            selected HSP.
        percent_identity_threshold: Minimum percent identity, computed as
            identities / alignment length.
        coverage_threshold: Minimum coverage in percent, computed as
            alignment length / hit (subject) sequence length.

    Returns:
        None. All results are printed.
    """
    # The record is fully parsed inside the block, so the file can be closed right away.
    with open(f'genes/{protein_name}_protein.xml') as result_handle:
        blast_record = NCBIXML.read(result_handle)

    print("PARAMETERS: \nDatabase: ", blast_record.database)
    print("Matriz: " + blast_record.matrix)
    print("Gap penalties: " , blast_record.gap_penalties)
    print('\nHits: ', len(blast_record.alignments))

    top_alignments = []
    for alignment in blast_record.alignments:
        if 'synthetic construct' in alignment.title.lower():
            continue

        # Lowest E-value HSP of this hit among those meeting both thresholds.
        best = None
        for hsp in alignment.hsps:
            percent_identity = (hsp.identities / hsp.align_length) * 100
            coverage = (hsp.align_length / alignment.length) * 100
            if percent_identity < percent_identity_threshold or coverage < coverage_threshold:
                continue
            if best is None or hsp.expect < best['e_value']:
                best = {
                    'title': alignment.title,
                    'length': alignment.length,
                    'e_value': hsp.expect,
                    'percent_identity': percent_identity,
                    'coverage': coverage,
                    # Only the first 75 alignment columns are kept for display.
                    'query': hsp.query[0:75] + '...',
                    'match': hsp.match[0:75] + '...',
                    'sbjct': hsp.sbjct[0:75] + '...',
                }

        # Each hit is added at most once; a selected HSP already satisfies the
        # identity and coverage thresholds, so only the E-value is checked here.
        if best is not None and best['e_value'] < e_value_threshold:
            top_alignments.append(best)

    for i, alignment_info in enumerate(top_alignments):
        print(f'****Alignment {i+1}****')
        print('Sequence: ', alignment_info['title'])
        print('Length: ', alignment_info['length'])
        print('E-value: ', alignment_info['e_value'])
        print('Percent identity: {:.2f}%'.format(alignment_info['percent_identity']))
        print('Coverage: {:.2f}%'.format(alignment_info['coverage']))
        print(alignment_info['query'])
        print(alignment_info['match'])
        print(alignment_info['sbjct'], '\n')

    # Organism names are the first bracketed part of the titles of the first four retained hits.
    species_list = []
    for alignment_info in top_alignments[:4]:
        match = re.search(r"\[(.*?)\]", alignment_info['title'])
        if match:
            species_list.append(match.group(1))

    print("\nOrganisms:")
    for species in species_list:
        print(species)


#### **1. Gene ptsP**

In [None]:
# Run the combined remote BLASTp search and filtering for the ptsP gene only.
gene_names = ["ptsP"]
blast_and_filter(gene_names)

Iniciando busca BLAST para ptsP...


In [3]:
# Run the remote BLASTp search for ptsP and store the XML result under genes/.
BLAST_prot('ptsP')



In [12]:
# Print a summary of the first alignment in the saved ptsP BLAST XML.
first_alignment('ptsP')



FIRST ALIGNMENT:  
Acession:Q9K8D3 
Hit id:sp|Q9K8D3.1|
Definition: RecName: Full=Phosphoenolpyruvate-protein phosphotransferase; AltName: Full=Phosphotransferase system, enzyme I [Halalkalibacterium halodurans C-125] 
Alignment lenght:  572 
Number of HPSs:  1


In [13]:
# Filter the saved ptsP hits using the function's default thresholds.
filter_blast('ptsP')

PARAMETERS: 
Database:  swissprot
Matriz: BLOSUM62
Gap penalties:  (11, 1)

Hits:  50


#### **2. Gene ButyrylCoA**

In [4]:
# Run the combined remote BLASTp search and filtering for the butyrylCoA gene only.
gene_names = ["butyrylCoA"]
blast_and_filter(gene_names)

Iniciando busca BLAST para butyrylCoA...
Busca BLAST concluída para butyrylCoA.
Número de alinhamentos encontrados para butyrylCoA: 8
Título do alinhamento: sp|G2SYC0.1| RecName: Full=Butyryl-CoA:acetate CoA-transferase; Short=Butyryl-CoA CoA-transferase [Roseburia hominis A2-183]
HSP: E-value: 0.0, Identities: 332, Align length: 447, Query Cover: 99.78%
Título do alinhamento: sp|B0MC58.1| RecName: Full=Butyryl-CoA:acetate CoA-transferase; AltName: Full=Butyryl-CoA CoA-transferase [Anaerostipes caccae L1-92]
HSP: E-value: 0.0, Identities: 318, Align length: 447, Query Cover: 99.78%
Título do alinhamento: sp|Q0AVM5.1| RecName: Full=Probable butyrate:acetyl-CoA coenzyme A-transferase; Short=Butyrate CoA-transferase [Syntrophomonas wolfei subsp. wolfei str. Goettingen G311]
HSP: E-value: 2.47933e-164, Identities: 229, Align length: 444, Query Cover: 99.11%
Título do alinhamento: sp|P38942.3| RecName: Full=4-hydroxybutyrate coenzyme A transferase [Clostridium kluyveri DSM 555]
HSP: E-value

