In [5]:
from Bio import SeqIO, AlignIO, Phylo
from Bio.SeqRecord import SeqRecord
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
import subprocess
import os
import matplotlib.pyplot as plt
import re

def process_gene_with_clustal(gene_name, fasta_file):
    # Create output directory
    output_dir = "filogenetic_analysis"
    os.makedirs(output_dir, exist_ok=True)

    # Read the FASTA file
    sequences = []
    with open(fasta_file) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            sequences.append(record)

    # Save the extracted sequences in a FASTA file
    trimmed_fasta = os.path.join(output_dir, f"{gene_name}_trimmed.fasta")
    SeqIO.write(sequences, trimmed_fasta, "fasta")

    # Path to the ClustalW2 executable
    clustal_exe = "clustalw2"  # Replace with the correct executable path if necessary

    # Define the alignment output file
    alignment_file = os.path.join(output_dir, f"{gene_name}_alignment.fasta")

    # Execute the ClustalW2 command to perform the alignment
    clustal_command = [
        clustal_exe,
        f"/INFILE={trimmed_fasta}",
        f"/OUTFILE={alignment_file}",
        "/OUTPUT=FASTA",
        "/QUIET"
    ]

    print(f"Running ClustalW2 for gene {gene_name}...")
    try:
        subprocess.run(clustal_command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error executing ClustalW2: {e}")
        return

    # Check if the alignment was successfully generated
    if not os.path.exists(alignment_file):
        raise FileNotFoundError(f"Alignment was not generated for gene {gene_name}.")

    # Read the generated alignment
    alignment = AlignIO.read(alignment_file, "fasta")

    # Ensure unique IDs and preserve important information
    unique_ids = set()
    query_id = None  # Para armazenar a identificação da query, se necessário

    for record in alignment:
        original_id = record.id
        full_description = record.description

        # Tenta extrair o nome da espécie da descrição, se disponível
        species_match = re.search(r"\[(.*?)\]", full_description)
        if species_match:
            full_id = species_match.group(1).replace(" ", "_")  # Usa o nome da espécie
        else:
            full_id = original_id  # Caso contrário, mantém o ID original

        # Se a sequência corresponde à tua query, guarda a referência
        if "query" in full_description.lower() or "query" in original_id.lower():
            query_id = full_id

        # Evitar duplicados
        if full_id in unique_ids:
            print(f"Duplicate ID found: {full_id}. Skipping...")
            continue

        # Atualizar ID da sequência
        record.id = full_id
        record.description = full_id
        unique_ids.add(full_id)

    # Verifica se a query foi encontrada
    if query_id:
        print(f"Query identificada: {query_id}")
    else:
        print("Aviso: A query não foi identificada no alinhamento.")


    # Compute distance matrix
    calculator = DistanceCalculator('blosum62')
    try:
        dm = calculator.get_distance(alignment)
        print(f"Distance Matrix:\n{dm}")
    except ValueError as e:
        print(f"Error calculating distance matrix: {e}")
        return

    # Build trees
    constructor = DistanceTreeConstructor()
    upgma_tree = constructor.upgma(dm)
    nj_tree = constructor.nj(dm)

    # Destacar a query e remover 'Inner' labels
    for tree in [upgma_tree, nj_tree]:
        for clade in tree.find_clades():
            if 'Inner' in (clade.name or ''):
                clade.name = ""
            if clade.name == query_id:
                clade.color = 'red'  # Destaca a query a vermelho
                clade.name = f'*** QUERY: {clade.name} ***'
            elif 'Homo' in (clade.name or ''):
                clade.color = 'orange'
                clade.name = f'*** {clade.name} ***'

    # Save trees
    Phylo.write(upgma_tree, os.path.join(output_dir, f'{gene_name}_upgma_tree.nwk'), 'newick')
    Phylo.write(nj_tree, os.path.join(output_dir, f'{gene_name}_nj_tree.nwk'), 'newick')

    # Process and plot trees
    for tree, tree_type in [(upgma_tree, 'UPGMA'), (nj_tree, 'NJ')]:
        # Create figure and axes
        fig = plt.figure(figsize=(35, 15))  # Increase figure size
        ax = fig.add_subplot(1, 1, 1)

        # Draw tree on created axes
        Phylo.draw(tree, do_show=False, axes=ax)
        for label in ax.get_xticklabels() + ax.get_yticklabels():
            label.set_fontsize(18)
        for text in ax.findobj(match=plt.Text):  # Find all text objects
            text.set_fontsize(22)

        # Add title
        plt.title(f'{gene_name} - {tree_type} Tree', fontsize=30)

        # Save tree image
        tree_img = os.path.join(output_dir, f'{gene_name}_{tree_type}_tree.png')
        plt.savefig(tree_img, bbox_inches='tight')  # 'bbox_inches' ensures nothing gets cut off
        plt.close()

    print(f"Processing completed for gene {gene_name}. Results saved in '{output_dir}'.")


In [6]:
from Bio import SeqIO, AlignIO, Phylo
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
import subprocess
import os
import matplotlib.pyplot as plt
import re

def find_query(query_name, query_dir="genes"):
    """Procura o ficheiro da query na pasta especificada."""
    query_path = os.path.join(query_dir, f"{query_name}.fasta")
    if os.path.exists(query_path):
        return query_path
    else:
        raise FileNotFoundError(f"Query {query_name} não encontrada em {query_dir}.")

def process_gene_with_clustal(gene_name, fasta_file, query_name):
    output_dir = "filogenetic_analysis"
    os.makedirs(output_dir, exist_ok=True)

    # Encontrar a query
    query_path = find_query(query_name)

    # Ler a query
    query_record = next(SeqIO.parse(query_path, "fasta"))
    query_id = query_record.id

    # Ler as sequências do FASTA e filtrar por nome de espécie
    sequences = []
    species_seen = set()

    with open(fasta_file) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            species_match = re.search(r"\[(.*?)\]", record.description)
            if species_match:
                species_name = species_match.group(1).replace(" ", "_")
            else:
                species_name = record.id  # Fallback para ID caso não haja nome da espécie

            if species_name not in species_seen:
                species_seen.add(species_name)
                record.id = species_name  # Atualizar ID para o nome da espécie
                record.description = species_name
                sequences.append(record)

    # Se houver mais de 10 sequências, escolher as 10 maiores
    if len(sequences) > 10:
        sequences = sorted(sequences, key=lambda x: len(x.seq), reverse=True)[:10]

    # Adicionar a query à lista
    sequences.insert(0, query_record)

    # Guardar as sequências filtradas
    trimmed_fasta = os.path.join(output_dir, f"{gene_name}_trimmed.fasta")
    SeqIO.write(sequences, trimmed_fasta, "fasta")

    # Caminho do ClustalW2
    clustal_exe = "clustalw2"

    # Definir ficheiro de saída do alinhamento
    alignment_file = os.path.join(output_dir, f"{gene_name}_alignment.fasta")

    # Executar ClustalW2 para alinhamento
    clustal_command = [
        clustal_exe,
        f"/INFILE={trimmed_fasta}",
        f"/OUTFILE={alignment_file}",
        "/OUTPUT=FASTA",
        "/QUIET"
    ]

    print(f"Running ClustalW2 for gene {gene_name}...")
    try:
        subprocess.run(clustal_command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error executing ClustalW2: {e}")
        return

    # Verificar se o alinhamento foi gerado
    if not os.path.exists(alignment_file):
        raise FileNotFoundError(f"Alignment was not generated for gene {gene_name}.")

    # Ler o alinhamento gerado
    alignment = AlignIO.read(alignment_file, "fasta")

    # Calcular a matriz de distâncias
    calculator = DistanceCalculator('blosum62')
    try:
        dm = calculator.get_distance(alignment)
    except ValueError as e:
        print(f"Error calculating distance matrix: {e}")
        return

    # Construir árvores filogenéticas
    constructor = DistanceTreeConstructor()
    upgma_tree = constructor.upgma(dm)
    nj_tree = constructor.nj(dm)

    # Destacar a query e formatar nomes
    for tree in [upgma_tree, nj_tree]:
        for clade in tree.find_clades():
            if clade.name == query_id:
                clade.color = 'red'
                clade.name = f'*** QUERY: {clade.name} ***'

    # Guardar árvores
    Phylo.write(upgma_tree, os.path.join(output_dir, f"{gene_name}_upgma_tree.nwk"), "newick")
    Phylo.write(nj_tree, os.path.join(output_dir, f"{gene_name}_nj_tree.nwk"), "newick")

    # Gerar imagens das árvores
    for tree, tree_type in [(upgma_tree, "UPGMA"), (nj_tree, "NJ")]:
        fig = plt.figure(figsize=(35, 15))
        ax = fig.add_subplot(1, 1, 1)
        Phylo.draw(tree, do_show=False, axes=ax)
        plt.title(f"{gene_name} - {tree_type} Tree", fontsize=30)
        plt.savefig(os.path.join(output_dir, f"{gene_name}_{tree_type}_tree.png"), bbox_inches="tight")
        plt.close()

    print(f"Processing completed for gene {gene_name}. Results saved in '{output_dir}'.")


In [9]:
from Bio import SeqIO, AlignIO, Phylo
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
import subprocess
import os
import matplotlib.pyplot as plt
import re

def process_gene_with_clustal(gene_name):
    # Definir diretórios
    genes_dir = "genes"
    blast_results_dir = "blast_results"
    output_dir = "filogenetic_analysis"
    os.makedirs(output_dir, exist_ok=True)

    # Buscar a query principal na pasta 'genes'
    query_path = os.path.join(genes_dir, f"{gene_name}.fasta")
    if not os.path.exists(query_path):
        raise FileNotFoundError(f"Query file not found: {query_path}")
    
    query_seq = next(SeqIO.parse(query_path, "fasta"))  # Assume apenas uma query

    # Buscar sequências do BLAST na pasta 'blast_results'
    blast_fasta = os.path.join(blast_results_dir, f"{gene_name}_blast.fasta")
    if not os.path.exists(blast_fasta):
        raise FileNotFoundError(f"BLAST results file not found: {blast_fasta}")

    # Filtrar sequências com nomes de espécies diferentes
    unique_species = {}
    with open(blast_fasta) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            species_match = re.search(r"\[(.*?)\]", record.description)
            if species_match:
                species_name = species_match.group(1).replace(" ", "_")
                if species_name not in unique_species:
                    unique_species[species_name] = record
    
    # Selecionar até 10 melhores sequências
    selected_seqs = list(unique_species.values())[:10]
    selected_seqs.insert(0, query_seq)  # Garantir que a query está incluída

    print(f"Query: {query_seq.id} - {query_seq.description}")

    print(f"Ficheiro BLAST encontrado? {os.path.exists(blast_fasta)}")
    if os.path.exists(blast_fasta):
        print(f"Tamanho do ficheiro BLAST: {os.path.getsize(blast_fasta)} bytes")

    print("Sequências extraídas do BLAST:")
    unique_species = {}
    with open(blast_fasta) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            print(f"Encontrado: {record.id} - {record.description}")  # Debug

            species_match = re.search(r"\[(.*?)\]", record.description)
            if species_match:
                species_name = species_match.group(1).replace(" ", "_")
                print(f"  -> Nome da espécie extraído: {species_name}")
                if species_name not in unique_species:
                    unique_species[species_name] = record
            else:
                print(f"  -> Aviso: Não foi possível extrair nome da espécie de {record.description}")

    print(f"Total de espécies únicas: {len(unique_species)}")



    # Guardar no ficheiro trimmed
    trimmed_fasta = os.path.join(output_dir, f"{gene_name}_trimmed.fasta")
    SeqIO.write(selected_seqs, trimmed_fasta, "fasta")

    # Executar ClustalW2 para alinhamento
    clustal_exe = "clustalw2"
    alignment_file = os.path.join(output_dir, f"{gene_name}_alignment.fasta")
    clustal_command = [
        clustal_exe,
        f"/INFILE={trimmed_fasta}",
        f"/OUTFILE={alignment_file}",
        "/OUTPUT=FASTA",
        "/QUIET"
    ]
    subprocess.run(clustal_command, check=True)

    # Ler alinhamento
    alignment = AlignIO.read(alignment_file, "fasta")
    
    # Calcular matriz de distâncias
    calculator = DistanceCalculator('blosum62')
    dm = calculator.get_distance(alignment)

    # Construir árvores
    constructor = DistanceTreeConstructor()
    upgma_tree = constructor.upgma(dm)
    nj_tree = constructor.nj(dm)
    
    # Destacar a query na árvore
    for tree in [upgma_tree, nj_tree]:
        for clade in tree.find_clades():
            if clade.name == query_seq.id:
                clade.color = 'red'
                clade.name = f'*** QUERY: {clade.name} ***'
    
    # Guardar árvores
    Phylo.write(upgma_tree, os.path.join(output_dir, f'{gene_name}_upgma_tree.nwk'), 'newick')
    Phylo.write(nj_tree, os.path.join(output_dir, f'{gene_name}_nj_tree.nwk'), 'newick')
    
    # Plotar árvores
    for tree, tree_type in [(upgma_tree, 'UPGMA'), (nj_tree, 'NJ')]:
        fig = plt.figure(figsize=(35, 15))
        ax = fig.add_subplot(1, 1, 1)
        Phylo.draw(tree, do_show=False, axes=ax)
        plt.title(f'{gene_name} - {tree_type} Tree', fontsize=30)
        plt.savefig(os.path.join(output_dir, f'{gene_name}_{tree_type}_tree.png'), bbox_inches='tight')
        plt.close()
    
    print(f"Processing completed for gene {gene_name}. Results saved in '{output_dir}'.")


In [10]:
process_gene_with_clustal("ptsP")

Query: WP_005925321.1 - WP_005925321.1 phosphoenolpyruvate--protein phosphotransferase [Faecalibacterium prausnitzii]
Ficheiro BLAST encontrado? True
Tamanho do ficheiro BLAST: 31203 bytes
Sequências extraídas do BLAST:
Encontrado: Q9K8D3 - Q9K8D3 E-value: 5.96e-160, Identities: 228/530, Query Cover: 96.89%, Percent Identity: 43.02%, Species: Halalkalibacterium halodurans C-125
  -> Aviso: Não foi possível extrair nome da espécie de Q9K8D3 E-value: 5.96e-160, Identities: 228/530, Query Cover: 96.89%, Percent Identity: 43.02%, Species: Halalkalibacterium halodurans C-125
Encontrado: O83018 - O83018 E-value: 4.76e-154, Identities: 235/530, Query Cover: 96.89%, Percent Identity: 44.34%, Species: Bacillus sp. S
  -> Aviso: Não foi possível extrair nome da espécie de O83018 E-value: 4.76e-154, Identities: 235/530, Query Cover: 96.89%, Percent Identity: 44.34%, Species: Bacillus sp. S
Encontrado: P42014 - P42014 E-value: 6.92e-153, Identities: 235/530, Query Cover: 96.89%, Percent Identity: 

ValueError: No records found in handle