## **Filogenetic Analysis**

The code performs a multiple sequence alignment of seven sequences, selecting the best alignments of homologous sequences. Then, it processes the calculation of the distance matrix based on the obtained alignment using the 'BLOSUM62' substitution matrix. Finally, it constructs phylogenetic trees using the UPGMA and NJ methods for each gene under analysis.


In [21]:
from Bio import SeqIO, AlignIO, Phylo
from Bio.SeqRecord import SeqRecord
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
import subprocess
import os
import matplotlib.pyplot as plt
import re
from Bio.Blast import NCBIXML

def process_gene_with_clustal(gene_name, fasta_file):
    # Create output directory
    output_dir = "filogenetic_analysis"
    os.makedirs(output_dir, exist_ok=True)

    # Read the FASTA file
    sequences = []
    with open(fasta_file) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            sequences.append(record)

    # Save the extracted sequences in a FASTA file
    trimmed_fasta = os.path.join(output_dir, f"{gene_name}_trimmed.fasta")
    SeqIO.write(sequences, trimmed_fasta, "fasta")

    # Path to the ClustalW2 executable
    clustal_exe = "clustalw2"  # Replace with the correct executable path if necessary

    # Define the alignment output file
    alignment_file = os.path.join(output_dir, f"{gene_name}_alignment.fasta")

    # Execute the ClustalW2 command to perform the alignment
    clustal_command = [
        clustal_exe,
        f"/INFILE={trimmed_fasta}",
        f"/OUTFILE={alignment_file}",
        "/OUTPUT=FASTA",
        "/QUIET"
    ]

    print(f"Running ClustalW2 for gene {gene_name}...")
    try:
        subprocess.run(clustal_command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error executing ClustalW2: {e}")
        return

    # Check if the alignment was successfully generated
    if not os.path.exists(alignment_file):
        raise FileNotFoundError(f"Alignment was not generated for gene {gene_name}.")

    # Read the generated alignment
    alignment = AlignIO.read(alignment_file, "fasta")

    # Ensure unique IDs and preserve important information
    unique_ids = set()
    query_id = None  # Para armazenar a identificação da query, se necessário

    for record in alignment:
        original_id = record.id
        full_description = record.description

        # Tenta extrair o nome da espécie da descrição, se disponível
        species_match = re.search(r"Species:\s*(.+)", full_description)
        if species_match:
            full_id = species_match.group(1).replace(" ", "_")  # Usa o nome da espécie
        else:
            full_id = original_id  # Caso contrário, mantém o ID original

        # Se a sequência corresponde à tua query, guarda a referência
        if "query" in full_description.lower() or "query" in original_id.lower():
            query_id = full_id

        # Evitar duplicados
        if full_id in unique_ids:
            print(f"Duplicate ID found: {full_id}. Skipping...")
            continue

        # Atualizar ID da sequência
        record.id = full_id
        record.description = full_id
        unique_ids.add(full_id)

    # Verifica se a query foi encontrada
    if query_id:
        print(f"Query identificada: {query_id}")
    else:
        print("Aviso: A query não foi identificada no alinhamento.")


    # Compute distance matrix
    calculator = DistanceCalculator('blosum62')
    try:
        dm = calculator.get_distance(alignment)
        print(f"Distance Matrix:\n{dm}")
    except ValueError as e:
        print(f"Error calculating distance matrix: {e}")
        return

    # Build trees
    constructor = DistanceTreeConstructor()
    upgma_tree = constructor.upgma(dm)
    nj_tree = constructor.nj(dm)

    # Destacar a query e remover 'Inner' labels
    for tree in [upgma_tree, nj_tree]:
        for clade in tree.find_clades():
            if 'Inner' in (clade.name or ''):
                clade.name = ""
            if clade.name == query_id:
                clade.color = 'red'  # Destaca a query a vermelho
                clade.name = f'*** QUERY: {clade.name} ***'
            elif 'Homo' in (clade.name or ''):
                clade.color = 'orange'
                clade.name = f'*** {clade.name} ***'

    # Save trees
    Phylo.write(upgma_tree, os.path.join(output_dir, f'{gene_name}_upgma_tree.nwk'), 'newick')
    Phylo.write(nj_tree, os.path.join(output_dir, f'{gene_name}_nj_tree.nwk'), 'newick')

    # Process and plot trees
    for tree, tree_type in [(upgma_tree, 'UPGMA'), (nj_tree, 'NJ')]:
        # Create figure and axes
        fig = plt.figure(figsize=(35, 15))  # Increase figure size
        ax = fig.add_subplot(1, 1, 1)

        # Draw tree on created axes
        Phylo.draw(tree, do_show=False, axes=ax)
        for label in ax.get_xticklabels() + ax.get_yticklabels():
            label.set_fontsize(18)
        for text in ax.findobj(match=plt.Text):  # Find all text objects
            text.set_fontsize(22)

        # Add title
        plt.title(f'{gene_name} - {tree_type} Tree', fontsize=30)

        # Save tree image
        tree_img = os.path.join(output_dir, f'{gene_name}_{tree_type}_tree.png')
        plt.savefig(tree_img, bbox_inches='tight')  # 'bbox_inches' ensures nothing gets cut off
        plt.close()

    print(f"Processing completed for gene {gene_name}. Results saved in '{output_dir}'.")


In [41]:
from Bio import SeqIO, AlignIO, Phylo
from Bio.SeqRecord import SeqRecord
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
import subprocess
import os
import matplotlib.pyplot as plt
import re
from Bio.Blast import NCBIXML

def process_gene_with_clustal(gene_name):
    # Check if the query file exists
    query_file = os.path.join("genes", f"{gene_name}.fasta")
    if not os.path.exists(query_file):
        print(f"Error: Query file '{query_file}' not found.")
        return

    # Check if the blast results file exists
    blast_file = os.path.join("blast_results", f"{gene_name}_blast.fasta")
    blast_sequences = list(SeqIO.parse(blast_file, "fasta"))

    if not os.path.exists(blast_file):
        print(f"Error: BLAST results file '{blast_file}' not found.")
        return

    # Selecionar as 10 melhores sequências do BLAST
    top_hits = blast_sequences[:10]

    # Adicionar a query às sequências selecionadas
    sequences = [query_sequence] + top_hits

    # Salvar as sequências selecionadas em um arquivo FASTA temporário
    temp_fasta = os.path.join("filogenetic_analysis", f"{gene_name}_temp.fasta")
    SeqIO.write(sequences, temp_fasta, "fasta")

    # Realizar o alinhamento com ClustalW2
    alignment_file = os.path.join("filogenetic_analysis", f"{gene_name}_alignment.fasta")
    clustal_command = [
        "clustalw2",
        f"-INFILE={temp_fasta}",
        f"-OUTFILE={alignment_file}",
        "-OUTPUT=FASTA",
        "-QUIET"
    ]
    subprocess.run(clustal_command, check=True)

    # Ler o alinhamento resultante
    alignment = AlignIO.read(alignment_file, "fasta")

     # Compute distance matrix
    calculator = DistanceCalculator('blosum62')
    dm = calculator.get_distance(alignment)

    # Build trees
    constructor = DistanceTreeConstructor()
    upgma_tree = constructor.upgma(dm)
    nj_tree = constructor.nj(dm)

    # Save and plot trees
    for tree, tree_type in [(upgma_tree, 'UPGMA'), (nj_tree, 'NJ')]:
        Phylo.write(tree, os.path.join(output_dir, f'{gene_name}_{tree_type}_tree.nwk'), 'newick')
        fig = plt.figure(figsize=(35, 15))
        Phylo.draw(tree, do_show=False)
        plt.title(f'{gene_name} - {tree_type} Tree', fontsize=30)
        plt.savefig(os.path.join(output_dir, f'{gene_name}_{tree_type}_tree.png'), bbox_inches='tight')
        plt.close()

    print(f"Processing completed for gene {gene_name}. Results saved in '{output_dir}'.")


In [43]:
from Bio import SeqIO, AlignIO, Phylo
from Bio.SeqRecord import SeqRecord
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
import subprocess
import os
import matplotlib.pyplot as plt

def process_gene_with_clustal(gene_name):
    # Check if the query file exists
    query_file = os.path.join("genes", f"{gene_name}.fasta")
    if not os.path.exists(query_file):
        print(f"Error: Query file '{query_file}' not found.")
        return

    # Read the query sequence
    query_sequence = next(SeqIO.parse(query_file, "fasta"))

    # Check if the blast results file exists
    blast_file = os.path.join("blast_results", f"{gene_name}_blast.fasta")
    if not os.path.exists(blast_file):
        print(f"Error: BLAST results file '{blast_file}' not found.")
        return

    # Read BLAST sequences
    blast_sequences = list(SeqIO.parse(blast_file, "fasta"))

    # Select the top 10 BLAST hits
    top_hits = blast_sequences[:10]

    # Add the query to the selected sequences
    sequences = [query_sequence] + top_hits

    # Create the filogenetic_analysis directory if it doesn't exist
    output_dir = "filogenetic_analysis"
    os.makedirs(output_dir, exist_ok=True)

    # Save the selected sequences in a temporary FASTA file
    temp_fasta = os.path.join(output_dir, f"{gene_name}_temp.fasta")
    SeqIO.write(sequences, temp_fasta, "fasta")

    # Perform alignment with ClustalW2
    alignment_file = os.path.join(output_dir, f"{gene_name}_alignment.fasta")
    clustal_command = [
        "clustalw2",
        f"-INFILE={temp_fasta}",
        f"-OUTFILE={alignment_file}",
        "-OUTPUT=FASTA",
        "-QUIET"
    ]
    subprocess.run(clustal_command, check=True)

    # Read the resulting alignment
    alignment = AlignIO.read(alignment_file, "fasta")

    # Compute distance matrix
    calculator = DistanceCalculator('blosum62')
    dm = calculator.get_distance(alignment)

    # Build trees
    constructor = DistanceTreeConstructor()
    upgma_tree = constructor.upgma(dm)
    nj_tree = constructor.nj(dm)

    # Save and plot trees
    for tree, tree_type in [(upgma_tree, 'UPGMA'), (nj_tree, 'NJ')]:
        Phylo.write(tree, os.path.join(output_dir, f'{gene_name}_{tree_type}_tree.nwk'), 'newick')
        fig = plt.figure(figsize=(35, 15))
        Phylo.draw(tree, do_show=False)
        plt.title(f'{gene_name} - {tree_type} Tree', fontsize=30)
        plt.savefig(os.path.join(output_dir, f'{gene_name}_{tree_type}_tree.png'), bbox_inches='tight')
        plt.close()

    print(f"Processing completed for gene {gene_name}. Results saved in '{output_dir}'.")

# Example usage
process_gene_with_clustal("ptsP")


Processing completed for gene ptsP. Results saved in 'filogenetic_analysis'.


<Figure size 3500x1500 with 0 Axes>

<Figure size 3500x1500 with 0 Axes>

#### **1: Gene ptsP**

In [42]:
process_gene_with_clustal("ptsP")

NameError: name 'query_sequence' is not defined

#### **2: Gene ButyrylCoA**

In [7]:
process_gene_with_clustal("butyrylCoA", "blast_results/butyrylCoA_blast.fasta")

Running ClustalW2 for gene butyrylCoA...
Distance Matrix:
G2SYC0  0.000000
B0MC58  0.245211    0.000000
Q0AVM5  0.451849    0.460215    0.000000
    G2SYC0  B0MC58  Q0AVM5
Processing completed for gene butyrylCoA. Results saved in 'filogenetic_analysis'.


#### **3: Gene MutS Domain I**

In [11]:
process_gene_with_clustal("MutS", "blast_results/MutS_blast.fasta")

Running ClustalW2 for gene MutS...


FileNotFoundError: Alignment was not generated for gene MutS.