## **Phylogenetic Analysis**

The code performs a multiple sequence alignment (with ClustalW2) of the homologous sequences selected for each gene; at least seven sequences are required. It then computes a distance matrix from the resulting alignment using the 'BLOSUM62' substitution matrix. Finally, it constructs phylogenetic trees with the UPGMA and NJ (neighbor-joining) methods for each gene under analysis.


In [8]:
from Bio import SeqIO, AlignIO, Phylo
from Bio.SeqRecord import SeqRecord
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
import subprocess
import os
import matplotlib.pyplot as plt

def _unique_label(label, index, taken):
    """Return *label*, or a suffixed variant of it that is not in *taken*.

    The first fallback is ``f"{label}_{index}"``. If that name is already
    taken as well, the numeric suffix is incremented until a free name is
    found, so the returned value is never a member of *taken*.
    """
    if label not in taken:
        return label

    print(f"Duplicate ID found: {label}. Renaming...")
    suffix = index
    candidate = f"{label}_{suffix}"
    while candidate in taken:
        suffix += 1
        candidate = f"{label}_{suffix}"
    return candidate


def process_gene_with_clustal(gene_name, fasta_file):
    """Align one gene's sequences with ClustalW2 and build UPGMA and NJ trees.

    Steps, all writing into the "Resultados da Análise Filogenética"
    directory (created if missing):

    1. Read every record of ``fasta_file`` and re-write them to
       ``<gene_name>_trimmed.fasta``.
    2. Run the external ``clustalw2`` executable to produce
       ``<gene_name>_alignment.fasta``.
    3. Rewrite each aligned record's ID from its full description and make
       the IDs unique.
    4. Compute a 'blosum62' distance matrix and print it.
    5. Build UPGMA and NJ trees, blank the names of internal nodes, mark
       clades whose name contains "Homo", and save each tree as Newick and
       as a PNG image.

    Args:
        gene_name: Label used in output file names, messages and plot titles.
        fasta_file: Path to the input FASTA file.

    Returns:
        None. The function also returns early (after printing a message)
        when ClustalW2 exits with a non-zero status or when the distance
        calculation raises ``ValueError``.

    Raises:
        ValueError: If the FASTA file holds fewer than 7 sequences.
        FileNotFoundError: If no alignment file exists after ClustalW2 ran.
    """
    # Create output directory
    output_dir = "Resultados da Análise Filogenética"
    os.makedirs(output_dir, exist_ok=True)

    # Read the FASTA file
    with open(fasta_file) as handle:
        sequences = list(SeqIO.parse(handle, "fasta"))

    # Check if we have at least 7 sequences
    if len(sequences) < 7:
        raise ValueError(f"The FASTA file must contain at least 7 sequences. Found: {len(sequences)}")

    # Save the sequences in a FASTA file that serves as ClustalW2 input
    trimmed_fasta = os.path.join(output_dir, f"{gene_name}_trimmed.fasta")
    SeqIO.write(sequences, trimmed_fasta, "fasta")

    # Path to the ClustalW2 executable
    clustal_exe = "clustalw2"  # Replace with the correct executable path if necessary

    # Define the alignment output file
    alignment_file = os.path.join(output_dir, f"{gene_name}_alignment.fasta")

    # Remove any alignment left over from a previous run. Otherwise the
    # existence check below could pass on stale data when this run produces
    # no output, and the trees would silently be built from the old file.
    if os.path.exists(alignment_file):
        os.remove(alignment_file)

    # Execute the ClustalW2 command to perform the alignment.
    # NOTE(review): the options use the '/' prefix; ClustalW2 on Unix-like
    # systems may expect '-' instead — confirm for the target platform.
    clustal_command = [
        clustal_exe,
        f"/INFILE={trimmed_fasta}",
        f"/OUTFILE={alignment_file}",
        "/OUTPUT=FASTA",
        "/QUIET",
    ]

    print(f"Running ClustalW2 for gene {gene_name}...")
    try:
        subprocess.run(clustal_command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error executing ClustalW2: {e}")
        return

    # Check if the alignment was successfully generated
    if not os.path.exists(alignment_file):
        raise FileNotFoundError(f"Alignment was not generated for gene {gene_name}.")

    # Read the generated alignment
    alignment = AlignIO.read(alignment_file, "fasta")

    # Ensure unique IDs and preserve important information
    unique_ids = set()
    for i, record in enumerate(alignment):
        # Build the ID from the whole description: drop ':' and join the
        # whitespace-separated words with '_'
        base_id = "_".join(record.description.replace(":", "").split())

        # A renamed ID is re-checked against the IDs already used, so two
        # records can never end up with the same label
        full_id = _unique_label(base_id, i, unique_ids)

        # Update the record's ID and description
        record.id = full_id
        record.description = full_id
        unique_ids.add(full_id)

    # Compute distance matrix
    calculator = DistanceCalculator('blosum62')
    try:
        dm = calculator.get_distance(alignment)
        print(f"Distance Matrix:\n{dm}")
    except ValueError as e:
        print(f"Error calculating distance matrix: {e}")
        return

    # Build trees
    constructor = DistanceTreeConstructor()
    upgma_tree = constructor.upgma(dm)
    nj_tree = constructor.nj(dm)

    # Remove 'Inner' labels from internal clades and highlight Homo genus.
    # The names are changed before the trees are written, so the saved
    # Newick files carry the '*** ... ***' markers too.
    for tree in [upgma_tree, nj_tree]:
        for clade in tree.find_clades():
            # Only internal nodes are blanked; a leaf keeps its label even
            # if that label happens to contain the text 'Inner'
            if not clade.is_terminal() and 'Inner' in (clade.name or ''):
                clade.name = ""
            if 'Homo' in (clade.name or ''):
                clade.color = 'orange'
                clade.name = '*** ' + clade.name + ' ***'  # Mark the clade

    # Save trees
    Phylo.write(upgma_tree, os.path.join(output_dir, f'{gene_name}_upgma_tree.nwk'), 'newick')
    Phylo.write(nj_tree, os.path.join(output_dir, f'{gene_name}_nj_tree.nwk'), 'newick')

    # Process and plot trees
    for tree, tree_type in [(upgma_tree, 'UPGMA'), (nj_tree, 'NJ')]:
        # Create figure and axes
        fig = plt.figure(figsize=(35, 15))  # Increase figure size
        ax = fig.add_subplot(1, 1, 1)

        # Draw tree on created axes
        Phylo.draw(tree, do_show=False, axes=ax)
        for label in ax.get_xticklabels() + ax.get_yticklabels():
            label.set_fontsize(18)
        # NOTE(review): this pass resizes every Text object found under the
        # axes, which likely includes the tick labels set just above —
        # confirm which size is intended for them.
        for text in ax.findobj(match=plt.Text):  # Find all text objects
            text.set_fontsize(22)

        # Add title (set after the resize pass so it keeps its own size)
        ax.set_title(f'{gene_name} - {tree_type} Tree', fontsize=30)

        # Save tree image; operate on this figure explicitly instead of
        # relying on pyplot's implicit "current figure"
        tree_img = os.path.join(output_dir, f'{gene_name}_{tree_type}_tree.png')
        fig.savefig(tree_img, bbox_inches='tight')  # 'bbox_inches' ensures nothing gets cut off
        plt.close(fig)

    print(f"Processing completed for gene {gene_name}. Results saved in '{output_dir}'.")


#### **1: Gene ptsP**

In [9]:
# Run the alignment, distance-matrix and tree-building steps on the ptsP FASTA file.
process_gene_with_clustal(
    gene_name="ptsP",
    fasta_file="blast_results/ptsP_blast.fasta",
)

Running ClustalW2 for gene ptsP...
Distance Matrix:
Faecalibacterium_prausnitzii    0.000000
Faecalibacterium_prausnitzii_1  0.001080    0.000000
Faecalibacterium_prausnitzii_2  0.001080    0.002159    0.000000
Faecalibacterium_prausnitzii_3  0.001439    0.002519    0.002519    0.000000
Faecalibacterium_prausnitzii_4  0.002157    0.003236    0.003236    0.003596    0.000000
Faecalibacterium_sp.    0.001439    0.000360    0.002519    0.002879    0.003596    0.000000
Faecalibacterium_prausnitzii_6  0.001799    0.002879    0.002879    0.003239    0.003955    0.003239    0.000000
    Faecalibacterium_prausnitzii    Faecalibacterium_prausnitzii_1  Faecalibacterium_prausnitzii_2  Faecalibacterium_prausnitzii_3  Faecalibacterium_prausnitzii_4  Faecalibacterium_sp.    Faecalibacterium_prausnitzii_6
Processing completed for gene ptsP. Results saved in 'Resultados da Análise Filogenética'.


#### **2: Gene ButyrylCoA**

In [10]:
# Run the alignment, distance-matrix and tree-building steps on the butyrylCoA FASTA file.
process_gene_with_clustal(
    gene_name="butyrylCoA",
    fasta_file="blast_results/butyrylCoA_blast.fasta",
)

Running ClustalW2 for gene butyrylCoA...
Distance Matrix:
Faecalibacterium    0.000000
Faecalibacterium_1  0.002124    0.000000
Faecalibacterium_prausnitzii    0.002123    0.004246    0.000000
Faecalibacterium_prausnitzii_M21/2  0.000000    0.002124    0.002123    0.000000
Faecalibacterium_sp.    0.003398    0.001700    0.005520    0.003398    0.000000
Faecalibacterium_prausnitzii_5  0.003398    0.001276    0.005520    0.003398    0.002975    0.000000
Faecalibacterium_prausnitzii_6  0.003398    0.001276    0.005520    0.003398    0.002975    0.002551    0.000000
    Faecalibacterium    Faecalibacterium_1  Faecalibacterium_prausnitzii    Faecalibacterium_prausnitzii_M21/2  Faecalibacterium_sp.    Faecalibacterium_prausnitzii_5  Faecalibacterium_prausnitzii_6
Processing completed for gene butyrylCoA. Results saved in 'Resultados da Análise Filogenética'.


#### **3: Gene MutS Domain I**

In [11]:
# Run the alignment, distance-matrix and tree-building steps on the MutS FASTA file.
process_gene_with_clustal(
    gene_name="MutS",
    fasta_file="blast_results/MutS_blast.fasta",
)

Running ClustalW2 for gene MutS...


FileNotFoundError: Alignment was not generated for gene MutS.