## **Phylogenetic Analysis**

The code performs a multiple sequence alignment of seven sequences, selecting the best alignments of homologous sequences. It then computes the distance matrix from the resulting alignment using the 'BLOSUM62' substitution matrix. Finally, it constructs phylogenetic trees using the UPGMA and NJ (Neighbor-Joining) methods for each gene under analysis.


In [17]:
from Bio import SeqIO, AlignIO, Phylo
from Bio.SeqRecord import SeqRecord
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
import subprocess
import os
import matplotlib.pyplot as plt
import re

def _relabel_alignment(alignment, input_descriptions):
    """Rename aligned records to species names where possible; return the query label or None."""
    used_labels = set()
    query_id = None  # Label of the record identified as the query, if any

    for record in alignment:
        original_id = record.id
        # Prefer the description from the input FASTA: the aligner's FASTA
        # output is expected to carry only the sequence name, so the bracketed
        # species and any "query" marker would otherwise be lost.
        # NOTE(review): this lookup is by ID and silently falls back if
        # ClustalW2 altered/truncated the name -- confirm on real data.
        full_description = input_descriptions.get(original_id, record.description)

        # Try to extract the species name "[Genus species]" from the description
        species_match = re.search(r"\[(.*?)\]", full_description)
        species = species_match.group(1).strip() if species_match else ""
        if species:
            full_id = species.replace(" ", "_")  # Use the species name
        else:
            full_id = original_id  # Otherwise keep the original ID

        # Avoid duplicate labels: a record whose new label is already taken
        # keeps its original ID (it is NOT removed from the alignment).
        if full_id in used_labels:
            print(f"Duplicate ID found: {full_id}. Skipping...")
            label = original_id
        else:
            record.id = full_id
            record.description = full_id
            label = full_id
        used_labels.add(label)

        # Remember the label this record actually ended up with, so the
        # highlight later targets the query itself and not a namesake.
        if "query" in full_description.lower() or "query" in original_id.lower():
            query_id = label

    return query_id


def process_gene_with_clustal(gene_name, fasta_file):
    """Align a gene's sequences with ClustalW2 and build UPGMA and NJ trees.

    The sequences in ``fasta_file`` are copied to the output directory,
    aligned with the external ``clustalw2`` executable, relabelled with
    species names where the description contains ``[species]``, turned into a
    BLOSUM62 distance matrix, and used to build UPGMA and NJ trees. Both trees
    are written as Newick files and PNG images in ``filogenetic_analysis``.

    Args:
        gene_name: Label used in output file names and plot titles.
        fasta_file: Path to the input FASTA file.

    Returns:
        None. Returns early (after printing a message) if ClustalW2 cannot be
        executed or fails, or if the distance matrix cannot be computed.

    Raises:
        FileNotFoundError: If ``fasta_file`` does not exist, or if ClustalW2
            finished without producing an alignment file.
    """
    # Create output directory
    output_dir = "filogenetic_analysis"
    os.makedirs(output_dir, exist_ok=True)

    # Read the FASTA file
    with open(fasta_file) as handle:
        sequences = list(SeqIO.parse(handle, "fasta"))

    # Keep the full input descriptions so they can be restored after alignment
    input_descriptions = {record.id: record.description for record in sequences}

    # Save the extracted sequences in a FASTA file
    trimmed_fasta = os.path.join(output_dir, f"{gene_name}_trimmed.fasta")
    SeqIO.write(sequences, trimmed_fasta, "fasta")

    # Path to the ClustalW2 executable
    clustal_exe = "clustalw2"  # Replace with the correct executable path if necessary

    # Define the alignment output file
    alignment_file = os.path.join(output_dir, f"{gene_name}_alignment.fasta")

    # Remove any alignment left over from a previous run, so the existence
    # check below reflects this run only and stale data is never reused.
    if os.path.exists(alignment_file):
        os.remove(alignment_file)

    # Execute the ClustalW2 command to perform the alignment
    clustal_command = [
        clustal_exe,
        f"/INFILE={trimmed_fasta}",
        f"/OUTFILE={alignment_file}",
        "/OUTPUT=FASTA",
        "/QUIET",
    ]

    print(f"Running ClustalW2 for gene {gene_name}...")
    try:
        subprocess.run(clustal_command, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing or non-executable clustalw2 binary
        print(f"Error executing ClustalW2: {e}")
        return

    # Check if the alignment was successfully generated
    if not os.path.exists(alignment_file):
        raise FileNotFoundError(f"Alignment was not generated for gene {gene_name}.")

    # Read the generated alignment
    alignment = AlignIO.read(alignment_file, "fasta")

    # Ensure unique IDs and preserve important information
    query_id = _relabel_alignment(alignment, input_descriptions)

    # Report whether the query was found
    if query_id:
        print(f"Query identificada: {query_id}")
    else:
        print("Aviso: A query não foi identificada no alinhamento.")

    # Compute distance matrix
    calculator = DistanceCalculator('blosum62')
    try:
        dm = calculator.get_distance(alignment)
        print(f"Distance Matrix:\n{dm}")
    except ValueError as e:
        print(f"Error calculating distance matrix: {e}")
        return

    # Build trees
    constructor = DistanceTreeConstructor()
    upgma_tree = constructor.upgma(dm)
    nj_tree = constructor.nj(dm)

    # Highlight the query and remove the 'Inner' labels of internal nodes
    for tree in (upgma_tree, nj_tree):
        for clade in tree.find_clades():
            # Only internal nodes are blanked, so a leaf whose name happens
            # to contain "Inner" keeps its label.
            if not clade.is_terminal() and 'Inner' in (clade.name or ''):
                clade.name = ""
            if query_id is not None and clade.name == query_id:
                clade.color = 'red'  # Highlight the query in red
                clade.name = f'*** QUERY: {clade.name} ***'
            elif 'Homo' in (clade.name or ''):
                clade.color = 'orange'
                clade.name = f'*** {clade.name} ***'

    # Save trees (written after decoration, so the Newick labels carry the
    # '***' markers as well)
    Phylo.write(upgma_tree, os.path.join(output_dir, f'{gene_name}_upgma_tree.nwk'), 'newick')
    Phylo.write(nj_tree, os.path.join(output_dir, f'{gene_name}_nj_tree.nwk'), 'newick')

    # Process and plot trees
    for tree, tree_type in [(upgma_tree, 'UPGMA'), (nj_tree, 'NJ')]:
        # Create figure and axes
        fig = plt.figure(figsize=(35, 15))  # Increase figure size
        try:
            ax = fig.add_subplot(1, 1, 1)

            # Draw tree on created axes
            Phylo.draw(tree, do_show=False, axes=ax)

            # Enlarge every text object first, then set the tick labels, so
            # the tick-specific size is not overwritten by the blanket one.
            for text in ax.findobj(match=plt.Text):  # Find all text objects
                text.set_fontsize(22)
            for label in ax.get_xticklabels() + ax.get_yticklabels():
                label.set_fontsize(18)

            # Add title
            ax.set_title(f'{gene_name} - {tree_type} Tree', fontsize=30)

            # Save tree image
            tree_img = os.path.join(output_dir, f'{gene_name}_{tree_type}_tree.png')
            fig.savefig(tree_img, bbox_inches='tight')  # 'bbox_inches' ensures nothing gets cut off
        finally:
            # Always release the figure, even if drawing or saving failed
            plt.close(fig)

    print(f"Processing completed for gene {gene_name}. Results saved in '{output_dir}'.")


#### **1: Gene ptsP**

In [18]:
# Align the ptsP sequences and build their UPGMA and NJ trees.
process_gene_with_clustal(
    gene_name="ptsP",
    fasta_file="blast_results/ptsP_blast.fasta",
)

Running ClustalW2 for gene ptsP...
Aviso: A query não foi identificada no alinhamento.
Distance Matrix:
P23388  0.000000
P45597  0.538190    0.000000
D4GYE2  0.605221    0.621061    0.000000
O83018  0.634316    0.646711    0.603387    0.000000
P42014  0.633088    0.645530    0.603387    0.007593    0.000000
O69251  0.635552    0.648364    0.610526    0.174375    0.172858    0.000000
P08838  0.648463    0.666797    0.619289    0.185897    0.188914    0.171418    0.000000
Q9K8D3  0.622441    0.645694    0.599685    0.248579    0.247063    0.231148    0.248777    0.000000
P51183  0.663574    0.655706    0.627961    0.281191    0.283453    0.275438    0.269591    0.283453    0.000000
Q6GAD0  0.663171    0.652998    0.625243    0.280814    0.283076    0.273947    0.268091    0.280814    0.002563    0.000000
Q99V14  0.663311    0.655079    0.624854    0.284908    0.287166    0.276517    0.268813    0.281521    0.006947    0.004388    0.000000
Q931U2  0.663578    0.654946    0.625243    0.283

#### **2: Gene ButyrylCoA**

In [7]:
# Align the butyrylCoA sequences and build their UPGMA and NJ trees.
process_gene_with_clustal(
    gene_name="butyrylCoA",
    fasta_file="blast_results/butyrylCoA_blast.fasta",
)

Running ClustalW2 for gene butyrylCoA...
Distance Matrix:
G2SYC0  0.000000
B0MC58  0.245211    0.000000
Q0AVM5  0.451849    0.460215    0.000000
    G2SYC0  B0MC58  Q0AVM5
Processing completed for gene butyrylCoA. Results saved in 'filogenetic_analysis'.


#### **3: Gene MutS Domain I**

In [11]:
# Align the MutS sequences and build their UPGMA and NJ trees.
process_gene_with_clustal(
    gene_name="MutS",
    fasta_file="blast_results/MutS_blast.fasta",
)

Running ClustalW2 for gene MutS...


FileNotFoundError: Alignment was not generated for gene MutS.