## **Phylogenetic Analysis**

The code performs a multiple sequence alignment (with ClustalW2) of the query sequence and its homologous sequences from the BLAST results, keeping one representative sequence per genus. It then computes the distance matrix from the resulting alignment using the 'BLOSUM62' substitution matrix. Finally, it constructs phylogenetic trees using the UPGMA and NJ (neighbor-joining) methods for each gene under analysis.


In [2]:
from Bio import SeqIO, AlignIO, Phylo
from Bio.SeqRecord import SeqRecord
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
import os
import subprocess
import matplotlib.pyplot as plt

def _read_genus_representatives(blast_fasta_file):
    """Return the first BLAST hit of each genus, relabelled by species name.

    The text before the first "|" of each FASTA header is treated as the
    species name (spaces replaced by underscores); its first "_"-separated
    token is treated as the genus. Only the first record seen for a genus
    is kept, in file order.
    """
    representatives = []
    seen_genera = set()
    for seq in SeqIO.parse(blast_fasta_file, "fasta"):
        species_name = seq.description.split("|")[0].strip().replace(" ", "_")
        genus_name = species_name.split("_")[0]
        if genus_name in seen_genera:
            continue
        seen_genera.add(genus_name)
        seq.id = species_name  # Species name becomes the label used downstream
        seq.name = ""
        seq.description = ""  # Avoid repeating the header after the ID
        representatives.append(seq)
    return representatives


def _highlight_query(tree):
    """Blank the 'Inner*' clade names and mark the QUERY clade in *tree*."""
    for clade in tree.find_clades():
        if clade.name and clade.name.startswith("Inner"):
            clade.name = None  # Internal node labels only clutter the plot
        elif clade.name == "QUERY":
            clade.name = "*** QUERY ***"
            clade.color = "pink"


def _save_tree(tree, tree_type, gene_name, output_dir):
    """Write *tree* as a Newick file and as a PNG plot inside *output_dir*."""
    tree_file = os.path.join(output_dir, f"{gene_name}_{tree_type}_tree.nwk")
    Phylo.write(tree, tree_file, "newick")

    fig = plt.figure(figsize=(30, 10))
    try:
        ax = fig.add_subplot(1, 1, 1)
        Phylo.draw(tree, do_show=False, axes=ax)
        ax.set_title(f"{gene_name} - {tree_type} Tree", fontsize=16)
        tree_img = os.path.join(output_dir, f"{gene_name}_{tree_type}_tree.png")
        fig.savefig(tree_img, bbox_inches='tight')
    finally:
        # Close this specific figure even if drawing/saving fails, so
        # repeated calls do not accumulate open figures.
        plt.close(fig)


def process_gene_with_clustal(gene_name, max_sequences=None):
    """Align a gene with its BLAST hits and build UPGMA and NJ trees.

    Reads ``genes/<gene_name>.fasta`` (the query) and
    ``blast_results/<gene_name>_blast.fasta`` (the hits), keeps one hit per
    genus, aligns everything with the ``clustalw2`` executable, computes a
    BLOSUM62 distance matrix and writes, under ``phylogenetic_results``:
    the input FASTA, the alignment, and a Newick file plus PNG plot for
    each tree. Progress and errors are reported with ``print``.

    Args:
        gene_name: Base name of the gene files (without extension).
        max_sequences: Optional upper bound on the number of sequences to
            align, counting the query. ``None`` (the default) keeps every
            genus representative.

    Returns:
        None. The function also returns early (after printing a message)
        when an input file is missing, when ClustalW2 cannot be run or
        fails, or when the distance matrix cannot be computed.

    Raises:
        ValueError: If ``max_sequences`` is given and is smaller than 1.
        FileNotFoundError: If ClustalW2 ran without reporting an error but
            produced no alignment file.
    """
    if max_sequences is not None and max_sequences < 1:
        raise ValueError("max_sequences must be a positive integer or None.")

    # Directories
    genes_dir = "genes"
    blast_results_dir = "blast_results"
    output_dir = "phylogenetic_results"
    os.makedirs(output_dir, exist_ok=True)

    # File paths
    gene_fasta_file = os.path.join(genes_dir, f"{gene_name}.fasta")
    blast_fasta_file = os.path.join(blast_results_dir, f"{gene_name}_blast.fasta")
    if not os.path.exists(gene_fasta_file) or not os.path.exists(blast_fasta_file):
        print(f"Query file or BLAST results not found for {gene_name}.")
        return

    # Read the query sequence and give it a fixed, recognisable label
    query_sequence = SeqIO.read(gene_fasta_file, "fasta")
    query_sequence.id = "QUERY"
    query_sequence.description = "QUERY"

    # One BLAST hit per genus, optionally capped (the query takes one slot)
    blast_sequences = _read_genus_representatives(blast_fasta_file)
    if max_sequences is not None:
        blast_sequences = blast_sequences[:max_sequences - 1]
    all_sequences = [query_sequence] + blast_sequences

    # Save the selected sequences to a FASTA file
    trimmed_fasta = os.path.join(output_dir, f"{gene_name}_trimmed.fasta")
    SeqIO.write(all_sequences, trimmed_fasta, "fasta")

    # Path to the ClustalW2 executable
    clustal_exe = "clustalw2"

    # Path for the alignment file
    alignment_file = os.path.join(output_dir, f"{gene_name}_alignment.fasta")

    # Drop any alignment left by a previous run; otherwise the existence
    # check below could accept stale results if ClustalW2 writes nothing.
    if os.path.exists(alignment_file):
        os.remove(alignment_file)

    # Run ClustalW2
    clustal_command = [
        clustal_exe,
        f"/INFILE={trimmed_fasta}",
        f"/OUTFILE={alignment_file}",
        "/OUTPUT=FASTA",
        "/QUIET"
    ]

    print(f"Running ClustalW2 for {gene_name}...")
    try:
        subprocess.run(clustal_command, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers the executable being missing or not runnable.
        print(f"Error running ClustalW2: {e}")
        return

    # Check if the alignment was generated
    if not os.path.exists(alignment_file):
        raise FileNotFoundError(f"Alignment not generated for {gene_name}.")

    # Read the alignment
    alignment = AlignIO.read(alignment_file, "fasta")

    # Calculate the distance matrix
    calculator = DistanceCalculator('blosum62')
    try:
        dm = calculator.get_distance(alignment)
    except ValueError as e:
        print(f"Error calculating distance matrix: {e}")
        return
    print(f"Distance Matrix:\n{dm}")

    # Construct phylogenetic trees
    constructor = DistanceTreeConstructor()
    trees = [(constructor.upgma(dm), 'UPGMA'), (constructor.nj(dm), 'NJ')]

    # Highlight the query, then save each tree as Newick and as an image
    for tree, tree_type in trees:
        _highlight_query(tree)
    for tree, tree_type in trees:
        _save_tree(tree, tree_type, gene_name, output_dir)

    print(f"Processing completed for {gene_name}. Results saved in '{output_dir}'.")

#### **1: Gene ptsP**

In [3]:
# Run the alignment, distance-matrix and UPGMA/NJ tree pipeline for the ptsP gene.
process_gene_with_clustal("ptsP")

Running ClustalW2 for ptsP...
Distance Matrix:
Bacillus_sp._S  0.000000
Geobacillus_stearothermophilus  0.007593    0.000000
Priestia_megaterium 0.174375    0.172858    0.000000
Halalkalibacterium_halodurans_C-125 0.248579    0.247063    0.231148    0.000000
Staphylococcus_carnosus_subsp._carnosus_TM300   0.290614    0.293630    0.275849    0.295515    0.000000
Streptococcus_equinus   0.311027    0.309243    0.321308    0.333966    0.317208    0.000000
Lactococcus_lactis_subsp._lactis_Il1403 0.326894    0.324242    0.321710    0.353030    0.343796    0.163363    0.000000
Enterococcus_faecalis_V583  0.305608    0.304855    0.302534    0.322921    0.307834    0.205700    0.216196    0.000000
Latilactobacillus_sakei 0.321334    0.317923    0.309077    0.333839    0.326160    0.291388    0.280624    0.236511    0.000000
Listeria_innocua_Clip11262  0.263736    0.266768    0.249906    0.285335    0.273469    0.296724    0.302455    0.265556    0.270330    0.000000
Lysinibacillus_sphaericus  

#### **2: Gene ButyrylCoA**

In [4]:
# Run the alignment, distance-matrix and UPGMA/NJ tree pipeline for the butyrylCoA gene.
process_gene_with_clustal("butyrylCoA")

Running ClustalW2 for butyrylCoA...
Distance Matrix:
QUERY   0.000000
Roseburia_hominis_A2-183    0.249363    0.000000
Anaerostipes_caccae_L1-92   0.257350    0.245211    0.000000
Syntrophomonas_wolfei_subsp._wolfei_str._Goettingen_G311    0.477721    0.451849    0.460215    0.000000
Clostridium_kluyveri_DSM_555    0.627855    0.622103    0.635468    0.637799    0.000000
Fasciola_hepatica   0.662234    0.671414    0.661874    0.699947    0.491096    0.000000
    QUERY   Roseburia_hominis_A2-183    Anaerostipes_caccae_L1-92   Syntrophomonas_wolfei_subsp._wolfei_str._Goettingen_G311    Clostridium_kluyveri_DSM_555    Fasciola_hepatica
Processing completed for butyrylCoA. Results saved in 'phylogenetic_results'.


#### **3: Gene MutS Domain I**

In [5]:
# Run the alignment, distance-matrix and UPGMA/NJ tree pipeline for the MutS gene.
process_gene_with_clustal("MutS")

Running ClustalW2 for MutS...
Distance Matrix:
Streptomyces_albus_G    0.000000
QUERY   0.773655    0.000000
Paramecium_bursaria_Chlorella_virus_NC1A    0.837165    0.951747    0.000000
Escherichia_phage_P1    1.070342    0.871604    1.121784    0.000000
    Streptomyces_albus_G    QUERY   Paramecium_bursaria_Chlorella_virus_NC1A    Escherichia_phage_P1
Processing completed for MutS. Results saved in 'phylogenetic_results'.
