## **Filogenetic Analysis**

The code performs a multiple sequence alignment of seven sequences, selecting the best alignments of homologous sequences. Then, it processes the calculation of the distance matrix based on the obtained alignment using the 'BLOSUM62' substitution matrix. Finally, it constructs phylogenetic trees using the UPGMA and NJ methods for each gene under analysis.


In [6]:
from Bio import SeqIO, AlignIO, Phylo
from Bio.SeqRecord import SeqRecord
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
import subprocess
import os
import matplotlib.pyplot as plt

def process_gene_with_clustal(gene_name, fasta_file):
    # Create output directory
    output_dir = "filogenetic_analysis"
    os.makedirs(output_dir, exist_ok=True)

    # Read the FASTA file
    sequences = []
    with open(fasta_file) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            sequences.append(record)

    # Check if we have at least 7 sequences
    '''
    if len(sequences) < 7:
        raise ValueError(f"The FASTA file must contain at least 7 sequences. Found: {len(sequences)}")
    '''
    # Save the extracted sequences in a FASTA file
    trimmed_fasta = os.path.join(output_dir, f"{gene_name}_trimmed.fasta")
    SeqIO.write(sequences, trimmed_fasta, "fasta")

    # Path to the ClustalW2 executable
    clustal_exe = "clustalw2"  # Replace with the correct executable path if necessary

    # Define the alignment output file
    alignment_file = os.path.join(output_dir, f"{gene_name}_alignment.fasta")

    # Execute the ClustalW2 command to perform the alignment
    clustal_command = [
        clustal_exe,
        f"/INFILE={trimmed_fasta}",
        f"/OUTFILE={alignment_file}",
        "/OUTPUT=FASTA",
        "/QUIET"
    ]

    print(f"Running ClustalW2 for gene {gene_name}...")
    try:
        subprocess.run(clustal_command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error executing ClustalW2: {e}")
        return

    # Check if the alignment was successfully generated
    if not os.path.exists(alignment_file):
        raise FileNotFoundError(f"Alignment was not generated for gene {gene_name}.")

    # Read the generated alignment
    alignment = AlignIO.read(alignment_file, "fasta")

    # Ensure unique IDs and preserve important information
    '''temos de mudar os parametros aqui para aceitar um de cadz especie'''
    unique_ids = set()
    for i, record in enumerate(alignment):
        # Extract the full original ID
        original_id = record.id
        full_description = record.description

        # Process the description to include everything
        full_id = "_".join(full_description.replace(":", "").split())  # Replace spaces with "_", remove ":"

        '''isto vai ser para tirar'''
        # Ensure uniqueness
        if full_id in unique_ids:
            print(f"Duplicate ID found: {full_id}. Renaming...")
            full_id = f"{full_id}_{i}"  # Add index if necessary

        # Update the record's ID and description
        record.id = full_id
        record.description = full_id
        unique_ids.add(full_id)

    # Compute distance matrix
    calculator = DistanceCalculator('blosum62')
    try:
        dm = calculator.get_distance(alignment)
        print(f"Distance Matrix:\n{dm}")
    except ValueError as e:
        print(f"Error calculating distance matrix: {e}")
        return

    # Build trees
    constructor = DistanceTreeConstructor()
    upgma_tree = constructor.upgma(dm)
    nj_tree = constructor.nj(dm)

    # Remove 'Inner' labels from clades and highlight Homo genus
    for tree in [upgma_tree, nj_tree]:
        for clade in tree.find_clades():
            if 'Inner' in (clade.name or ''):
                clade.name = ""
            if 'Homo' in (clade.name or ''):
                clade.color = 'orange'
                clade.name = '*** ' + clade.name + ' ***'  # Mark the clade

    # Save trees
    Phylo.write(upgma_tree, os.path.join(output_dir, f'{gene_name}_upgma_tree.nwk'), 'newick')
    Phylo.write(nj_tree, os.path.join(output_dir, f'{gene_name}_nj_tree.nwk'), 'newick')

    # Process and plot trees
    for tree, tree_type in [(upgma_tree, 'UPGMA'), (nj_tree, 'NJ')]:
        # Create figure and axes
        fig = plt.figure(figsize=(35, 15))  # Increase figure size
        ax = fig.add_subplot(1, 1, 1)

        # Draw tree on created axes
        Phylo.draw(tree, do_show=False, axes=ax)
        for label in ax.get_xticklabels() + ax.get_yticklabels():
            label.set_fontsize(18)
        for text in ax.findobj(match=plt.Text):  # Find all text objects
            text.set_fontsize(22)

        # Add title
        plt.title(f'{gene_name} - {tree_type} Tree', fontsize=30)

        # Save tree image
        tree_img = os.path.join(output_dir, f'{gene_name}_{tree_type}_tree.png')
        plt.savefig(tree_img, bbox_inches='tight')  # 'bbox_inches' ensures nothing gets cut off
        plt.close()

    print(f"Processing completed for gene {gene_name}. Results saved in '{output_dir}'.")


In [3]:
from Bio.Blast import NCBIXML
from Bio.Align.Applications import ClustalOmegaCommandline
from Bio import Phylo, AlignIO
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
import os

genes = ["ABCB11", "COG7", "EMCN_2", "ITIH5L"]
blast_files = [f"{gene}_blast.xml" for gene in genes]

for gene, blast_file in zip(genes, blast_files):
    # Load BLAST results from XML file
    blast_results = NCBIXML.parse(open(blast_file))

    sequences = []
    id_counts = {}
    for result in blast_results:
        for alignment in result.alignments:
            for hsp in alignment.hsps:
                # Modify the sequence identifier
                sequence_id = alignment.hit_id
                if sequence_id in id_counts:
                    id_counts[sequence_id] += 1
                    sequence_id = f"{sequence_id}_hsps{id_counts[sequence_id]}"
                else:
                    id_counts[sequence_id] = 1

                sequence = f">{sequence_id}\n{hsp.sbjct}\n"
                sequences.append(sequence)

    fasta_file_path = f"{gene}_aligned.fasta"
    with open(fasta_file_path, "w") as fasta_file:
        fasta_file.writelines(sequences)

    # Perform multiple sequence alignment using Clustal Omega
    script_directory = os.path.dirname(os.path.abspath(__file__))
    clustalomega_exe_path = os.path.join(script_directory, "clustal-omega/clustalo.exe")
    
    clustalomega_cline = ClustalOmegaCommandline(clustalomega_exe_path, infile=fasta_file_path, outfile=f"{gene}_aligned.fasta", force=True)
    clustalomega_cline()
    print(f"Alignment for {gene} saved to {gene}_aligned.fasta")

    # Read the aligned sequences
    aligned_file_path = AlignIO.read(f"{gene}_aligned.fasta", "fasta")

    # Calculate distances for the current gene
    calculator = DistanceCalculator("identity")
    dm = calculator.get_distance(aligned_file_path)

    # Build the phylogenetic tree
    constructor = DistanceTreeConstructor()
    tree = constructor.upgma(dm)

    # Save the phylogenetic tree
    tree_file_path = f"{gene}_phylogenetic_tree.newick"
    Phylo.write(tree, tree_file_path, "newick")
    print(f"Phylogenetic tree for {gene} saved to {tree_file_path}")


Due to the on going maintenance burden of keeping command line application
wrappers up to date, we have decided to deprecate and eventually remove these
modules.

We instead now recommend building your command line and invoking it directly
with the subprocess module.


FileNotFoundError: [Errno 2] No such file or directory: 'ABCB11_blast.xml'

#### **1: Gene ptsP**

In [5]:
process_gene_with_clustal("ptsP", "blast_results/ptsP_blast.fasta")

ValueError: The FASTA file must contain at least 7 sequences. Found: 0

#### **2: Gene ButyrylCoA**

In [7]:
process_gene_with_clustal("butyrylCoA", "blast_results/butyrylCoA_blast.fasta")

Running ClustalW2 for gene butyrylCoA...
Distance Matrix:
G2SYC0  0.000000
B0MC58  0.245211    0.000000
Q0AVM5  0.451849    0.460215    0.000000
    G2SYC0  B0MC58  Q0AVM5
Processing completed for gene butyrylCoA. Results saved in 'filogenetic_analysis'.


#### **3: Gene MutS Domain I**

In [11]:
process_gene_with_clustal("MutS", "blast_results/MutS_blast.fasta")

Running ClustalW2 for gene MutS...


FileNotFoundError: Alignment was not generated for gene MutS.