In [34]:
# Imports
import os
import subprocess
from Bio.Seq import Seq
from Bio.motifs import Motif
from Bio import SeqIO, AlignIO
from Bio.SeqRecord import SeqRecord
from collections import defaultdict
from Bio.Align import MultipleSeqAlignment

In [35]:
def write_temp_fasta(records, filepath):
    SeqIO.write(records, filepath, "fasta")

In [36]:
def process_sequences_by_class(input_fasta):
    sequences_by_class = defaultdict(list)
    for record in SeqIO.parse(input_fasta, "fasta"):
        sequence_class = record.description.split('|')[-1]
        sequences_by_class[sequence_class].append(record)
    return sequences_by_class

In [37]:
def align_sequences(muscle_path, input_fasta, output_fasta):
    # Construct the MUSCLE command
    command = f'"{muscle_path}" -in "{input_fasta}" -out "{output_fasta}" -maxiters 100'
    print(f"Executing command: {command}")
    
    try:
        # Execute the command using subprocess.run
        subprocess.run(command, check=True, shell=True, text=True, stderr=subprocess.PIPE)
        print(f"MUSCLE successfully executed for {input_fasta}")
    
    except subprocess.CalledProcessError as e:
        # Handle the error and display the stderr output
        print(f"Error executing MUSCLE for {input_fasta}: {e.stderr}")

In [38]:
def read_alignment(aligned_fasta):
    return AlignIO.read(aligned_fasta, "fasta")

In [39]:
def generate_consensus(align):
    alignment = align.alignment
    # Include all possible characters (including '-' for gaps) in the alphabet
    alphabet = set(''.join(str(record.seq) for record in align))
    motif = Motif(''.join(sorted(alphabet)), alignment)
    consensus = motif.counts.calculate_consensus(identity=0.7)
    return consensus

In [40]:
def generate_consensus_per_class(input_fasta, output_fasta, muscle_path):
    sequences_by_class = process_sequences_by_class(input_fasta)
    print(sequences_by_class)
    consensus_records = []
    for cls, records in sequences_by_class.items():
        temp_fasta = f'temp_{cls}.fasta'
        aligned_fasta = f'{cls}_aligned.fasta'
        
        if not os.path.exists(aligned_fasta):
            write_temp_fasta(records, temp_fasta)
            align_sequences(muscle_path, temp_fasta, aligned_fasta)
            if os.path.exists(temp_fasta):
                os.remove(temp_fasta)

        if os.path.exists(aligned_fasta):
            align = read_alignment(aligned_fasta)
            os.remove(aligned_fasta)
        else:
            print(f"Failed to generate alignment for class {cls}")
            continue

        consensus = generate_consensus(align)
        # Create a Seq object from the consensus string
        consensus_seq = Seq(str(consensus))
        consensus_records.append(SeqRecord(consensus_seq, id=cls, description=""))

    SeqIO.write(consensus_records, output_fasta, "fasta")

In [31]:
# Usage
input_fasta = 'data/fasta/UL33_all_nucleotide_sequences_annotated.fasta'
output_fasta = 'data/fasta/UL33_cluster_consensus.fasta'
muscle_path = './lib/muscle.exe'
generate_consensus_per_class(input_fasta, output_fasta, muscle_path)

defaultdict(<class 'list'>, {'C5': [SeqRecord(seq=Seq('ATGGACACCATCATCCACAACACCACGATCCGCAATACCAGCACCCCGCACGTC...TGA'), id='39.B1.W11.12|C5', name='39.B1.W11.12|C5', description='39.B1.W11.12|C5', dbxrefs=[]), SeqRecord(seq=Seq('ATGGACACCATCATCCACAACACCACGATCCGCAATACCAGCACCCCGCACGTC...TGA'), id='GQ396662.1|C5', name='GQ396662.1|C5', description='GQ396662.1|C5', dbxrefs=[]), SeqRecord(seq=Seq('ATGGACACCATCATCCACAACACCACGATCCGCAATACCAGCACCCCGCACGTC...TGA'), id='02-520-S1a|C5', name='02-520-S1a|C5', description='02-520-S1a|C5', dbxrefs=[]), SeqRecord(seq=Seq('ATGGACACCATCATCCACAACACCACGATCCGCAATACCAGCACCCCGCACGTC...TGA'), id='KP745648.1|C5', name='KP745648.1|C5', description='KP745648.1|C5', dbxrefs=[]), SeqRecord(seq=Seq('ATGGACACCATCATCCACAACACCACGATCCGCAATACCAGCACCCCGCACGTC...TGA'), id='KP745677.1|C5', name='KP745677.1|C5', description='KP745677.1|C5', dbxrefs=[]), SeqRecord(seq=Seq('ATGGACACCATCATCCACAACACCACGATCCGCAATACCAGCACCCCGCACGTC...TGA'), id='KY490070.1|C5', name='KY490070.1|C5'