In [1]:
# Imports
import subprocess
from Bio import SeqIO
from Bio.Seq import Seq

In [2]:
def truncate_aligned_sequences(sample, motif_start="ATGGACACCATCATCCAC", motif_end="GGGTATGA"):
    """Trim every sequence of a sample's FASTA file to the region between two motifs.

    Reads ``data/fasta/<sample>.fasta`` and writes the trimmed records to
    ``data/fasta/<sample>_truncated.fasta``; the input file is not modified.

    Each sequence is cut so that it starts at the first occurrence of
    ``motif_start`` and ends with the first occurrence of ``motif_end`` that is
    found in what remains after the start cut (the end motif itself is kept).
    A motif that is not found leaves that side of the sequence unchanged.

    Matching is a plain, case-sensitive substring search, so a motif that is
    interrupted by gap characters in the input will not be recognised.

    Args:
        sample: Base name of the FASTA file (no directory, no extension).
        motif_start: Motif marking the first bases to keep.
        motif_end: Motif marking the last bases to keep.
    """
    records = list(SeqIO.parse("data/fasta/" + sample + ".fasta", "fasta"))
    for record in records:
        # Replace the record's sequence in place; id/description are preserved.
        record.seq = Seq(_trim_to_motifs(str(record.seq), motif_start, motif_end))
    # Write the trimmed records to a separate "_truncated" FASTA file.
    SeqIO.write(records, "data/fasta/" + sample + "_truncated.fasta", "fasta")


def _trim_to_motifs(sequence, motif_start, motif_end):
    """Return ``sequence`` cut to begin at ``motif_start`` and end with ``motif_end``."""
    start_index = sequence.find(motif_start)
    if start_index != -1:
        sequence = sequence[start_index:]
    # The end motif is searched only after the start cut has been applied.
    end_index = sequence.find(motif_end)
    if end_index != -1:
        sequence = sequence[:end_index + len(motif_end)]
    return sequence

In [3]:
def align_sequences(sample, maxiters=100):
    """Align a sample's truncated sequences with the MUSCLE command-line tool.

    Runs ``muscle`` on ``data/fasta/<sample>_truncated.fasta`` and writes the
    alignment to ``data/fasta/<sample>_aligned.fasta``.

    Args:
        sample: Base name of the FASTA file (no directory, no suffix).
        maxiters: Value passed to MUSCLE's ``-maxiters`` option.

    Raises:
        subprocess.CalledProcessError: If MUSCLE exits with a non-zero status.
        FileNotFoundError: If the ``muscle`` executable cannot be found.
    """
    command = [
        "muscle",
        "-in", "data/fasta/" + sample + "_truncated.fasta",
        "-out", "data/fasta/" + sample + "_aligned.fasta",
        "-maxiters", str(maxiters),
    ]
    # Passing an argument list (and no shell) hands the file names to MUSCLE
    # verbatim, so spaces or shell metacharacters in a sample name cannot
    # change the command that is executed.
    subprocess.run(command, check=True)

In [12]:
def combine_samples(samples):
    """Merge the truncated FASTA files of several samples into one file.

    For each name in ``samples`` the records of
    ``data/fasta/<name>_truncated.fasta`` are appended, in order, to
    ``data/fasta/all_samples.fasta`` (which is overwritten). Only the first
    record seen for a given record id is written; later records carrying an
    id that was already written are skipped.

    Args:
        samples: Iterable of sample base names.
    """
    merged_path = "data/fasta/all_samples.fasta"
    seen_ids = set()
    with open(merged_path, 'w') as merged_handle:
        for sample_name in samples:
            source_path = "".join(("data/fasta/", sample_name, "_truncated.fasta"))
            for entry in SeqIO.parse(source_path, "fasta"):
                # Skip records whose id has already been written.
                if entry.id in seen_ids:
                    continue
                SeqIO.write(entry, merged_handle, "fasta")
                seen_ids.add(entry.id)

In [4]:
# Define list of samples to process
samples = ["04.B1.W14.01_04.M1.W09.02", "05.B1.W14.04_05.M1.W08.03", "27.B1.W13.06_27.M1.W10.07",
           "30.B1.W11.08_30.M1.W04.09", "38.B1.W10.11_38.M1.W03.10", "39.B1.W11.12_39.M1.W03.13_39.M1.W05.14",
           "51.M1.W03.26_51.S1.W05.27", "53.B1.W14.17_53.M1.W07.16", "54.M1.W03.18_54.M1.W05.19",
           "56.B1.W09.22_56.M1.W03.21", "63.B1.W09.29_63.M1.W02.30", "66.B1.W09.25_66.M1.W02.24"]

# For each sample in the list:
for sample in samples:
    # Print the sample being processed
    print("Truncation and alignment of sample:", sample)
    # Truncate the sequences of the sample based on specific motifs.
    # This step produces data/fasta/<sample>_truncated.fasta, which the
    # alignment below reads; running it here keeps the cell reproducible
    # from a fresh kernel instead of relying on files left by an earlier run.
    truncate_aligned_sequences(sample)
    # Align the truncated sequences of the sample using Muscle
    align_sequences(sample)

Truncation and alignment of sample: 04.B1.W14.01_04.M1.W09.02



MUSCLE v3.8.1551 by Robert C. Edgar

http://www.drive5.com/muscle
This software is donated to the public domain.
Please cite: Edgar, R.C. Nucleic Acids Res 32(5), 1792-97.

04.B1.W14.01_04.M1.W09.02_trunc 12 seqs, lengths min 1274, max 1360, avg 1333
00:00:00    16 MB(-1%)  Iter   1  100.00%  K-mer dist pass 1
00:00:00    16 MB(-1%)  Iter   1  100.00%  K-mer dist pass 2
00:00:00    31 MB(-2%)  Iter   1  100.00%  Align node       
00:00:00    31 MB(-2%)  Iter   1  100.00%  Root alignment
00:00:01    31 MB(-2%)  Iter   2  100.00%  Refine tree   
00:00:01    31 MB(-2%)  Iter   2  100.00%  Root alignment
00:00:01    31 MB(-2%)  Iter   2  100.00%  Root alignment
00:00:01    31 MB(-2%)  Iter   3  100.00%  Refine biparts
00:00:02    31 MB(-2%)  Iter   4  100.00%  Refine biparts
00:00:02    31 MB(-2%)  Iter   5  100.00%  Refine biparts
00:00:02    31 MB(-2%)  Iter   5  100.00%  Refine biparts

MUSCLE v3.8.1551 by Robert C. Edgar

http://www.drive5.com/muscle
This software is donated to the pu

Truncation and alignment of sample: 05.B1.W14.04_05.M1.W08.03


00:00:00    28 MB(-2%)  Iter   1  100.00%  Align node
00:00:00    28 MB(-2%)  Iter   1  100.00%  Root alignment
00:00:00    28 MB(-2%)  Iter   2  100.00%  Refine tree   
00:00:00    28 MB(-2%)  Iter   2  100.00%  Root alignment
00:00:00    28 MB(-2%)  Iter   2  100.00%  Root alignment
00:00:01    28 MB(-2%)  Iter   3  100.00%  Refine biparts
00:00:01    28 MB(-2%)  Iter   4  100.00%  Refine biparts

MUSCLE v3.8.1551 by Robert C. Edgar

http://www.drive5.com/muscle
This software is donated to the public domain.
Please cite: Edgar, R.C. Nucleic Acids Res 32(5), 1792-97.

27.B1.W13.06_27.M1.W10.07_trunc 11 seqs, lengths min 1297, max 1360, avg 1333
00:00:00    16 MB(-1%)  Iter   1  100.00%  K-mer dist pass 1
00:00:00    16 MB(-1%)  Iter   1  100.00%  K-mer dist pass 2
00:00:00    27 MB(-2%)  Iter   1   40.00%  Align node       

Truncation and alignment of sample: 27.B1.W13.06_27.M1.W10.07


00:00:00    30 MB(-2%)  Iter   1  100.00%  Align node
00:00:00    30 MB(-2%)  Iter   1  100.00%  Root alignment
00:00:00    30 MB(-2%)  Iter   2  100.00%  Root alignment
00:00:00    31 MB(-2%)  Iter   3  100.00%  Refine biparts
00:00:01    31 MB(-2%)  Iter   4  100.00%  Refine biparts

MUSCLE v3.8.1551 by Robert C. Edgar

http://www.drive5.com/muscle
This software is donated to the public domain.
Please cite: Edgar, R.C. Nucleic Acids Res 32(5), 1792-97.

30.B1.W11.08_30.M1.W04.09_trunc 6 seqs, lengths min 1345, max 1360, avg 1353
00:00:00    16 MB(-1%)  Iter   1  100.00%  K-mer dist pass 1
00:00:00    16 MB(-1%)  Iter   1  100.00%  K-mer dist pass 2
00:00:00    26 MB(-2%)  Iter   1  100.00%  Align node       
00:00:00    26 MB(-2%)  Iter   1  100.00%  Root alignment
00:00:00    26 MB(-2%)  Iter   2  100.00%  Root alignment
00:00:00    26 MB(-2%)  Iter   3   22.22%  Refine biparts

Truncation and alignment of sample: 30.B1.W11.08_30.M1.W04.09


00:00:00    26 MB(-2%)  Iter   3  100.00%  Refine biparts
00:00:00    26 MB(-2%)  Iter   4  100.00%  Refine biparts

MUSCLE v3.8.1551 by Robert C. Edgar

http://www.drive5.com/muscle
This software is donated to the public domain.
Please cite: Edgar, R.C. Nucleic Acids Res 32(5), 1792-97.

38.B1.W10.11_38.M1.W03.10_trunc 10 seqs, lengths min 1338, max 1360, avg 1346
00:00:00    16 MB(-1%)  Iter   1  100.00%  K-mer dist pass 1
00:00:00    16 MB(-1%)  Iter   1  100.00%  K-mer dist pass 2
00:00:00    27 MB(-2%)  Iter   1   55.56%  Align node       

Truncation and alignment of sample: 38.B1.W10.11_38.M1.W03.10


00:00:01    29 MB(-2%)  Iter   1  100.00%  Align node
00:00:01    29 MB(-2%)  Iter   1  100.00%  Root alignment
00:00:01    29 MB(-2%)  Iter   2  100.00%  Refine tree   
00:00:01    29 MB(-2%)  Iter   2  100.00%  Root alignment
00:00:01    29 MB(-2%)  Iter   2  100.00%  Root alignment
00:00:01    29 MB(-2%)  Iter   3  100.00%  Refine biparts

MUSCLE v3.8.1551 by Robert C. Edgar

http://www.drive5.com/muscle
This software is donated to the public domain.
Please cite: Edgar, R.C. Nucleic Acids Res 32(5), 1792-97.

39.B1.W11.12_39.M1.W03.13_39.M1 14 seqs, lengths min 1297, max 1360, avg 1332
00:00:00    16 MB(-1%)  Iter   1  100.00%  K-mer dist pass 1
00:00:00    16 MB(-1%)  Iter   1  100.00%  K-mer dist pass 2
00:00:00    28 MB(-2%)  Iter   1   53.85%  Align node       

Truncation and alignment of sample: 39.B1.W11.12_39.M1.W03.13_39.M1.W05.14


00:00:01    32 MB(-2%)  Iter   1  100.00%  Align node
00:00:01    32 MB(-2%)  Iter   1  100.00%  Root alignment
00:00:01    32 MB(-2%)  Iter   2  100.00%  Root alignment
00:00:01    34 MB(-2%)  Iter   3  100.00%  Refine biparts
00:00:02    34 MB(-2%)  Iter   4  100.00%  Refine biparts
00:00:02    34 MB(-2%)  Iter   5  100.00%  Refine biparts
00:00:02    34 MB(-2%)  Iter   5  100.00%  Refine biparts

MUSCLE v3.8.1551 by Robert C. Edgar

http://www.drive5.com/muscle
This software is donated to the public domain.
Please cite: Edgar, R.C. Nucleic Acids Res 32(5), 1792-97.

51.M1.W03.26_51.S1.W05.27_trunc 9 seqs, lengths min 1281, max 1360, avg 1329
00:00:00    16 MB(-1%)  Iter   1  100.00%  K-mer dist pass 1
00:00:00    16 MB(-1%)  Iter   1  100.00%  K-mer dist pass 2
00:00:00    23 MB(-2%)  Iter   1   25.00%  Align node       

Truncation and alignment of sample: 51.M1.W03.26_51.S1.W05.27


00:00:00    28 MB(-2%)  Iter   1  100.00%  Align node
00:00:00    28 MB(-2%)  Iter   1  100.00%  Root alignment
00:00:00    28 MB(-2%)  Iter   2  100.00%  Root alignment
00:00:00    28 MB(-2%)  Iter   3  100.00%  Refine biparts
00:00:01    29 MB(-2%)  Iter   4  100.00%  Refine biparts
00:00:01    29 MB(-2%)  Iter   5  100.00%  Refine biparts
00:00:01    29 MB(-2%)  Iter   5  100.00%  Refine biparts

MUSCLE v3.8.1551 by Robert C. Edgar

http://www.drive5.com/muscle
This software is donated to the public domain.
Please cite: Edgar, R.C. Nucleic Acids Res 32(5), 1792-97.

53.B1.W14.17_53.M1.W07.16_trunc 9 seqs, lengths min 1331, max 1360, avg 1348
00:00:00    16 MB(-1%)  Iter   1  100.00%  K-mer dist pass 1
00:00:00    16 MB(-1%)  Iter   1  100.00%  K-mer dist pass 2
00:00:00    27 MB(-2%)  Iter   1   75.00%  Align node       

Truncation and alignment of sample: 53.B1.W14.17_53.M1.W07.16


00:00:00    28 MB(-2%)  Iter   1  100.00%  Align node
00:00:00    28 MB(-2%)  Iter   1  100.00%  Root alignment
00:00:00    28 MB(-2%)  Iter   2  100.00%  Refine tree   
00:00:00    28 MB(-2%)  Iter   2  100.00%  Root alignment
00:00:00    28 MB(-2%)  Iter   2  100.00%  Root alignment
00:00:00    28 MB(-2%)  Iter   3  100.00%  Refine biparts
00:00:01    28 MB(-2%)  Iter   4  100.00%  Refine biparts
00:00:01    28 MB(-2%)  Iter   5  100.00%  Refine biparts
00:00:01    28 MB(-2%)  Iter   5  100.00%  Refine biparts


Truncation and alignment of sample: 54.M1.W03.18_54.M1.W05.19



MUSCLE v3.8.1551 by Robert C. Edgar

http://www.drive5.com/muscle
This software is donated to the public domain.
Please cite: Edgar, R.C. Nucleic Acids Res 32(5), 1792-97.

54.M1.W03.18_54.M1.W05.19_trunc 8 seqs, lengths min 1295, max 1360, avg 1333
00:00:00    16 MB(-1%)  Iter   1  100.00%  K-mer dist pass 1
00:00:00    16 MB(-1%)  Iter   1  100.00%  K-mer dist pass 2
00:00:00    27 MB(-2%)  Iter   1  100.00%  Align node       
00:00:00    27 MB(-2%)  Iter   1  100.00%  Root alignment
00:00:00    27 MB(-2%)  Iter   2  100.00%  Root alignment
00:00:00    27 MB(-2%)  Iter   3  100.00%  Refine biparts
00:00:01    28 MB(-2%)  Iter   4  100.00%  Refine biparts
00:00:01    29 MB(-2%)  Iter   5  100.00%  Refine biparts
00:00:01    29 MB(-2%)  Iter   5  100.00%  Refine biparts

MUSCLE v3.8.1551 by Robert C. Edgar

http://www.drive5.com/muscle
This software is donated to the public domain.
Please cite: Edgar, R.C. Nucleic Acids Res 32(5), 1792-97.

56.B1.W09.22_56.M1.W03.21_trunc 9 seqs, leng

Truncation and alignment of sample: 56.B1.W09.22_56.M1.W03.21


00:00:00    28 MB(-2%)  Iter   1  100.00%  Align node
00:00:00    28 MB(-2%)  Iter   1  100.00%  Root alignment
00:00:00    28 MB(-2%)  Iter   2  100.00%  Root alignment
00:00:00    28 MB(-2%)  Iter   3  100.00%  Refine biparts
00:00:00    28 MB(-2%)  Iter   4  100.00%  Refine biparts

MUSCLE v3.8.1551 by Robert C. Edgar

http://www.drive5.com/muscle
This software is donated to the public domain.
Please cite: Edgar, R.C. Nucleic Acids Res 32(5), 1792-97.

63.B1.W09.29_63.M1.W02.30_trunc 8 seqs, lengths min 1347, max 1360, avg 1355
00:00:00    16 MB(-1%)  Iter   1  100.00%  K-mer dist pass 1
00:00:00    16 MB(-1%)  Iter   1  100.00%  K-mer dist pass 2
00:00:01    27 MB(-2%)  Iter   1  100.00%  Align node       

Truncation and alignment of sample: 63.B1.W09.29_63.M1.W02.30


00:00:01    28 MB(-2%)  Iter   1  100.00%  Align node
00:00:01    28 MB(-2%)  Iter   1  100.00%  Root alignment
00:00:01    28 MB(-2%)  Iter   2  100.00%  Refine tree   
00:00:01    28 MB(-2%)  Iter   2  100.00%  Root alignment
00:00:01    28 MB(-2%)  Iter   2  100.00%  Root alignment
00:00:01    28 MB(-2%)  Iter   3  100.00%  Refine biparts
00:00:01    28 MB(-2%)  Iter   4  100.00%  Refine biparts

MUSCLE v3.8.1551 by Robert C. Edgar

http://www.drive5.com/muscle
This software is donated to the public domain.
Please cite: Edgar, R.C. Nucleic Acids Res 32(5), 1792-97.

66.B1.W09.25_66.M1.W02.24_trunc 9 seqs, lengths min 1290, max 1360, avg 1338
00:00:00    16 MB(-1%)  Iter   1  100.00%  K-mer dist pass 1
00:00:00    16 MB(-1%)  Iter   1  100.00%  K-mer dist pass 2
00:00:00    17 MB(-1%)  Iter   1   12.50%  Align node       

Truncation and alignment of sample: 66.B1.W09.25_66.M1.W02.24


00:00:00    28 MB(-2%)  Iter   1  100.00%  Align node
00:00:00    28 MB(-2%)  Iter   1  100.00%  Root alignment
00:00:00    28 MB(-2%)  Iter   2  100.00%  Root alignment
00:00:01    29 MB(-2%)  Iter   3  100.00%  Refine biparts
00:00:01    29 MB(-2%)  Iter   4  100.00%  Refine biparts
00:00:01    29 MB(-2%)  Iter   5  100.00%  Refine biparts
00:00:01    29 MB(-2%)  Iter   5  100.00%  Refine biparts


In [18]:
# Pooled analysis across all samples. The three steps must run in this order
# because each one reads the file written by the previous one.
# 1) Merge every per-sample *_truncated.fasta into data/fasta/all_samples.fasta
#    (records with an already-seen id are skipped).
combine_samples(samples)
# 2) Apply the motif truncation to the merged file, producing
#    data/fasta/all_samples_truncated.fasta.
truncate_aligned_sequences("all_samples")
# 3) Align the merged, truncated sequences with Muscle, producing
#    data/fasta/all_samples_aligned.fasta.
align_sequences("all_samples")


MUSCLE v3.8.1551 by Robert C. Edgar

http://www.drive5.com/muscle
This software is donated to the public domain.
Please cite: Edgar, R.C. Nucleic Acids Res 32(5), 1792-97.

all_samples_truncated 92 seqs, lengths min 1270, max 1360, avg 1332
00:00:00    16 MB(-1%)  Iter   1  100.00%  K-mer dist pass 1
00:00:00    16 MB(-1%)  Iter   1  100.00%  K-mer dist pass 2
00:00:02    95 MB(-6%)  Iter   1  100.00%  Align node       
00:00:02    95 MB(-6%)  Iter   1  100.00%  Root alignment
00:00:04    95 MB(-6%)  Iter   2  100.00%  Refine tree   
00:00:04    95 MB(-6%)  Iter   2  100.00%  Root alignment
00:00:04    95 MB(-6%)  Iter   2  100.00%  Root alignment
00:00:10    95 MB(-6%)  Iter   3  100.00%  Refine biparts
00:00:16    95 MB(-6%)  Iter   4  100.00%  Refine biparts
00:00:16    95 MB(-6%)  Iter   5  100.00%  Refine biparts
00:00:16    95 MB(-6%)  Iter   5  100.00%  Refine biparts
