In [11]:
# Import
from Bio import SeqIO
from Bio.Seq import Seq

In [18]:
# Individual samples to process, grouped by family.
# The first two characters of each name identify the family.
samples = [
    # family 04
    "04.B1.W14.01",
    "04.M1.W09.02",
    # family 05
    "05.B1.W14.04",
    "05.M1.W08.03",
    # family 27
    "27.B1.W13.06",
    "27.M1.W10.07",
    # family 30
    "30.B1.W11.08",
    "30.M1.W04.09",
    # family 38
    "38.B1.W10.11",
    "38.M1.W03.10",
    # family 39
    "39.B1.W11.12",
    "39.M1.W03.13",
    "39.M1.W05.14",
    # family 53
    "53.B1.W14.17",
    "53.M1.W07.16",
    # family 56
    "56.B1.W09.22",
    "56.M1.W03.21",
    # family 63
    "63.B1.W09.29",
    "63.M1.W02.30",
    # family 66
    "66.B1.W09.25",
    "66.M1.W02.24",
]

In [19]:
def get_haplotype_frequency(sample, data_dir="data"):
    """Read the haplotype frequencies of one sample from its HaROLD step-2 log.

    The log is read from ``<data_dir>/<sample>/HaROLD/step_2/<sample>.log``.
    Parsing starts on the line after the first line containing the text
    "Haplotype frequencies" and stops at the first blank line or the first
    line whose first non-blank character is not a digit.

    Parameters
    ----------
    sample : str
        Sample name; used both as directory name and as log file base name.
    data_dir : str, optional
        Root directory holding one sub-directory per sample (default "data").

    Returns
    -------
    dict
        Maps ``"<sample>_H<key>"`` to the frequency in percent, rounded to one
        decimal, where ``<key>`` is the first whitespace-separated token of a
        data line and the second token is the frequency as a fraction
        (a decimal comma is accepted). Empty if the section is not found.

    Raises
    ------
    FileNotFoundError
        If the log file does not exist.
    ValueError
        If a data line does not consist of exactly two tokens or the second
        token is not a number.
    """
    log_path = data_dir + "/" + sample + "/HaROLD/step_2/" + sample + ".log"

    haplotype_frequencies = {}
    # True once the "Haplotype frequencies" header line has been seen
    is_in_haplotype_section = False

    with open(log_path, "r") as file:
        # Iterate lazily; the whole log does not need to be held in memory
        for line in file:
            stripped_line = line.strip()

            # The header line itself carries no data
            if 'Haplotype frequencies' in stripped_line:
                is_in_haplotype_section = True
                continue

            # Ignore everything before the section starts
            if not is_in_haplotype_section:
                continue

            # A blank line or a line not starting with a digit ends the section
            if stripped_line == "" or not stripped_line[0].isdigit():
                break

            key, value = stripped_line.split()
            # Accept a decimal comma, convert the fraction to a percentage
            haplotype_frequencies[sample + "_H" + str(key)] = round((float(value.replace(',', '.')) * 100), 1)

    return haplotype_frequencies

In [20]:
# Haplotype frequencies across all samples, keyed by "<sample>_H<key>"
# (the keys returned by get_haplotype_frequency)
haplotype_frequencies = {}

# Merge the frequencies of each sample into the combined dictionary
for sample in samples:
    # update() merges in place instead of rebuilding the whole dictionary on
    # every iteration; on a key clash the later sample wins, as before
    haplotype_frequencies.update(get_haplotype_frequency(sample))

In [24]:
# Haplotypes are kept only if their frequency (in percent) is strictly above this value
MIN_HAPLOTYPE_FREQUENCY = 1


def _read_fasta_records(path):
    """Return every record of the FASTA file at `path` as an (id, sequence string) tuple."""
    return [(record.id, str(record.seq)) for record in SeqIO.parse(path, "fasta")]


# Family identifiers: the unique two-character prefixes of the sample names, sorted.
# NOTE: this cell expects `samples` to hold the individual sample names.
families = sorted({sample[:2] for sample in samples})

# Load the reference sequences that are appended to every family file
ad169_data = _read_fasta_records("data/fasta/reference/HCMV_AD169_UL33.fasta")
merlin_data = _read_fasta_records("data/fasta/reference/HCMV_Merlin_UL33.fasta")
toledo_data = _read_fasta_records("data/fasta/reference/HCMV_Toledo_UL33.fasta")
towne_data = _read_fasta_records("data/fasta/reference/HCMV_Towne_UL33.fasta")
reference_data = ad169_data + merlin_data + toledo_data + towne_data

# Write one FASTA file per family
for family in families:
    # Samples of the current family, in the order they appear in `samples`
    family_samples = [sample for sample in samples if sample.startswith(family)]

    # All (id, sequence) tuples written for the current family
    all_data = []
    for sample in family_samples:
        fasta_path = "data/" + sample + "/HaROLD/step_2/" + sample + ".fasta"
        for record in SeqIO.parse(fasta_path, "fasta"):
            # Record IDs are looked up directly in haplotype_frequencies;
            # an ID without a frequency entry raises KeyError
            if haplotype_frequencies[record.id] > MIN_HAPLOTYPE_FREQUENCY:
                all_data.append((record.id, str(record.seq)))

    # Append the AD169, Merlin, Toledo and Towne reference sequences
    all_data.extend(reference_data)

    # The output name is the family's sample names joined by "_" (no trailing
    # separator), so the printed name matches the file that is written
    output_file_name = "_".join(family_samples)

    # newline='\r\n' makes every "\n" written below end up as CRLF on disk
    with open("data/fasta/initial/" + output_file_name + ".fasta", "w", newline='\r\n') as f_out:
        for seq_id, seq in all_data:
            f_out.write(f">{seq_id}\n{seq}\n")
    print(output_file_name, "successfully generated.")

04.B1.W14.01_04.M1.W09.02_ successfully generated.
05.B1.W14.04_05.M1.W08.03_ successfully generated.
27.B1.W13.06_27.M1.W10.07_ successfully generated.
30.B1.W11.08_30.M1.W04.09_ successfully generated.
38.B1.W10.11_38.M1.W03.10_ successfully generated.
39.B1.W11.12_39.M1.W03.13_39.M1.W05.14_ successfully generated.
53.B1.W14.17_53.M1.W07.16_ successfully generated.
56.B1.W09.22_56.M1.W03.21_ successfully generated.
63.B1.W09.29_63.M1.W02.30_ successfully generated.
66.B1.W09.25_66.M1.W02.24_ successfully generated.


In [25]:
def truncate_sequences(sample, motif_start, motif_end, max_missing_pct=5):
    """Trim every sequence of a FASTA file to the region between two motifs.

    Reads ``data/fasta/initial/<sample>.fasta``, trims each record, drops
    records with too many gap characters and writes the remaining records to
    ``data/fasta/truncated/<sample>_truncated.fasta``.

    Trimming per record:
      * everything before the first occurrence of `motif_start` is removed;
      * everything after the first occurrence of `motif_end` found in the
        remaining sequence is removed (the motif itself is kept).
    A motif that is not found leaves that end of the sequence untouched.
    Matching is an exact, case-sensitive substring search.

    Parameters
    ----------
    sample : str
        Base name of the input FASTA file (without ".fasta").
    motif_start : str
        Motif marking the first position to keep.
    motif_end : str
        Motif marking the last positions to keep.
    max_missing_pct : float, optional
        Maximum percentage of '-' characters allowed in a trimmed sequence
        (default 5). Records above this value are dropped and reported.

    Returns
    -------
    None
    """
    truncated_sequences = []
    for record in SeqIO.parse("data/fasta/initial/" + sample + ".fasta", "fasta"):
        sequence = str(record.seq)

        # Drop everything before the start motif, if present
        motif_start_index = sequence.find(motif_start)
        if motif_start_index != -1:
            sequence = sequence[motif_start_index:]

        # Keep up to and including the first end motif, if present
        # NOTE(review): this uses the first occurrence after the start motif;
        # confirm the end motif cannot occur earlier inside the region
        motif_end_index = sequence.find(motif_end)
        if motif_end_index != -1:
            sequence = sequence[:motif_end_index + len(motif_end)]

        record.seq = Seq(sequence)

        # Percentage of gap characters; an empty sequence counts as fully
        # missing instead of raising ZeroDivisionError
        if sequence:
            missing_percentage = (sequence.count('-') / len(sequence)) * 100
        else:
            missing_percentage = 100.0

        # Compare the unrounded value so that e.g. 5.004 % is not accepted as
        # "5 % or less"; rounding is applied for display only
        if missing_percentage <= max_missing_pct:
            truncated_sequences.append(record)
        else:
            print("Removed sequence: ", record.id, round(missing_percentage, 2), "%")

    # Write the retained, trimmed records to the output FASTA file
    SeqIO.write(truncated_sequences, "data/fasta/truncated/" + sample + "_truncated.fasta", "fasta")

In [26]:
# Family-level FASTA files to truncate, given as base names (without ".fasta")
# of the files in data/fasta/initial/.
# NOTE: this rebinds `samples`, which earlier cells use for the individual
# sample names; re-run those cells' `samples` definition before re-running them.
samples = [
    "04.B1.W14.01_04.M1.W09.02",
    "05.B1.W14.04_05.M1.W08.03",
    "27.B1.W13.06_27.M1.W10.07",
    "30.B1.W11.08_30.M1.W04.09",
    "38.B1.W10.11_38.M1.W03.10",
    "39.B1.W11.12_39.M1.W03.13_39.M1.W05.14",
    "53.B1.W14.17_53.M1.W07.16",
    "56.B1.W09.22_56.M1.W03.21",
    "63.B1.W09.29_63.M1.W02.30",
    "66.B1.W09.25_66.M1.W02.24",
]

# Motifs delimiting the region kept by truncate_sequences
motif_start = "ATGGACACCATCATCCAC"
motif_end = "GGGTATGA"

# Truncate each family file in turn, announcing the file being processed
for sample in samples:
    print("Truncation of sample:", sample)
    truncate_sequences(sample, motif_start, motif_end)

Truncation of sample: 04.B1.W14.01_04.M1.W09.02
Removed sequence:  04.M1.W09.02_H0 6.25 %
Truncation of sample: 05.B1.W14.04_05.M1.W08.03
Truncation of sample: 27.B1.W13.06_27.M1.W10.07
Removed sequence:  27.M1.W10.07_H4 5.08 %
Truncation of sample: 30.B1.W11.08_30.M1.W04.09
Truncation of sample: 38.B1.W10.11_38.M1.W03.10
Truncation of sample: 39.B1.W11.12_39.M1.W03.13_39.M1.W05.14
Removed sequence:  39.M1.W05.14_H3 5.08 %
Truncation of sample: 53.B1.W14.17_53.M1.W07.16
Truncation of sample: 56.B1.W09.22_56.M1.W03.21
Truncation of sample: 63.B1.W09.29_63.M1.W02.30
Truncation of sample: 66.B1.W09.25_66.M1.W02.24
Removed sequence:  66.M1.W02.24_H1 5.08 %
