In [1]:
# Imports
import os
import csv
import json
from math import ceil
from Bio import SeqIO

In [2]:
def read_fasta(file_name):
    """Load a FASTA file into a dictionary keyed by record id.

    Args:
        file_name: Path of the FASTA file to parse.

    Returns:
        dict mapping each record's ``id`` to the record object yielded by
        ``SeqIO.parse``. If an id occurs more than once, the last record
        with that id is the one kept.
    """
    records_by_id = {}
    for fasta_record in SeqIO.parse(file_name, 'fasta'):
        records_by_id[fasta_record.id] = fasta_record
    return records_by_id

In [3]:
def read_clusters(file_name):
    """Read a two-column cluster CSV into a dictionary.

    The first row is treated as a header and discarded. For every following
    non-blank row, the first column becomes the key and the second column is
    converted to ``int`` and becomes the value.

    Args:
        file_name: Path of the CSV file to read.

    Returns:
        dict mapping first-column values (str) to second-column values (int).
        An empty file, or a file containing only a header, yields ``{}``.

    Raises:
        ValueError: If a second-column value cannot be converted to ``int``.
        IndexError: If a non-blank row has fewer than two columns.
    """
    # newline='' lets the csv module do its own newline handling, as its
    # documentation requires for files passed to csv.reader().
    with open(file_name, 'r', newline='') as file:
        reader = csv.reader(file)
        # Skip the header; the None default avoids StopIteration on an empty file.
        next(reader, None)
        # csv.reader yields [] for blank lines (e.g. a trailing empty line),
        # so those are filtered out instead of failing on row[0].
        return {row[0]: int(row[1]) for row in reader if row}

In [4]:
def clone_sequences(fasta_file, min_per_cluster=10):
    """Rewrite a class-labelled FASTA file in place, duplicating its records.

    Records whose id contains ``|Class_0`` or ``|Class_1`` are grouped by
    class; records with neither marker are dropped from the rewritten file.
    Every record is then written ``clone_factor`` times, where
    ``clone_factor`` is the smallest whole number of copies that brings the
    smallest populated class up to at least ``min_per_cluster`` records. The
    same factor is applied to both classes. The first copy keeps its id;
    later copies get ``_<n>`` inserted before the final ``|<label>`` part.

    Args:
        fasta_file: Path of the FASTA file to read and then overwrite.
        min_per_cluster: Minimum number of records wanted in each populated
            class after cloning. Defaults to 10.

    Returns:
        None. The file at ``fasta_file`` is overwritten. If no record carries
        a class marker, the file is left untouched.
    """
    # Map id -> sequence; a repeated id keeps the last sequence seen.
    sequences_dictionary = {
        str(record.id): str(record.seq)
        for record in SeqIO.parse(fasta_file, "fasta")
    }

    # Bucket the records by the class marker embedded in their id.
    cluster_sequences = {0: [], 1: []}
    for seq_id, sequence in sequences_dictionary.items():
        if "|Class_0" in seq_id:
            cluster_number = 0
        elif "|Class_1" in seq_id:
            cluster_number = 1
        else:
            continue
        cluster_sequences[cluster_number].append((seq_id, sequence))

    # Only populated classes take part in the factor computation: dividing by
    # the size of an empty class would raise ZeroDivisionError.
    populated_sizes = [len(members) for members in cluster_sequences.values() if members]
    if not populated_sizes:
        # Nothing labelled: rewriting would only truncate the file.
        return

    # Clamp at 1 so the original records are always written at least once,
    # even if a caller passes a non-positive min_per_cluster.
    clone_factor = max(1, max(ceil(min_per_cluster / size) for size in populated_sizes))

    with open(fasta_file, 'w') as file:
        for sequences in cluster_sequences.values():
            for factor in range(clone_factor):
                for seq_id, seq in sequences:
                    if factor > 0:
                        # Split on the LAST "|" only, so ids that themselves
                        # contain "|" do not break the two-name unpacking.
                        original_id, cluster_label = seq_id.rsplit("|", 1)
                        output_id = f"{original_id}_{factor}|{cluster_label}"
                    else:
                        # First pass writes the record under its own id.
                        output_id = seq_id
                    file.write(f">{output_id}\n{seq}\n")

In [5]:
def generate_comparison_files(clusters_dictionary, sequences_dictionary, comparisons_dictionary, sample):
    """Write one class-labelled FASTA file per comparison defined for a sample.

    For each comparison of ``sample``, creates
    ``data/comparisons/<sample>/<comparison>/<sample>_<comparison>.fasta``
    containing every sequence whose cluster belongs to one of the
    comparison's two groups. Sequences from the first group get the id suffix
    ``|Class_0``, sequences from the second group ``|Class_1``. The finished
    file is then passed to ``clone_sequences``.

    Args:
        clusters_dictionary: Mapping of sequence name to cluster identifier.
        sequences_dictionary: Mapping of sequence name to an object exposing
            a ``seq`` attribute.
        comparisons_dictionary: Mapping of sample name to a mapping of
            comparison name to a pair of groups; each group is a collection
            of cluster identifiers.
        sample: Name of the sample to process.

    Raises:
        KeyError: If ``sample`` is missing from ``comparisons_dictionary`` or
            a selected sequence name is missing from ``sequences_dictionary``.
    """
    sample_dir = "data/comparisons/" + sample
    # exist_ok=True replaces the exists()-then-makedirs() pair, which could
    # fail if the directory appeared between the check and the creation.
    os.makedirs(sample_dir, exist_ok=True)

    for comparison_name, groups in comparisons_dictionary[sample].items():
        # Progress output: which comparison is being generated.
        print(comparison_name, groups)

        comparison_dir = sample_dir + "/" + comparison_name
        os.makedirs(comparison_dir, exist_ok=True)

        fasta_file = comparison_dir + "/" + sample + "_" + comparison_name + ".fasta"

        with open(fasta_file, 'w') as file:
            for seq_name, cluster in clusters_dictionary.items():
                # Resolve the class once; the first group wins if a cluster
                # is listed in both groups.
                if cluster in groups[0]:
                    class_label = 0
                elif cluster in groups[1]:
                    class_label = 1
                else:
                    # Cluster is not part of this comparison.
                    continue
                sequence = str(sequences_dictionary[seq_name].seq)
                file.write(f">{seq_name}|Class_{class_label}\n{sequence}\n")

        # Post-process the file only after it has been closed and flushed.
        clone_sequences(fasta_file)

In [7]:
# Sample identifiers to process, one per line.
samples = [
    "04.B1.W14.01_04.M1.W09.02",
    "05.B1.W14.04_05.M1.W08.03",
    "27.B1.W13.06_27.M1.W10.07",
    "30.B1.W11.08_30.M1.W04.09",
    "38.B1.W10.11_38.M1.W03.10",
    "39.B1.W11.12_39.M1.W03.13_39.M1.W05.14",
    "53.B1.W14.17_53.M1.W07.16",
    "56.B1.W09.22_56.M1.W03.21",
    "63.B1.W09.29_63.M1.W02.30",
    "66.B1.W09.25_66.M1.W02.24",
]

# Load the per-sample comparison definitions from JSON.
with open("json/comparisons.json", "r") as json_file:
    comparisons_dictionary = json.load(json_file)

# Build the comparison FASTA files for every sample in turn.
for sample in samples:
    print("\nSample:", sample)

    # Input locations for this sample.
    fasta_path = "data/fasta/" + sample + "_truncated.fasta"
    clusters_path = "data/clusters/" + sample + "_cluster.csv"

    # Sequences first, then cluster assignments.
    sequences_dictionary = read_fasta(fasta_path)
    clusters_dictionary = read_clusters(clusters_path)

    # Write (and clone) one FASTA file per comparison of this sample.
    generate_comparison_files(
        clusters_dictionary,
        sequences_dictionary,
        comparisons_dictionary,
        sample,
    )


Sample: 04.B1.W14.01_04.M1.W09.02
Comparison_1 [[1], [3]]
Comparison_2 [[1], [0, 2]]
Comparison_3 [[1], [0, 2, 3]]
Comparison_4 [[3], [0, 2]]

Sample: 05.B1.W14.04_05.M1.W08.03
Comparison_1 [[0], [1]]
Comparison_2 [[0], [2]]
Comparison_3 [[0], [1, 2]]

Sample: 27.B1.W13.06_27.M1.W10.07
Comparison_1 [[1], [2]]
Comparison_2 [[1], [0]]
Comparison_3 [[1], [2, 0]]
Comparison_4 [[2], [0, 1]]

Sample: 30.B1.W11.08_30.M1.W04.09
Comparison_1 [[0], [1]]

Sample: 38.B1.W10.11_38.M1.W03.10
Comparison_1 [[0], [1]]
Comparison_2 [[0], [2]]
Comparison_3 [[0], [1, 2]]

Sample: 39.B1.W11.12_39.M1.W03.13_39.M1.W05.14
Comparison_1 [[2], [1]]
Comparison_2 [[2], [0]]
Comparison_3 [[2], [3]]
Comparison_4 [[2], [0, 1]]
Comparison_5 [[2], [0, 1, 3]]
Comparison_6 [[2, 1, 0], [3]]

Sample: 53.B1.W14.17_53.M1.W07.16
Comparison_1 [[0], [1]]
Comparison_2 [[0], [2]]
Comparison_3 [[0], [1, 2]]

Sample: 56.B1.W09.22_56.M1.W03.21
Comparison_1 [[0], [1]]
Comparison_2 [[0], [2]]
Comparison_3 [[0], [1, 2]]
Comparison_4 [