<a href="https://colab.research.google.com/github/anihab/dnaTokenization/blob/main/randomSamples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Libraries
import multiprocessing
# Report how many CPU cores are visible to this session.
processes = multiprocessing.cpu_count()
print("The number of processes:", processes, sep="\n")

import argparse
import gzip
import math
import os
import random
import re

import numpy as np
import pandas as pd

Generate Random Samples and Process Files

In [None]:
def generate_random_samples(seq_type, sequence, num_samples, desired_coverage, min_length, max_length):
    """Draw random, possibly overlapping sub-sequences from ``sequence``.

    Each sample's length is drawn uniformly from ``[min_length, max_length]``
    (capped at the sequence length) and its start is drawn uniformly from
    every position at which a segment of that length still fits.

    Args:
        seq_type: Label for the kind of sequence (callers pass 'bacteria' or
            'phage'). Not used by the sampling itself; kept so existing
            callers keep working.
        sequence: Any object supporting ``len()`` and slicing.
        num_samples: How many samples to draw. A fractional value is rounded
            up; zero or a negative value yields no samples.
        desired_coverage: Not used by the sampling itself; kept so existing
            callers keep working.
        min_length: Smallest allowed sample length (inclusive).
        max_length: Largest allowed sample length (inclusive).

    Returns:
        A list of ``(start, end, sample)`` tuples where
        ``sample == sequence[start:end]``. The list is empty when the
        sequence is shorter than ``min_length``.

    Raises:
        ValueError: If samples are requested and ``min_length > max_length``.
    """
    seq_length = len(sequence)
    count = max(0, math.ceil(num_samples))

    # A sequence shorter than the minimum length cannot yield a valid sample;
    # skip it instead of letting randint() fail on an empty range.
    if count == 0 or seq_length < min_length:
        return []

    # Never ask for a segment longer than the sequence itself.
    longest = min(max_length, seq_length)

    samples = []
    for _ in range(count):
        segment_length = random.randint(min_length, longest)
        # randint() is inclusive on both ends, so the last valid start is
        # exactly seq_length - segment_length.
        start_position = random.randint(0, seq_length - segment_length)
        end_position = start_position + segment_length
        samples.append((start_position, end_position, sequence[start_position:end_position]))

    return samples

In [None]:
# Shared worker behind the bacterial and phage entry points below.
def _sample_fasta_file(input_path, output_dir, seq_type, label, coverage, min_length, max_length):
    """Sample every record of one FASTA file and write the samples as CSV rows.

    The output file is named after the input file's base name up to its first
    dot and is written inside ``output_dir`` (created if missing). Each row is
    ``record_id,start,end,length,sample``.

    Args:
        input_path: Path to the FASTA file to read.
        output_dir: Directory that receives the output file.
        seq_type: Label forwarded to ``generate_random_samples``.
        label: Word used in the progress message ("bacterial" or "phage").
        coverage: Fraction of each record's length to cover with samples.
        min_length: Smallest sample length (inclusive).
        max_length: Largest sample length (inclusive).
    """
    filename = os.path.basename(input_path)
    root_filename = filename.split(".", 1)[0]

    # Workers may start on a fresh output tree; exist_ok keeps this safe when
    # several processes create the same directory concurrently.
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, root_filename)

    print(f"Processing {label} file: {filename}")
    average_length = (min_length + max_length) / 2
    with open(input_path, "r") as input_file, open(output_path, "w") as output_file:
        # NOTE(review): SeqIO is not imported anywhere in the visible file;
        # presumably Biopython's Bio.SeqIO -- confirm and add the import.
        for record in SeqIO.parse(input_file, "fasta"):
            # Floor division by a float yields a float; make the count an int.
            num_samples = int(len(record.seq) * coverage // average_length)
            samples = generate_random_samples(seq_type, record.seq, num_samples, coverage, min_length, max_length)
            output_file.writelines(
                f"{record.id},{start},{end},{end - start},{sample}\n"
                for start, end, sample in samples
            )


# Function to process bacterial files
def process_bacterial_file(bacteria_input_file, output_dir, coverage_bacteria, min_length_bacteria, max_length_bacteria):
    """Write random samples of every record in one bacterial FASTA file.

    Args:
        bacteria_input_file: Path to the bacterial FASTA file.
        output_dir: Directory that receives the output file.
        coverage_bacteria: Fraction of each record's length to cover.
        min_length_bacteria: Smallest sample length (inclusive).
        max_length_bacteria: Largest sample length (inclusive).
    """
    _sample_fasta_file(
        bacteria_input_file,
        output_dir,
        'bacteria',
        "bacterial",
        coverage_bacteria,
        min_length_bacteria,
        max_length_bacteria,
    )


# Function to process phage files
def process_phage_file(phage_input_file, output_dir, coverage_phage, min_length_phage, max_length_phage):
    """Write random samples of every record in one phage FASTA file.

    Args:
        phage_input_file: Path to the phage FASTA file.
        output_dir: Directory that receives the output file.
        coverage_phage: Fraction of each record's length to cover.
        min_length_phage: Smallest sample length (inclusive).
        max_length_phage: Largest sample length (inclusive).
    """
    _sample_fasta_file(
        phage_input_file,
        output_dir,
        'phage',
        "phage",
        coverage_phage,
        min_length_phage,
        max_length_phage,
    )

### Main

In [None]:
# Input and output locations. Both output directories sit in a TEST_OUTPUT
# folder under the bacterial input directory.
_datasets_root = "/uufs/chpc.utah.edu/common/home/u1323098/sundar-group-space2/PHAGE/DATASETS"
bacteria_input_dir = f"{_datasets_root}/BACTERIA_RAW/FASTA/ncbi-genomes-2023-09-02/FASTA"
phage_input_dir = f"{_datasets_root}/PHAGE_RAW/GenomesDB/FASTA"
_test_output_root = f"{bacteria_input_dir}/TEST_OUTPUT"
bacteria_output_dir = f"{_test_output_root}/bacteria"
phage_output_dir = f"{_test_output_root}/phage"

# Fraction of each genome's length to cover with samples.
coverage_bacteria, coverage_phage = 0.25, 0.9

# Inclusive sample-length bounds per sequence type.
min_length_bacteria, max_length_bacteria = 1500, 2000
min_length_phage, max_length_phage = 1000, 1500

In [None]:
# Process bacterial and phage files in parallel.
# Collect the inputs first so a bad directory fails before any worker starts.
bacterial_files = [
    os.path.join(bacteria_input_dir, filename)
    for filename in os.listdir(bacteria_input_dir)
    if filename.endswith((".fasta", ".fna"))
]
phage_files = [
    os.path.join(phage_input_dir, filename)
    for filename in os.listdir(phage_input_dir)
    if filename.endswith(".fasta")
]

# The context manager terminates the workers if a task raises, so a failed
# run does not leave worker processes behind.
with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
    pool.starmap(
        process_bacterial_file,
        [(file, bacteria_output_dir, coverage_bacteria, min_length_bacteria, max_length_bacteria) for file in bacterial_files],
    )
    pool.starmap(
        process_phage_file,
        [(file, phage_output_dir, coverage_phage, min_length_phage, max_length_phage) for file in phage_files],
    )

    # Normal completion: shut down gracefully, as the original script did.
    pool.close()
    pool.join()