### SimGen_Ideal signal generator functions

In [None]:
import os
import gc
import psutil
import random
import pandas as pd
import numpy as np
from Bio import SeqIO
from scipy.signal import resample
from datetime import datetime
import concurrent.futures
import matplotlib.pyplot as plt

def normalize_signal(signal, mean=63.63331112435138, std=10.705338783244988):
    """Standardize a current signal with fixed, dataset-level statistics.

    The statistics are passed in (not computed from ``signal``), so every read
    is scaled the same way. The values used are printed on every call.

    Args:
        signal: NumPy array (or list/tuple) of current samples.
        mean: Value subtracted from every sample. Override it to match the
            dataset being simulated.
        std: Divisor applied after centering. A value of 0 is replaced by 1
            so the division is always defined.

    Returns:
        ``(signal - mean) / std``.
    """
    if isinstance(signal, (list, tuple)):
        # Plain Python sequences do not support element-wise arithmetic.
        signal = np.asarray(signal)
    std = std if std != 0 else 1
    normalized = (signal - mean) / std
    print("mean is", mean, "and std is", std)

    return normalized
    
def moving_6mer_Substrings(string):
    """Return every overlapping 6-character window of ``string``, left to right.

    Inputs shorter than six characters produce an empty list.
    """
    width = 6
    windows = []
    for offset in range(len(string) - width + 1):
        windows.append(string[offset:offset + width])
    return windows

def predict_DNA_6mer_5_3_with_sampling(template, lut, lambda_time, sampling_rate, I_max=180):
    """Simulate a sampled current trace for a DNA template from a 6-mer model.

    The template is reversed and split into overlapping 6-mers. Every 6-mer
    present in ``lut`` contributes a "pre" segment followed by a "post"
    segment; 6-mers missing from ``lut`` are dropped. Each segment lasts
    ``lambda_time`` and holds ``int(lambda_time * sampling_rate)`` samples
    drawn from a normal distribution.

    Args:
        template: DNA sequence string.
        lut: Mapping from 6-mer to a dict whose values are, in order,
            pre_mean, pre_std, post_mean and post_std. All four are
            multiplied by ``I_max`` before sampling.
        lambda_time: Duration of one segment.
        sampling_rate: Samples per unit of time.
        I_max: Scale factor applied to the model means and standard deviations.

    Returns:
        DataFrame with columns ``time`` and ``current``. It is empty when no
        6-mer is found in ``lut`` or when a segment would hold zero samples.
    """
    kmers = moving_6mer_Substrings(template[::-1])
    known_kmers = [kmer for kmer in kmers if kmer in lut]
    samples_per_segment = int(lambda_time * sampling_rate)

    # Without this guard the parameter unpacking below fails on an empty array.
    if not known_kmers or samples_per_segment == 0:
        return pd.DataFrame({"time": np.empty(0), "current": np.empty(0)})

    params = np.array([list(lut[kmer].values()) for kmer in known_kmers])

    signal_parts = []
    time_parts = []
    current_time = 0.0

    for pre_mean, pre_std, post_mean, post_std in params:
        # "pre" is drawn before "post" for each k-mer, keeping the random
        # stream in the same order as the emitted samples.
        for seg_mean, seg_std in ((pre_mean, pre_std), (post_mean, post_std)):
            signal_parts.append(
                np.random.normal(seg_mean * I_max, seg_std * I_max, samples_per_segment)
            )
            # endpoint=False keeps the spacing at lambda_time / n and stops
            # the last sample of one segment from sharing a timestamp with
            # the first sample of the next.
            time_parts.append(
                np.linspace(current_time, current_time + lambda_time,
                            samples_per_segment, endpoint=False)
            )
            current_time += lambda_time

    return pd.DataFrame({
        "time": np.concatenate(time_parts),
        "current": np.concatenate(signal_parts)
    })

def extract_random_sequences(reference_genome_file, num_sequences, seq_length):
    """Draw distinct random subsequences of a fixed length from a FASTA file.

    All records in the file are concatenated into a single string before
    sampling, so a drawn window may span the junction between two records.

    Args:
        reference_genome_file: Path (or handle) of a FASTA file.
        num_sequences: Number of distinct subsequences to return.
        seq_length: Length of every subsequence.

    Returns:
        List of ``num_sequences`` distinct strings. The order is the iteration
        order of a set and therefore arbitrary.

    Raises:
        ValueError: If ``seq_length`` is not positive, exceeds the genome
            length, or more sequences are requested than there are windows.
        RuntimeError: If sampling stops finding new sequences, which happens
            when the genome holds fewer distinct windows than requested.
    """
    genome = "".join(str(record.seq) for record in SeqIO.parse(reference_genome_file, "fasta"))
    genome_len = len(genome)

    if num_sequences <= 0:
        return []
    if seq_length <= 0:
        raise ValueError(f"seq_length must be positive, got {seq_length}")
    if seq_length > genome_len:
        raise ValueError(
            f"seq_length ({seq_length}) exceeds the genome length ({genome_len})"
        )

    last_start = genome_len - seq_length
    n_windows = last_start + 1
    if num_sequences > n_windows:
        raise ValueError(
            f"Requested {num_sequences} unique sequences but the genome only has "
            f"{n_windows} windows of length {seq_length}"
        )

    # Rejection sampling never ends if the genome has fewer distinct windows
    # than requested (repeats), so give up after a long run of duplicates.
    max_stalled_draws = 50 * n_windows
    stalled_draws = 0
    sequences = set()

    while len(sequences) < num_sequences:
        start = random.randint(0, last_start)
        seq = genome[start:start + seq_length]
        if seq in sequences:
            stalled_draws += 1
            if stalled_draws >= max_stalled_draws:
                raise RuntimeError(
                    f"Only found {len(sequences)} distinct sequences of length "
                    f"{seq_length}; {num_sequences} were requested"
                )
        else:
            sequences.add(seq)
            stalled_draws = 0

    return list(sequences)

def encode_dna_sequence(seq):
    """Encode a DNA sequence as a list of integers.

    A, C, G and T map to 1, 2, 3 and 4. Each base is upper-cased before the
    lookup, so lower-case input is encoded the same way. Any other symbol
    maps to 0.
    """
    codes = dict(zip("ACGT", range(1, 5)))
    encoded = []
    for base in seq:
        encoded.append(codes.get(base.upper(), 0))
    return encoded

def generate_random_signals(reference_genome_file, num_reads, seq_length, test_fraction=0.05):
    """Sample random sequences from a genome and split them into train/test.

    Despite its name, this returns DNA sequences; signals are simulated later.

    Args:
        reference_genome_file: FASTA file passed to ``extract_random_sequences``.
        num_reads: Number of sequences to sample.
        seq_length: Length of each sequence.
        test_fraction: Share of the sequences held out for testing. At least
            one sequence is always held out when any are available.

    Returns:
        Tuple ``(train_sequences, test_sequences)`` of lists of strings.
    """
    sequences = extract_random_sequences(reference_genome_file, num_reads, seq_length)
    if not sequences:
        # np.random.choice cannot draw one item from an empty population.
        return [], []

    test_size = min(len(sequences), max(1, int(test_fraction * len(sequences))))
    test_indices = np.random.choice(len(sequences), size=test_size, replace=False)

    # Set membership is O(1); testing against the NumPy array scans it on
    # every lookup and made the split quadratic.
    held_out = set(test_indices.tolist())
    test_sequences = [sequences[i] for i in test_indices]
    train_sequences = [seq for i, seq in enumerate(sequences) if i not in held_out]
    return train_sequences, test_sequences

def chunk_signal_optimized(n_chunks, signal, chunksize, overlap):
    """Cut ``signal`` into ``n_chunks`` windows of ``chunksize`` samples.

    Consecutive windows start ``chunksize - overlap`` samples apart, so
    neighbours share ``overlap`` samples. The result is a float array of shape
    ``(n_chunks, chunksize)``.
    """
    stride = chunksize - overlap
    windows = np.zeros((n_chunks, chunksize))
    offsets = stride * np.arange(n_chunks)

    # Each row is a view into ``windows``, so assigning into it fills the
    # output in place.
    for row, offset in zip(windows, offsets):
        row[:] = signal[offset:offset + chunksize]

    return windows

def align_references(bases_per_point, dna_sequence, signal_length, chunksize, overlap):
    """Build the encoded reference bases that belong to each signal chunk.

    Chunk windows start every ``chunksize - overlap`` signal points. For a
    window starting at point ``i`` the reference is the encoded sequence slice
    from ``int(i * bases_per_point)`` to
    ``int((i + chunksize) * bases_per_point)``, stored in reversed base order.

    Args:
        bases_per_point: Number of bases represented by one signal point.
        dna_sequence: DNA sequence covering the signal.
        signal_length: Number of points in the signal.
        chunksize: Number of signal points per chunk.
        overlap: Number of signal points shared by consecutive chunks.

    Returns:
        Tuple ``(references, reference_lengths)``. ``references`` is an int8
        array of shape ``(n_chunks, max_len)``, zero-padded on the right;
        ``reference_lengths`` is an int64 array of the unpadded lengths. When
        the signal is shorter than one chunk the arrays are empty, with shapes
        ``(0, 0)`` and ``(0,)``.
    """
    dna_encoded = encode_dna_sequence(dna_sequence)
    step = chunksize - overlap

    raw_references = []
    for i in range(0, signal_length - chunksize + 1, step):
        start = int(i * bases_per_point)
        end = int((i + chunksize) * bases_per_point)
        raw_references.append(dna_encoded[start:end][::-1])

    reference_lengths = np.array([len(ref) for ref in raw_references], dtype=np.int64)
    if not raw_references:
        # max() of an empty sequence raises, so return empty arrays instead.
        return np.zeros((0, 0), dtype=np.int8), reference_lengths

    max_len = int(reference_lengths.max())
    padded_references = np.zeros((len(raw_references), max_len), dtype=np.int8)
    for row, ref in zip(padded_references, raw_references):
        if ref:
            row[:len(ref)] = ref

    return padded_references, reference_lengths

def process_batch_in_parallel(batch_seqs, lut, lambda_time, sampling_rate, output_dir, batch_idx,
                              chunksize=10000, overlap=500, bases_per_point=0.05):
    """Simulate, chunk and save one batch of sequences as a compressed ``.npz``.

    For every sequence the signal is simulated, trimmed to a whole number of
    chunk windows, normalized, chunked, and paired with per-chunk references.

    Args:
        batch_seqs: DNA sequences in this batch.
        lut: 6-mer model passed to the signal simulator.
        lambda_time: Dwell time passed to the signal simulator.
        sampling_rate: Sampling rate passed to the signal simulator.
        output_dir: Directory that receives ``batch_<batch_idx>.npz``.
        batch_idx: Index used in the output file name.
        chunksize: Signal points per chunk.
        overlap: Signal points shared by consecutive chunks.
        bases_per_point: Bases represented by one signal point.
            NOTE(review): the default 0.05 appears to assume 20 samples per
            base (lambda_time * sampling_rate == 10, two segments per k-mer);
            confirm and adjust when those settings change.

    Returns:
        Path of the written file. It holds the arrays ``chunks``,
        ``references`` (uint8, zero-padded), ``reference_lengths`` and the
        scalar ``max_ref_len``.

    Raises:
        ValueError: If no sequence in the batch is long enough for one chunk.
    """
    step = chunksize - overlap
    all_chunks, all_refs, all_ref_lens = [], [], []

    for seq in batch_seqs:
        signal_df = predict_DNA_6mer_5_3_with_sampling(seq, lut, lambda_time, sampling_rate)
        signal = signal_df['current'].values

        n_chunks = (len(signal) - overlap) // step
        if n_chunks <= 0:
            # Too short for a single chunk; skipping it avoids failing the
            # whole batch on a negative or empty chunk count.
            continue

        retained_signal_length = n_chunks * step + overlap
        trimmed_signal = signal[:retained_signal_length]
        normalized_signal = normalize_signal(trimmed_signal)
        chunks = chunk_signal_optimized(n_chunks, normalized_signal, chunksize, overlap)

        # Keep only the trailing bases that the retained signal covers.
        n_dropped_bases = len(seq) - int(retained_signal_length * bases_per_point)
        trimmed_seq = seq[n_dropped_bases:]
        refs, ref_lens = align_references(bases_per_point, trimmed_seq, len(trimmed_signal), chunksize, overlap)

        all_chunks.append(chunks)
        # The signal is simulated from the reversed sequence, while the
        # references are sliced from the front of the sequence. Flipping the
        # chunk order pairs each reference with its signal chunk.
        all_refs.append(refs[::-1])
        all_ref_lens.append(ref_lens[::-1])

    if not all_chunks:
        raise ValueError(
            f"No sequence in batch {batch_idx} is long enough for a chunk of "
            f"{chunksize} signal points"
        )

    reference_lengths = np.concatenate(all_ref_lens)
    max_ref_len = int(reference_lengths.max())

    # Each sequence was padded to its own maximum; widen all to the batch maximum.
    padded = [
        np.pad(refs, ((0, 0), (0, max_ref_len - refs.shape[1])),
               'constant', constant_values=0).astype(np.uint8)
        for refs in all_refs
    ]

    output_file = os.path.join(output_dir, f"batch_{batch_idx}.npz")
    np.savez_compressed(output_file,
                        chunks=np.concatenate(all_chunks),
                        references=np.concatenate(padded),
                        reference_lengths=reference_lengths,
                        max_ref_len=max_ref_len)
    del all_chunks, all_refs, all_ref_lens, padded
    gc.collect()
    return output_file

def _report_memory_usage(label):
    """Print the resident memory of the current process, tagged with ``label``."""
    rss_mb = psutil.Process(os.getpid()).memory_info().rss / (1024 ** 2)
    print(f"{label}: {rss_mb:.1f} MB RSS")


def process_and_save_batches(sequences, lut, output_dir, prefix, lambda_time, sampling_rate, batch_size,
                             max_workers=4):
    """Split ``sequences`` into batches and process them in worker processes.

    Every batch is handed to ``process_batch_in_parallel``, which writes one
    ``.npz`` file into ``output_dir``. A batch that fails is reported on
    stdout and skipped; the remaining batches still run.

    Args:
        sequences: DNA sequences to process.
        lut: 6-mer model forwarded to the workers.
        output_dir: Directory for the batch files; created if missing.
        prefix: Currently unused; output files are always named
            ``batch_<index>.npz``, so use a separate ``output_dir`` per split.
        lambda_time: Dwell time forwarded to the workers.
        sampling_rate: Sampling rate forwarded to the workers.
        batch_size: Number of sequences per batch.
        max_workers: Number of worker processes.

    Returns:
        Tuple ``(saved_files, global_max_ref_len)``: the paths of the batch
        files returned by the workers and the largest ``max_ref_len`` stored
        in any of them (0 if none could be read).
    """
    os.makedirs(output_dir, exist_ok=True)
    saved_files = []
    global_max_ref_len = 0

    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(
                process_batch_in_parallel,
                sequences[start:start + batch_size], lut, lambda_time,
                sampling_rate, output_dir, batch_idx
            ): batch_idx
            for batch_idx, start in enumerate(range(0, len(sequences), batch_size))
        }

        for future in concurrent.futures.as_completed(futures):
            batch_idx = futures[future]
            try:
                output_file = future.result()
                saved_files.append(output_file)

                with np.load(output_file) as data:
                    global_max_ref_len = max(global_max_ref_len, int(data["max_ref_len"]))
            except Exception as e:
                # Best effort: one bad batch must not abort the others.
                print(f"Failed batch {batch_idx}: {e}")
            else:
                # Kept outside the try so a reporting problem is never
                # mistaken for a failed batch.
                _report_memory_usage(f"After batch {batch_idx}")
            gc.collect()

    return saved_files, global_max_ref_len

### Example usage 

In [None]:
if __name__ == "__main__":
    # The guard stops worker processes from re-running the pipeline when they
    # import this module (needed for the "spawn" start method).
    reference_genome = '...' # in .fna format preferably
    LUT_6mer = pd.read_csv('...', encoding='utf-8') # add kmer model
    # Change the column names below if the k-mer model uses different ones.
    lut = LUT_6mer.set_index("kmer_pull_3_5")[["pre_mean", "pre_std", "post_mean", "post_std"]].to_dict("index")

    train_seqs, test_seqs = generate_random_signals(reference_genome_file=reference_genome, num_reads=..., seq_length=...) # add values here

    output_dir_train = "..." # output training files directory
    output_dir_test = "..." # validation set folder within the training output folder

    # Shared by both splits so training and test data use identical settings.
    lambda_time = ... # add dwell time, preferably similar to 0.002 s
    sampling_rate = 5000 # change if needed
    batch_size = 500 # change if needed

    for split_name, split_seqs, split_dir in (
        ("train", train_seqs, output_dir_train),
        ("test", test_seqs, output_dir_test),
    ):
        process_and_save_batches(
            sequences=split_seqs,
            lut=lut,
            output_dir=split_dir,
            prefix=split_name,
            lambda_time=lambda_time,
            sampling_rate=sampling_rate,
            batch_size=batch_size
        )