In [1]:
import pod5
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from scipy.ndimage import gaussian_filter1d 
import time

# --- Configuration ---
SIGNAL_WINDOW_SIZE = 100   # number of raw signal points in each extracted window
N_SAMPLES = 10_000         # total number of windows to generate
POD5_FILEPATH = '/Users/aman/Biocompute_assignment/control_rep2.pod5'  # input POD5 file

# --- Output naming ---
# Inserted between the base name and '.npy' when the train/test arrays are saved.
SAVE_PREFIX = '_adv'

# Accumulators filled by the generation cell: one entry per generated sample.
all_labels = []
all_signal_windows = []

# Plain string literal: the message has no placeholders to format.
print("Starting FINAL Hybrid Toy data generation...")

Starting FINAL Hybrid Toy data generation...


In [2]:
# Build the toy dataset: cut one random fixed-size window out of each POD5 read,
# give it a random 0/1 label, add a synthetic noisy spike to label-1 windows,
# then z-score normalise the window before storing it.
start_time = time.time()
np.random.seed(42)  # fixed seed so window positions, labels and spikes are repeatable
half_window = SIGNAL_WINDOW_SIZE // 2  # not read in this cell; kept in case another cell uses it
signal_count = 0

# Start from empty accumulators so that re-running this cell regenerates the
# dataset instead of appending a second batch on top of the previous run.
all_signal_windows = []
all_labels = []

try:
    with pod5.Reader(POD5_FILEPATH) as reader:
        print(f"Opened {POD5_FILEPATH}. Generating {N_SAMPLES} samples...")

        for read in reader.reads():
            if signal_count >= N_SAMPLES:
                break

            raw_signal = read.signal
            n_points = raw_signal.shape[0]
            if n_points < SIGNAL_WINDOW_SIZE:
                # Read is too short to hold a full window.
                continue

            # Pick a random spot in the signal to cut out our window.
            # randint's upper bound is exclusive, so "+ 1" makes the last valid
            # start (n_points - SIGNAL_WINDOW_SIZE) reachable and keeps a read
            # of exactly SIGNAL_WINDOW_SIZE points from calling randint(0, 0),
            # which raises ValueError and would abort the whole loop below.
            rand_start = np.random.randint(0, n_points - SIGNAL_WINDOW_SIZE + 1)
            signal_window = raw_signal[rand_start : rand_start + SIGNAL_WINDOW_SIZE].astype(np.float32)

            # Assign a random label (0 or 1)
            label = np.random.randint(0, 2)

            # --- PREPROCESSING FIXES ---
            # 1. Gaussian Smoothing: [REMOVED]

            if label == 1:
                # 2. Add LOUD & RANDOMIZED Spike

                # Randomized Spike Height Multiplier (0.8x to 1.5x)
                magnitude_multiplier = np.random.uniform(0.8, 1.5)

                # Randomized Spike Position (Place spike randomly, not just in center),
                # keeping the spike start at least 10 points away from either window edge.
                spike_length = 6
                rand_spike_start = np.random.randint(10, SIGNAL_WINDOW_SIZE - spike_length - 10)

                # Base spike shape, scaled by the multiplier and perturbed with
                # Gaussian noise (standard deviation 40) on every point.
                spike = np.array([50, 100, 150, 150, 100, 50])
                spike = (spike * magnitude_multiplier) + np.random.normal(0, 40, spike_length)

                # Apply the spike
                signal_window[rand_spike_start : rand_spike_start + spike_length] += spike

            # --- NORMALIZATION ---
            mean = np.mean(signal_window)
            std = np.std(signal_window)

            # A constant window has std == 0 and cannot be z-scored; it is skipped
            # and does not count towards N_SAMPLES.
            if std > 0:
                normalized_window = (signal_window - mean) / std
                all_signal_windows.append(normalized_window)
                all_labels.append(label)
                signal_count += 1

except Exception as e:
    # Best-effort: report the failure and keep whatever was generated so far.
    # The event count printed below shows whether the run was cut short.
    print(f"An error occurred: {e}")

print(f"\nFINAL data generation finished. Total events: {len(all_labels)}")
end_time = time.time()
print(f"Generation took {end_time - start_time:.2f} seconds.")

Opened /Users/aman/Biocompute_assignment/control_rep2.pod5. Generating 10000 samples...

FINAL data generation finished. Total events: 10000
Generation took 0.59 seconds.


In [3]:
# Turn the accumulated per-sample lists into arrays (one row of X per sample).
X, y = np.array(all_signal_windows), np.array(all_labels)

print(f"\nTotal X shape: {X.shape}")
print(f"Total y shape: {y.shape}")

# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

# Write each split to '<name><SAVE_PREFIX>.npy' in the current working directory.
for split_name, split_array in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    np.save(f'{split_name}{SAVE_PREFIX}.npy', split_array)

print(f"\nAll 'advanced' data has been saved to files with prefix '{SAVE_PREFIX}'")


Total X shape: (10000, 100)
Total y shape: (10000,)
X_train shape: (8000, 100)
X_test shape: (2000, 100)

All 'advanced' data has been saved to files with prefix '_adv'
