# Data Augmentation Pipeline for Sequence Models

This notebook demonstrates how to:
1. Apply reverse complement transformations
2. Introduce random mutations for robustness
3. Extract random subsequences for training
4. Simulate quality scores for FASTA → FASTQ conversion

Data augmentation improves model generalization by expanding training datasets.

In [None]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np

import deepbiop as dbp

## 1. Reverse Complement Transformation

Reverse complement is biologically meaningful:
- DNA double strand: 5'-ACGT-3' ↔ 3'-TGCA-5'
- Models should be orientation-invariant

Base pairing rules:
- A ↔ T
- C ↔ G

In [None]:
# Input sequence, kept as raw bytes.
sequence = b"ACGTACGTACGT"

# Build the transform once and run it twice: once on the input and once
# on its own output, so the round trip can be compared with the input.
rc = dbp.ReverseComplement()
rc_sequence = rc.apply(sequence)
rc_rc = rc.apply(rc_sequence)

# Report the input, the transformed sequence, and the round-trip result.
print(f"Original:  {sequence.decode()}")
print(f"Rev Comp:  {rc_sequence.decode()}")
print(f"RC of RC:  {rc_rc.decode()}")
# Applying the transform twice is expected to give back the input;
# this line prints whether it actually does.
print(f"Match:     {rc_rc == sequence}")

In [None]:
# Transform several sequences with a single apply_batch call,
# reusing the `rc` transform created in the previous cell.
sequences = [
    b"ACGTACGTACGT",
    b"TTTTAAAACCCC",
    b"GGGGCCCCAAAA",
]
rc_sequences = rc.apply_batch(sequences)

# Print each input next to its result; the input column is padded to
# 20 characters so the arrows line up.
paired = zip(sequences, rc_sequences, strict=False)
for orig, rev_comp in paired:
    print(f"{orig.decode():20s} → {rev_comp.decode()}")

## 2. Random Mutations for Robustness

Introduce point mutations to simulate:
- Sequencing errors
- Natural genetic variation
- SNPs (Single Nucleotide Polymorphisms)

The mutation rate is configurable (typically 0.01–0.05 for augmentation).

In [None]:
# Create mutator with 5% mutation rate (fixed seed for a repeatable demo)
mutator = dbp.Mutator(mutation_rate=0.05, seed=42)

# Apply mutations to a homopolymer so every changed base is easy to spot
original = b"A" * 100  # 100 A's
mutated = mutator.apply(original)

# Count positions where the mutated sequence differs from the input
total = len(original)
mutations = sum(1 for o, m in zip(original, mutated, strict=False) if o != m)
print(f"Original: {original[:20].decode()}...")
print(f"Mutated:  {mutated[:20].decode()}...")
# Denominator and percentage are derived from the actual sequence length
# so the summary stays correct if `original` is changed to another size.
print(f"\nMutations: {mutations}/{total} ({100 * mutations / total:g}%)")

In [None]:
# Tally how often each base occurs in the mutated sequence.
bases = Counter(mutated.decode())
bar_colors = ["red", "blue", "green", "orange"]

# Bar chart of the per-base counts; colors are assigned to bars in the
# order the bases were first seen in the sequence.
plt.figure(figsize=(8, 5))
plt.bar(bases.keys(), bases.values(), color=bar_colors)
plt.title("Base Distribution After 5% Mutation (100 A's)")
plt.xlabel("Base")
plt.ylabel("Count")

# Dashed reference line at the A count expected for 100 bases at a 5% rate.
plt.axhline(y=95, color="r", linestyle="--", label="Expected A count")
plt.legend()
plt.show()

## 3. Random Subsequence Sampling

Extract random subsequences for:
- Fixed-length model inputs
- Data augmentation from long reads
- Sliding window analysis

In [None]:
# Sampler configured for 150-base subsequences using the "random" strategy.
sampler = dbp.Sampler(length=150, strategy="random", seed=42)

# Synthetic long read: "ACGT" repeated 250 times.
long_read = b"ACGT" * 250  # 1000bp
print(f"Original length: {len(long_read)}bp")

# Call the sampler five times on the same read, keeping every result.
samples = list(map(sampler.apply, [long_read] * 5))

print(f"\nGenerated {len(samples)} samples of {len(samples[0])}bp each")
# Show the first 30 bases and the length of each sample, numbered from 1.
for i, sample in enumerate(samples, start=1):
    print(f"Sample {i}: {sample[:30].decode()}... (length: {len(sample)}bp)")

In [None]:
# Strategy names to compare side by side.
strategies = ["start", "center", "end", "random"]

# 200bp input made of four 50-base runs (A, G, C, T in that order), so the
# printed sample reveals which region each strategy picked.
sequence = b"".join(base * 50 for base in (b"A", b"G", b"C", b"T"))

# Build a fresh 50-base sampler per strategy and print what it returns.
for strategy in strategies:
    sampler = dbp.Sampler(length=50, strategy=strategy, seed=42)
    sample = sampler.apply(sequence)
    print(f"{strategy:8s}: {sample.decode()}")

## 4. Quality Score Simulation

Simulate realistic Phred quality scores for:
- FASTA → FASTQ conversion
- Synthetic read generation
- Quality-aware model training

Quality models:
- **HighQuality**: Modern Illumina (mean Q37)
- **MediumQuality**: Older platforms (mean Q28)
- **Degrading**: Quality decreases along read

In [None]:
from deepbiop import QualityModel, QualitySimulator

# Each simulator below uses the same seed and produces 150 quality values.

# HighQuality model.
sim_high = QualitySimulator(QualityModel.HighQuality, seed=42)
quality_high = sim_high.generate(150)

# MediumQuality model.
sim_med = QualitySimulator(QualityModel.MediumQuality, seed=42)
quality_med = sim_med.generate(150)

# Degrading model: mean goes from 40 at the start of the read to 25 at the
# end, with a standard deviation of 3.
degrading_model = QualityModel.Degrading(start_mean=40.0, end_mean=25.0, std_dev=3.0)
sim_deg = QualitySimulator(degrading_model, seed=42)
quality_deg = sim_deg.generate(150)

# Preview the first 20 values produced by each model.
print(f"High quality (first 20): {quality_high[:20]}")
print(f"Medium quality (first 20): {quality_med[:20]}")
print(f"Degrading quality (first 20): {quality_deg[:20]}")

In [None]:
# Convert ASCII to Phred scores and visualize
def ascii_to_phred(quality_str):
    """Convert an offset-33 ASCII quality string to Phred scores.

    Args:
        quality_str: Quality characters as ``bytes``/``bytearray`` (or any
            iterable of integer character codes), or as an ASCII ``str``.

    Returns:
        A 1-D integer NumPy array holding one score per input character,
        computed as the character code minus 33. Empty input yields an
        empty integer array.

    Raises:
        UnicodeEncodeError: If ``quality_str`` is a ``str`` containing
            non-ASCII characters.
    """
    if isinstance(quality_str, str):
        # Iterating a str yields 1-character strings, which cannot be
        # offset numerically; convert to bytes so iteration yields ints.
        quality_str = quality_str.encode("ascii")
    # Explicit integer dtype keeps the result integral even when empty
    # (np.array([]) would otherwise default to float64).
    return np.array([q - 33 for q in quality_str], dtype=np.int_)


# Decode the three simulated quality strings into numeric Phred scores.
phred_high = ascii_to_phred(quality_high)
phred_med = ascii_to_phred(quality_med)
phred_deg = ascii_to_phred(quality_deg)

plt.figure(figsize=(12, 6))

# One panel per quality model, laid out left to right in a 1x3 grid.
# Every panel uses the same axis labels and y-range (0-45) so the three
# curves can be compared directly.
panels = [
    ("High Quality (Q37)", phred_high),
    ("Medium Quality (Q28)", phred_med),
    ("Degrading Quality", phred_deg),
]
for column, (panel_title, scores) in enumerate(panels, start=1):
    plt.subplot(1, 3, column)
    plt.plot(scores)
    plt.title(panel_title)
    plt.xlabel("Position")
    plt.ylabel("Phred Score")
    plt.ylim(0, 45)

plt.tight_layout()
plt.show()

## 5. Complete Augmentation Pipeline

Combine transformations for comprehensive data augmentation.

In [None]:
def augment_sequence(seq, n_augmented=5):
    """Generate labelled augmented versions of a sequence.

    The result always starts with the unmodified input and its reverse
    complement, followed by ``n_augmented`` mutated variants: the first
    ``n_augmented // 2`` are mutations of ``seq`` and the remaining ones
    are mutations of its reverse complement.

    Args:
        seq: Sequence to augment; it is passed unchanged to the deepbiop
            transforms (this notebook always passes ``bytes``).
        n_augmented: Total number of mutated variants to generate.

    Returns:
        A list of ``(label, sequence)`` tuples in this order:
        ``"original"``, ``"reverse_complement"``, ``"mutated_1"``...,
        ``"rc_mutated_1"``.... For non-negative ``n_augmented`` the list
        has ``n_augmented + 2`` entries.
    """
    rc = dbp.ReverseComplement()
    # seed=None: presumably leaves the mutator unseeded so repeated calls
    # give different variants - confirm against the deepbiop docs.
    mutator = dbp.Mutator(mutation_rate=0.02, seed=None)  # 2% mutations

    # The reverse complement does not change between iterations, so it is
    # computed once and reused for both the plain entry and the mutations.
    rc_seq = rc.apply(seq)

    # Odd counts give the extra variant to the reverse-complement group.
    n_forward = n_augmented // 2
    n_reverse = n_augmented - n_forward

    augmented = [
        ("original", seq),
        ("reverse_complement", rc_seq),
    ]
    augmented.extend(
        (f"mutated_{i + 1}", mutator.apply(seq)) for i in range(n_forward)
    )
    augmented.extend(
        (f"rc_mutated_{i + 1}", mutator.apply(rc_seq)) for i in range(n_reverse)
    )
    return augmented


# Run the pipeline on one 32-base sequence, asking for 4 mutated variants.
original_seq = b"ACGTACGTGGCCTTAAGGCCTTAAACGTACGT"
augmented_seqs = augment_sequence(original_seq, n_augmented=4)

# List every variant with its label, padded to 20 columns for alignment.
print(f"Generated {len(augmented_seqs)} augmented sequences:")
for entry in augmented_seqs:
    label, seq = entry
    print(f"  {label:20s}: {seq.decode()}")

## 6. Augmentation for Training Data

Practical example: Expand a small training dataset.

In [None]:
# A toy training set of five 16-base sequences.
training_sequences = [
    b"ACGTACGTACGTACGT",
    b"TTTTAAAACCCCGGGG",
    b"GGGGCCCCAAAATTTT",
    b"ATATATATATATATAT",
    b"GCGCGCGCGCGCGCGC",
]

print(f"Original dataset: {len(training_sequences)} sequences")

# Expand every sequence. With n_augmented=9 each call returns 11 entries:
# the original, its reverse complement, and 9 mutated variants.
augmented_dataset = []
for seq in training_sequences:
    augmented_versions = augment_sequence(seq, n_augmented=9)
    # Keep only the sequences; the labels are not needed for training data.
    augmented_dataset += [s for _, s in augmented_versions]

print(f"Augmented dataset: {len(augmented_dataset)} sequences")
print(f"Expansion factor: {len(augmented_dataset) / len(training_sequences):.1f}x")

## Summary

DeepBioP provides biologically meaningful augmentation:

| Transformation | Purpose | Biological Meaning |
|----------------|---------|--------------------|
| Reverse Complement | Orientation invariance | DNA double strand |
| Random Mutations | Error robustness | SNPs, sequencing errors |
| Subsequence Sampling | Fixed-length inputs | Sliding windows |
| Quality Simulation | FASTA→FASTQ | Realistic quality scores |

Benefits:
- ✅ Expand small training datasets
- ✅ Improve model generalization
- ✅ Batch processing for efficiency
- ✅ Reproducible with seeds
- ✅ Biologically valid transformations