# Experimental Design for cfDNA Cancer Detection

This notebook explores how configuration choices affect statistical power and signal-to-noise ratio in cfDNA epigenomic cancer detection studies.

## Key Questions:
1. How does sample size affect detection power?
2. What are the effects of batch size and imbalance?
3. How do effect sizes impact statistical significance?
4. What are common design pitfalls to avoid?

In [None]:
import copy
import shutil
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import yaml
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Add src to path to import our modules
sys.path.append('../src')

from cfdna.simulate import simulate_dataset
from cfdna.features import prepare_features
from cfdna.metrics import auroc_with_ci

# Global plotting configuration, applied to every figure created below.
# 'seaborn-v0_8' is the style-sheet name matplotlib uses for the seaborn look.
plt.style.use('seaborn-v0_8')
# Use seaborn's 'husl' colour palette as the default colour cycle.
sns.set_palette('husl')
# Render figures inline in the notebook output.
%matplotlib inline

## 1. Sample Size Power Analysis

Let's examine how sample size affects our ability to detect cancer signals.

In [None]:
# Base simulation configuration shared by every analysis in this notebook.
# Each analysis copies this dict and overrides only the fields it varies
# (sample counts, random seed, effect sizes).
base_config = dict(
    # Dataset identity and default seed
    dataset=dict(
        name="power_analysis",
        random_seed=42,
    ),
    # Cohort structure: batches, collection centers, demographics
    samples=dict(
        n_batches=3,
        centers=["site_a", "site_b", "site_c"],
        age_range=[40, 80],
        sex_ratio=0.5,
    ),
    # Methylation signal, background and noise parameters
    methylation=dict(
        n_cpgs=5000,
        n_dmrs=50,
        cpgs_per_dmr=100,
        dmr_effect_size_mean=0.15,
        dmr_effect_size_std=0.05,
        alpha_base=2.0,
        beta_base=8.0,
        batch_effect_std=0.02,
        missingness_rate=0.05,
    ),
    # Fragment-size and TSS-enrichment signal parameters
    fragmentomics=dict(
        size_bins=[50, 100, 150, 200, 250, 300, 400, 500],
        tss_enrichment_bins=10,
        size_effect_mean=0.1,
        size_effect_std=0.03,
        tss_effect_mean=0.08,
        tss_effect_std=0.02,
        noise_std=0.05,
    ),
)

print("Base configuration loaded")

In [None]:
# Sample size analysis
#
# For each total cohort size, simulate several replicate datasets (different
# seeds), fit a quick logistic regression on the train split and record the
# test-split AUROC. The per-size summary ends up in `power_df`.
sample_sizes = [100, 200, 400, 600, 800, 1000]
results = []

for n_total in sample_sizes:
    print(f"Testing sample size: {n_total}")

    # Deep copy: dict.copy() is shallow, so updating config["dataset"] on a
    # shallow copy would silently rewrite base_config for every later cell.
    config = copy.deepcopy(base_config)
    config["dataset"].update({
        "n_samples": n_total,
        "n_controls": int(n_total * 0.6),
        "n_cancer": int(n_total * 0.4)
    })

    # Run multiple replicates
    aurocs = []
    for rep in range(5):  # 5 replicates
        config["dataset"]["random_seed"] = 42 + rep

        # simulate_dataset is handed a config *path*, so persist the config.
        config_path = Path(f"temp_config_{n_total}_{rep}.yaml")
        data_dir = Path("temp_data")
        with open(config_path, 'w') as f:
            yaml.dump(config, f)

        try:
            # Generate data
            data_dir.mkdir(exist_ok=True)
            simulate_dataset(config_path, data_dir)

            # Prepare features; the result is indexed below by "X", "y"
            # and "splits" (positional train/test indices).
            data = prepare_features(data_dir, config)
            X = data["X"]
            y = data["y"]
            splits = data["splits"]

            X_train = X.iloc[splits["train"]]
            y_train = y.iloc[splits["train"]]
            X_test = X.iloc[splits["test"]]
            y_test = y.iloc[splits["test"]]

            # Impute with TRAIN medians only, then standardize with a scaler
            # fitted on train only, so no test information leaks in.
            train_medians = X_train.median()
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train.fillna(train_medians))
            X_test_scaled = scaler.transform(X_test.fillna(train_medians))

            # Quick logistic regression for power estimate
            model = LogisticRegression(random_state=42, max_iter=1000)
            model.fit(X_train_scaled, y_train)

            # Evaluate
            y_probs = model.predict_proba(X_test_scaled)[:, 1]
            auroc = roc_auc_score(y_test, y_probs)
            aurocs.append(auroc)

        except Exception as e:
            # Best effort: a failed replicate is recorded as NaN so the sweep
            # continues; the nan-aware aggregates below ignore it.
            print(f"Error with sample size {n_total}, rep {rep}: {e}")
            aurocs.append(np.nan)

        finally:
            # Always clean up, including after a failure; otherwise stale
            # files in temp_data could be picked up by the next replicate.
            shutil.rmtree(data_dir, ignore_errors=True)
            config_path.unlink(missing_ok=True)

    # Store results
    results.append({
        'sample_size': n_total,
        'mean_auroc': np.nanmean(aurocs),
        'std_auroc': np.nanstd(aurocs),
        'min_auroc': np.nanmin(aurocs),
        'max_auroc': np.nanmax(aurocs)
    })

power_df = pd.DataFrame(results)
print("Power analysis completed")

In [None]:
# Two-panel figure for the sample-size sweep:
#   left  - mean AUROC (+/- std error bars) with a min-max band
#   right - AUROC standard deviation across replicates
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
sizes = power_df['sample_size']

# Left panel: AUROC vs sample size
ax1.errorbar(sizes, power_df['mean_auroc'], yerr=power_df['std_auroc'],
             marker='o', capsize=5)
ax1.fill_between(sizes, power_df['min_auroc'], power_df['max_auroc'], alpha=0.3)
ax1.set(xlabel='Sample Size', ylabel='AUROC',
        title='Detection Power vs Sample Size')
ax1.grid(True, alpha=0.3)
ax1.set_ylim(0.5, 1.0)

# Right panel: variability vs sample size
ax2.plot(sizes, power_df['std_auroc'], marker='s', color='red')
ax2.set(xlabel='Sample Size', ylabel='AUROC Standard Deviation',
        title='Performance Stability vs Sample Size')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Tabular summary of the same numbers
print("\nPower Analysis Summary:")
print(power_df.round(3))

## 2. Effect Size Analysis

How do different effect sizes impact detectability?

In [None]:
# Effect size analysis
#
# Hold the cohort at 400 samples (240 controls / 160 cancer) and sweep the
# mean DMR effect size, recording test AUROC over a few seeds per setting.
# The per-setting summary ends up in `effect_df`.
effect_sizes = [0.05, 0.10, 0.15, 0.20, 0.25, 0.30]
effect_results = []

for effect_size in effect_sizes:
    print(f"Testing effect size: {effect_size}")

    # Deep copy: dict.copy() is shallow, so the nested assignments below
    # would otherwise overwrite base_config's methylation/fragmentomics
    # effect sizes for every later cell.
    config = copy.deepcopy(base_config)
    config["dataset"].update({
        "n_samples": 400,
        "n_controls": 240,
        "n_cancer": 160
    })

    # Update effect sizes. The fragmentomics effects are scaled with the DMR
    # effect; 0.67 and 0.53 approximate the base config's ratios
    # (0.10 / 0.15 and 0.08 / 0.15).
    config["methylation"]["dmr_effect_size_mean"] = effect_size
    config["fragmentomics"]["size_effect_mean"] = effect_size * 0.67
    config["fragmentomics"]["tss_effect_mean"] = effect_size * 0.53

    aurocs = []
    for rep in range(3):  # Fewer reps for speed
        config["dataset"]["random_seed"] = 42 + rep

        # simulate_dataset is handed a config *path*, so persist the config.
        config_path = Path(f"temp_effect_{effect_size}_{rep}.yaml")
        data_dir = Path("temp_data")
        with open(config_path, 'w') as f:
            yaml.dump(config, f)

        try:
            data_dir.mkdir(exist_ok=True)

            simulate_dataset(config_path, data_dir)
            data = prepare_features(data_dir, config)

            # Quick evaluation on the prepared train/test split
            X = data["X"]
            y = data["y"]
            splits = data["splits"]

            X_train = X.iloc[splits["train"]]
            y_train = y.iloc[splits["train"]]
            X_test = X.iloc[splits["test"]]
            y_test = y.iloc[splits["test"]]

            # Impute and scale using statistics from the train split only.
            train_medians = X_train.median()
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train.fillna(train_medians))
            X_test_scaled = scaler.transform(X_test.fillna(train_medians))

            model = LogisticRegression(random_state=42, max_iter=1000)
            model.fit(X_train_scaled, y_train)

            y_probs = model.predict_proba(X_test_scaled)[:, 1]
            auroc = roc_auc_score(y_test, y_probs)
            aurocs.append(auroc)

        except Exception as e:
            # Best effort: record NaN for a failed replicate and keep going.
            print(f"Error with effect size {effect_size}, rep {rep}: {e}")
            aurocs.append(np.nan)

        finally:
            # Always clean up, including after a failure, so temp files do
            # not accumulate or contaminate the next replicate.
            shutil.rmtree(data_dir, ignore_errors=True)
            config_path.unlink(missing_ok=True)

    effect_results.append({
        'effect_size': effect_size,
        'mean_auroc': np.nanmean(aurocs),
        'std_auroc': np.nanstd(aurocs)
    })

effect_df = pd.DataFrame(effect_results)
print("Effect size analysis completed")

In [None]:
# Mean AUROC (+/- std across replicates) as a function of DMR effect size,
# with horizontal guides at AUROC 0.7 and 0.8.
fig, ax = plt.subplots(figsize=(8, 6))
ax.errorbar(effect_df['effect_size'], effect_df['mean_auroc'],
            yerr=effect_df['std_auroc'], marker='o', capsize=5, linewidth=2)
ax.set_xlabel('DMR Effect Size')
ax.set_ylabel('AUROC')
ax.set_title('Detection Performance vs Effect Size')
ax.grid(True, alpha=0.3)
ax.set_ylim(0.5, 1.0)

# Reference lines: (AUROC level, colour, legend label)
reference_levels = [
    (0.7, 'orange', 'Weak Signal (0.7)'),
    (0.8, 'green', 'Strong Signal (0.8)'),
]
for level, colour, label in reference_levels:
    ax.axhline(y=level, color=colour, linestyle='--', alpha=0.7, label=label)
ax.legend()

plt.tight_layout()
plt.show()

print("\nEffect Size Analysis Summary:")
print(effect_df.round(3))

## 3. Common Design Pitfalls

### Pitfall 1: Batch Confounding
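Batch confounding occurs when cancer samples are concentrated in specific batches, so a model can appear to detect cancer while it is really detecting batch effects.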

In [None]:
# Batch confounding demo
print("Demonstrating batch confounding effects...")

# Deliberately BAD design: the cancer rate rises with the batch index, so the
# last batch (index 2) is mostly cancer and the first (index 0) mostly controls.
np.random.seed(42)
n_samples = 300

# Batch assignment (confounded): three consecutive batches of 100 samples
batch_labels = np.repeat([0, 1, 2], 100)

# Cancer labels: one Bernoulli draw per sample at its batch's cancer rate
cancer_probs = np.array([0.2, 0.5, 0.8])
cancer_labels = np.array([
    int(np.random.random() < cancer_probs[batch_labels[idx]])
    for idx in range(n_samples)
])

# Mock "methylation" features carry NO cancer signal at all: each of the 50
# features is the batch-specific mean plus Gaussian noise.
batch_effects = np.array([0.3, 0.5, 0.7])
mock_methylation = np.array([
    np.random.normal(batch_effects[batch_labels[idx]], 0.1, 50)
    for idx in range(n_samples)
])

# Fit a classifier on the confounded data
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

(X_train, X_test,
 y_train, y_test,
 batch_train, batch_test) = train_test_split(
    mock_methylation, cancer_labels, batch_labels,
    test_size=0.3, random_state=42,
)

model_confounded = LogisticRegression(random_state=42)
model_confounded.fit(X_train, y_train)

y_probs_confounded = model_confounded.predict_proba(X_test)[:, 1]
auroc_confounded = roc_auc_score(y_test, y_probs_confounded)

print(f"AUROC with batch confounding: {auroc_confounded:.3f}")
print("This high AUROC is misleading - it's detecting batch effects, not cancer!")

# Per-batch fraction of controls vs cancer, to show the imbalance
batch_df = pd.DataFrame({'batch': batch_labels, 'cancer': cancer_labels})
batch_cancer_table = pd.crosstab(batch_df['batch'], batch_df['cancer'], normalize='index')

print("\nBatch-Cancer Distribution (PROBLEMATIC):")
print(batch_cancer_table.round(2))

### Pitfall 2: Data Leakage in Preprocessing

In [None]:
# Demonstrate data leakage
#
# Compares two pipelines on the SAME train/test split:
#   leaky   - StandardScaler fitted on all rows (test rows included)
#   correct - StandardScaler fitted on the training rows only
print("Demonstrating preprocessing data leakage...")

# Generate toy data
np.random.seed(42)
n_samples = 200
n_features = 100

X = np.random.randn(n_samples, n_features)
y = np.random.choice([0, 1], n_samples)

# Add a weak signal to the first three features of the positive class
signal_features = [0, 1, 2]
for feature_idx in signal_features:
    X[y == 1, feature_idx] += 0.3  # Small effect

# Split row INDICES once and reuse them for both pipelines.
# train_test_split shuffles, so slicing the scaled matrix positionally
# (X_all_scaled[:len(X_train)]) would pair features with the wrong labels
# and make the "leaky" AUROC meaningless.
train_idx, test_idx = train_test_split(
    np.arange(n_samples), test_size=0.3, random_state=42
)
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# WRONG: Scale using all data (leakage)
scaler_leaky = StandardScaler()
X_all_scaled = scaler_leaky.fit_transform(X)  # Using ALL data!
X_train_leaky = X_all_scaled[train_idx]
X_test_leaky = X_all_scaled[test_idx]

model_leaky = LogisticRegression(random_state=42)
model_leaky.fit(X_train_leaky, y_train)
auroc_leaky = roc_auc_score(y_test, model_leaky.predict_proba(X_test_leaky)[:, 1])

# CORRECT: Scale using only training data
scaler_correct = StandardScaler()
X_train_correct = scaler_correct.fit_transform(X_train)  # Fit only on training!
X_test_correct = scaler_correct.transform(X_test)        # Transform test

model_correct = LogisticRegression(random_state=42)
model_correct.fit(X_train_correct, y_train)
auroc_correct = roc_auc_score(y_test, model_correct.predict_proba(X_test_correct)[:, 1])

# NOTE(review): leakage through a scaler alone is expected to shift AUROC only
# slightly; confirm the printed difference supports the narrative before
# quoting it.
print(f"AUROC with data leakage: {auroc_leaky:.3f}")
print(f"AUROC without leakage: {auroc_correct:.3f}")
print(f"Inflation due to leakage: {auroc_leaky - auroc_correct:.3f}")
print("\nLeakage often leads to overly optimistic performance estimates!")

## 4. Summary and Recommendations

### Key Findings:
1. **Sample Size**: Performance improves and stabilizes with larger sample sizes
2. **Effect Size**: Larger biological effects lead to better detection performance
3. **Batch Effects**: Can create spurious signals if confounded with outcome
4. **Data Leakage**: Can inflate performance estimates substantially

### Design Recommendations:
1. **Randomize batch assignment** across cancer status
2. **Use appropriate sample sizes** based on expected effect sizes
3. **Implement strict train/validation/test splits** before any preprocessing
4. **Include batch correction** in your analysis pipeline
5. **Report confidence intervals** to assess uncertainty
6. **Validate findings** in independent cohorts

In [None]:
# 2x2 summary figure (panels A-D) followed by the printed design checklist.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(12, 10))

# A. Sample size effect
ax1.plot(power_df['sample_size'], power_df['mean_auroc'], 'o-', linewidth=2)
ax1.set(xlabel='Sample Size', ylabel='AUROC', title='A. Sample Size vs Performance')
ax1.grid(True, alpha=0.3)

# B. Effect size impact
ax2.plot(effect_df['effect_size'], effect_df['mean_auroc'], 's-',
         color='orange', linewidth=2)
ax2.set(xlabel='Effect Size', ylabel='AUROC', title='B. Effect Size vs Performance')
ax2.grid(True, alpha=0.3)

# C. Batch confounding illustration: the per-batch cancer rates used in the
# confounded design
batches = [0, 1, 2]
cancer_rates = [0.2, 0.5, 0.8]
ax3.bar(batches, cancer_rates, color=['lightblue', 'orange', 'red'], alpha=0.7)
ax3.set(xlabel='Batch', ylabel='Cancer Rate', title='C. Problematic Batch Design')
ax3.set_ylim(0, 1)

# D. Data leakage impact
methods = ['With Leakage', 'Without Leakage']
aurocs = [auroc_leaky, auroc_correct]
colors = ['red', 'green']
bars = ax4.bar(methods, aurocs, color=colors, alpha=0.7)
ax4.set(ylabel='AUROC', title='D. Data Leakage Impact')
ax4.set_ylim(0.4, 0.8)

# Annotate each bar with its AUROC, centred just above the bar top
for bar, auroc in zip(bars, aurocs):
    x_center = bar.get_x() + bar.get_width()/2.
    y_label = bar.get_height() + 0.01
    ax4.text(x_center, y_label, f'{auroc:.3f}', ha='center', va='bottom')

plt.suptitle('Experimental Design Considerations for cfDNA Studies', fontsize=16)
plt.tight_layout()
plt.show()

# Printed checklist, framed by 60-character rules
rule = "="*60
checklist_items = [
    "✓ Randomize sample collection across batches",
    "✓ Balance cancer/control samples within batches",
    "✓ Define train/test splits BEFORE preprocessing",
    "✓ Include batch correction in analysis",
    "✓ Use appropriate sample sizes for expected effects",
    "✓ Report confidence intervals",
    "✓ Validate in independent cohorts",
]

print("\n" + rule)
print("EXPERIMENTAL DESIGN CHECKLIST:")
print(rule)
for item in checklist_items:
    print(item)
print(rule)