# 01 – Data Preprocessing

This notebook builds the data pipeline:
- Load coordinates/labels (CSV/TSV/BED-like)
- Extract a ±1 kb sequence window (2 kb total) around each interval's midpoint from a reference FASTA
- One-hot encode and save train/val/test arrays (.npz)

**Inputs expected:**
- `data/raw/reference.fa` (+ `.fai` index)
- `data/raw/coords.tsv` with columns `chrom`, `start`, `end`, `label`


In [16]:
import sys
from pathlib import Path
import numpy as np, pandas as pd

# Locate the project root (the directory that contains 'src/') and make it
# importable. Only the current working directory and its direct parent are
# considered; if the working directory has no 'src/', ROOT falls back to the parent.
ROOT = Path.cwd()
ROOT = ROOT if (ROOT / 'src').exists() else ROOT.parent

# Prepend ROOT to sys.path once, and only when it really holds 'src/'.
root_has_src = (ROOT / 'src').exists()
root_on_path = str(ROOT) in sys.path
if root_has_src and not root_on_path:
    sys.path.insert(0, str(ROOT))

from src.data_utils import extract_sequence_window, load_coordinates, one_hot_encode, train_val_test_split

# Input and output locations.
# NOTE(review): these are relative paths, so they resolve against the current
# working directory rather than ROOT — confirm the notebook is launched from
# the directory that contains 'data/'.
RAW, PROC = Path('data/raw'), Path('data/processed')

# Make sure the output directory exists before any cell tries to save into it.
PROC.mkdir(parents=True, exist_ok=True)

FASTA = RAW / 'reference.fa'   # <-- place your hg19/hg38 FASTA here
COORDS = RAW / 'coords.tsv'    # <-- provide your coordinate+label file here

# Number of bases taken on each side of a window centre (+/- 1 kb).
WINDOW = 1000


In [17]:
# 1) Load coordinates
#
# NOTE(review): the cell that writes toy inputs when COORDS is missing comes
# after this one, so on a fresh checkout this load presumably fails until that
# cell has been run once — consider running it first.
coords_path = str(COORDS)
df = load_coordinates(coords_path)

# Quick look at the first rows plus the table size.
preview = df.head()
n_rows = len(df)
print(preview)
print('Total rows:', n_rows)


  chrom  start    end  label
0  chr1   9950  10050      0
1  chr1  10950  11050      1
2  chr1  11950  12050      0
3  chr1  12950  13050      1
4  chr1  13950  14050      0
Total rows: 50


In [18]:
# 0) Create toy inputs when the real ones are absent (demo / smoke test only)
#
# Nothing here overwrites an existing file. Real workflows should provide
# their own FASTA and index it with: samtools faidx reference.fa
toy_chrom = 'chr1'
toy_seq = 'ACGT' * 50000          # 200,000 bp of repeating pattern to allow many windows
toy_header = f'>{toy_chrom}\n'
# Exact on-disk size of the toy FASTA: header line + one sequence line + its LF.
toy_fasta_bytes = len(toy_header) + len(toy_seq) + 1
fai_path = RAW / 'reference.fa.fai'

RAW.mkdir(parents=True, exist_ok=True)

if not FASTA.exists():
    # newline='\n' switches off platform newline translation; without it a
    # Windows run writes CRLF and the byte offsets in the index below are wrong.
    with open(FASTA, 'w', encoding='ascii', newline='\n') as fh:
        fh.write(toy_header)
        fh.write(toy_seq + '\n')

if not fai_path.exists():
    # The hand-written index only describes the toy FASTA. Writing it next to a
    # user-supplied genome would silently produce a wrong index, so it is only
    # written when the file on disk has exactly the toy file's size.
    if FASTA.stat().st_size == toy_fasta_bytes:
        with open(fai_path, 'w', encoding='ascii', newline='\n') as fh:
            # columns: chrom  length  offset  line_bases  line_width
            fh.write(
                f'{toy_chrom}\t{len(toy_seq)}\t{len(toy_header)}'
                f'\t{len(toy_seq)}\t{len(toy_seq) + 1}\n'
            )
    else:
        print('No .fai index found for', FASTA, '- build one with: samtools faidx', FASTA)

if not COORDS.exists():
    # 50 intervals of 100 bp whose midpoints are 1 kb apart, starting at 10,000;
    # labels alternate 0, 1, 0, 1, ...
    centers = np.arange(10_000, 10_000 + 50*1000, 1000)
    df_toy = pd.DataFrame({
        'chrom': toy_chrom,
        'start': centers - 50,
        'end': centers + 50,
        'label': (np.arange(len(centers)) % 2).astype(int)
    })
    df_toy.to_csv(COORDS, sep='\t', index=False)

print('FASTA exists:', FASTA.exists(), '| COORDS exists:', COORDS.exists())

FASTA exists: True | COORDS exists: True


In [19]:
# 2) Extract sequences and one-hot encode
#
# For every coordinate row: take the window around the interval midpoint,
# one-hot encode it, and collect the matching label.
if len(df) == 0:
    # np.stack on an empty list fails with an unhelpful message; fail early instead.
    raise ValueError('No coordinate rows to process: df is empty.')

X_list = []
y_list = []
# Iterate over the column values directly rather than DataFrame.iterrows(),
# which builds a full Series object for every row.
for chrom, start, end, label in zip(df['chrom'], df['start'], df['end'], df['label']):
    start, end, label = int(start), int(end), int(label)
    center = (start + end)//2
    seq = extract_sequence_window(str(FASTA), chrom, center, window=WINDOW)
    encoded = np.asarray(one_hot_encode(seq))
    # Every window must share one shape or np.stack cannot combine them;
    # report the offending interval instead of a generic stacking error.
    if X_list and encoded.shape != X_list[0].shape:
        raise ValueError(
            f'Encoded window for {chrom}:{start}-{end} has shape {encoded.shape}, '
            f'expected {X_list[0].shape}'
        )
    X_list.append(encoded)
    y_list.append(label)

X = np.stack(X_list, axis=0)  # shape: (N, 2*WINDOW, 4)
y = np.asarray(y_list, dtype=np.int64)
print('X shape:', X.shape, 'y shape:', y.shape)


X shape: (50, 2000, 4) y shape: (50,)


In [20]:
# 3) Train/Val/Test split and save
#
# Ask the splitter for index sets with 10% validation and 10% test, using
# seed=42 (presumably the remaining examples form the training set — confirm
# in src.data_utils).
n_examples = len(y)
train_idx, val_idx, test_idx = train_val_test_split(n_examples, val_frac=0.1, test_frac=0.1, seed=42)

# One .npz archive per split, each storing its arrays under the keys 'X' and 'y'.
for fname, idx in (('train.npz', train_idx), ('val.npz', val_idx), ('test.npz', test_idx)):
    np.savez(PROC/fname, X=X[idx], y=y[idx])

print('Saved to data/processed/: train.npz, val.npz, test.npz')


Saved to data/processed/: train.npz, val.npz, test.npz
