# DDD Colab Runner
Minimal notebook that calls stable functions from the repo.

Workflow:
1. Run setup + clone.
2. Run data download/extract (optional).
3. Validate data.
4. Build EGS JSONs.
5. Train.


## Mount Drive (optional)
Only needed if any configured path lives under `/content/drive`. The default `CHECKPOINT_DIR` does, so run this cell unless you change that path.


In [ ]:
# Mount Google Drive at /content/drive so paths under that prefix resolve.
from google.colab import drive
drive.mount('/content/drive')


## Config


In [ ]:
# ==== CONFIG ====
# Run identity and code source. RUN_NAME is embedded in EGS_DIR and
# CHECKPOINT_DIR below, so changing it gives the run its own output folders.
RUN_NAME = "ddd_48k_run1"
REPO_URL = "https://github.com/a-n-t-h-o-n-y/DDD.git"
REPO_DIR = "/content/DDD"
BRANCH = "main"

# Dataset archive: where it comes from, where it is stored, where it unpacks.
DATA_URL = ""  # set this to your dataset zip URL
DATA_ZIP_PATH = "/content/data_raw/training_data.zip"
DATA_EXTRACT_DIR = "/content/data_raw/audio"

# Generated EGS JSONs and run outputs.
EGS_DIR = f"/content/egs/{RUN_NAME}"
TEST_NOISY_DIR = "/content/test_noisy"
# Lives under /content/drive, so Drive must be mounted before training.
CHECKPOINT_DIR = f"/content/drive/MyDrive/DeClip Data/runs/{RUN_NAME}/checkpoints"

# Dataset split settings and the audio format the validation step expects.
SEED = 1337
TRAIN_RATIO = 0.9
VALID_RATIO = 0.05
TEST_RATIO = 0.05
SAMPLE_RATE = 48_000
CHANNELS = 1

# Training: forwarded as `save_every` to the training call.
# NOTE(review): unit (epochs vs. steps) is not visible here - confirm in colab_api.
SAVE_EVERY = 10


## Repo Setup


In [ ]:
# ==== SETUP: clone or update repo ====
import os
import subprocess

# An existing REPO_DIR is refreshed in place (fetch, switch to BRANCH, pull);
# otherwise BRANCH is cloned into it. check_call raises CalledProcessError on
# the first failing git command, so later steps never run after a failure.
if os.path.isdir(REPO_DIR):
    for git_args in (["fetch"], ["checkout", BRANCH], ["pull"]):
        subprocess.check_call(["git", "-C", REPO_DIR, *git_args])
else:
    subprocess.check_call(["git", "clone", "--branch", BRANCH, REPO_URL, REPO_DIR])


## Import API


In [ ]:
# ==== SETUP: import colab API ====
import sys

# `colab_api` is imported as a top-level module below, so the cloned repo
# directory has to be on sys.path first. The membership check keeps the entry
# from being appended again when this cell is re-run.
# NOTE(review): presumably colab_api.py sits at the repo root - confirm.
if REPO_DIR not in sys.path:
    sys.path.append(REPO_DIR)

from pathlib import Path
# Helpers used by the data, validation, EGS and training cells.
from colab_api import (
    build_egs_from_wavs,
    download_if_missing,
    extract_zip,
    list_wavs,
    run_hifigan_training,
    validate_wavs,
)


## Download + Extract Data


In [ ]:
# ==== DATA: download + extract ====
zip_path, extract_dir = Path(DATA_ZIP_PATH), Path(DATA_EXTRACT_DIR)

# An empty DATA_URL skips the download; extraction below runs either way.
# NOTE(review): in that case the archive presumably must already exist at
# DATA_ZIP_PATH - confirm how extract_zip treats a missing file.
if DATA_URL:
    download_if_missing(DATA_URL, zip_path)

extract_zip(zip_path, extract_dir)


## Validate Data


In [ ]:
# ==== DATA: validate ====
# `wav_paths` is reused by the EGS-building cell, so keep the name.
wav_paths = list_wavs(Path(DATA_EXTRACT_DIR))
report = validate_wavs(
    wav_paths,
    expected_sample_rate=SAMPLE_RATE,
    expected_channels=CHANNELS,
)

# Summary counts first, then a short preview of the offending files.
# This cell only reports; it does not stop the run when files are invalid.
print(f"Total files: {report.total_files}")
print(f"Invalid files: {len(report.invalid_files)}")
if report.invalid_files:
    print("First 5 issues:")
    for bad_file in report.invalid_files[:5]:
        print(f"- {bad_file.path}: sr={bad_file.sample_rate}, ch={bad_file.channels}")


## Build EGS JSONs


In [ ]:
# ==== DATA: build EGS ====
# Split settings are grouped so the seed and ratios read as one unit; they are
# passed through as the same keyword arguments.
split_settings = {
    "seed": SEED,
    "train_ratio": TRAIN_RATIO,
    "valid_ratio": VALID_RATIO,
    "test_ratio": TEST_RATIO,
}

# `egs` is reused by the training cell, so keep the name.
egs = build_egs_from_wavs(
    wav_paths=wav_paths,
    egs_dir=Path(EGS_DIR),
    test_noisy_dir=Path(TEST_NOISY_DIR),
    **split_settings,
)

# Echo the per-split directories reported by the builder.
print("Train EGS dir:", egs.train_dir)
print("Valid EGS dir:", egs.valid_dir)
print("Test EGS dir:", egs.test_dir)


## Train


In [ ]:
# ==== TRAIN ====
# Collect the arguments first: the three split directories come from the EGS
# build result, everything else from the config cell.
training_args = {
    "train_dir": Path(egs.train_dir),
    "valid_dir": Path(egs.valid_dir),
    "test_dir": Path(egs.test_dir),
    "output_dir": Path(CHECKPOINT_DIR),
    "sample_rate": SAMPLE_RATE,
    "save_every": SAVE_EVERY,
}

# Kept as the cell's final bare expression so any return value is still displayed.
run_hifigan_training(**training_args)
