# Train Custom openWakeWord Model

Windows-compatible version of the openWakeWord training notebook.

## Prerequisites

### Setup the Jupyter kernel (one-time)

```bash
uv add ipykernel --dev
uv run python -m ipykernel install --user --name voice-gateway
```

### Run in VS Code

1. Open this notebook in VS Code
2. Click **Select Kernel** (top right) → **voice-gateway**
3. Run the cells!

## 1. Install Dependencies

In [None]:
# Set to True for NVIDIA GPU support (requires CUDA installed).
# True  -> PyTorch is installed from PyTorch's cu124 wheel index.
# False -> PyTorch is installed from the default package index.
USE_GPU = True

In [None]:
import subprocess
import sys

# Install PyTorch into the environment of the interpreter running this kernel.
# `--python sys.executable` makes uv target the kernel's environment explicitly
# instead of whichever virtual environment it discovers from the working directory.
torch_install_cmd = [
    "uv", "pip", "install",
    "--python", sys.executable,
    "torch>=2.0,<2.6", "torchaudio",
]

if USE_GPU:
    print("Installing PyTorch with CUDA support...")
    # CUDA-enabled wheels are served from PyTorch's own package index
    torch_install_cmd += ["--index-url", "https://download.pytorch.org/whl/cu124"]
else:
    print("Installing PyTorch (CPU only)...")

# check=True: stop the notebook here if the install fails
subprocess.run(torch_install_cmd, check=True)

# Verify GPU is available
# (torch is imported only now because it may have just been installed above)
import torch
if torch.cuda.is_available():
    print(f"GPU available: {torch.cuda.get_device_name(0)}")
else:
    print("No GPU detected - will use CPU")

In [None]:
# Other dependencies
deps = [
    "webrtcvad",
    "piper-tts",
    "piper-phonemize-cross",
    "numpy",
    "scipy",
    "tqdm",
    "datasets==2.14.6",
    "pyyaml",
    "onnxruntime",
    "onnx",
    "onnx2tf",
    "onnxsim",
    "onnx-graphsurgeon",
    "sng4onnx",
    "pronouncing",
    "deep-phonemizer",
    "mutagen",
    "torchinfo",
    "torchmetrics",
    "speechbrain==0.5.14",
    "audiomentations",
    "torch-audiomentations",
    "acoustics",
    "requests",
    "ipywidgets",
]

# How many trailing lines of the installer's error output to show per failure
ERROR_TAIL_LINES = 5

# Install dependencies one by one to see which fail.
# A failure is recorded and reported rather than raised, so the remaining
# packages are still attempted.
failed = []
for dep in deps:
    result = subprocess.run(
        # --python: install into the environment of the interpreter running this kernel
        ["uv", "pip", "install", "--python", sys.executable, dep],
        capture_output=True,
        # Decode explicitly and tolerate undecodable bytes so that unusual
        # characters in the installer output cannot break the loop.
        encoding="utf-8",
        errors="replace",
    )
    if result.returncode != 0:
        failed.append(dep)
        print(f"Failed: {dep}")
        # Show why it failed instead of silently discarding the captured stderr
        for error_line in result.stderr.strip().splitlines()[-ERROR_TAIL_LINES:]:
            print(f"    {error_line}")
    else:
        print(f"Installed: {dep}")

if failed:
    print(f"\nWarning: Failed to install: {failed}")
    print("Some features may not work, but core training should still function.")
else:
    print("\nAll dependencies installed successfully!")

## 2. Setup Repositories

In [None]:
import os
import subprocess
from pathlib import Path
import requests
from tqdm.auto import tqdm

# Absolute path of the directory the notebook is running in. The clone,
# download, config and output paths used later are all built from it.
SCRIPT_DIR = Path(".").resolve()
print(f"Working directory: {SCRIPT_DIR}")

In [None]:
# Fetch piper-sample-generator next to the notebook (skipped when the folder
# is already present) and pin it to a fixed commit.
PIPER_DIR = SCRIPT_DIR / "piper-sample-generator"

if PIPER_DIR.exists():
    print("piper-sample-generator already exists")
else:
    print("Cloning piper-sample-generator...")
    clone_cmd = ["git", "clone", "https://github.com/rhasspy/piper-sample-generator"]
    checkout_cmd = ["git", "checkout", "213d4d5"]
    # Clone runs from the notebook directory; checkout runs inside the new clone
    subprocess.run(clone_cmd, cwd=SCRIPT_DIR, check=True)
    subprocess.run(checkout_cmd, cwd=PIPER_DIR, check=True)

In [None]:
# Download piper model
MODELS_DIR = PIPER_DIR / "models"
MODELS_DIR.mkdir(exist_ok=True)
MODEL_PATH = MODELS_DIR / "en_US-libritts_r-medium.pt"

if not MODEL_PATH.exists():
    print("Downloading piper model...")
    url = "https://github.com/rhasspy/piper-sample-generator/releases/download/v2.0.0/en_US-libritts_r-medium.pt"
    # Stream into a temporary ".part" file and rename it only once the transfer
    # has finished. An interrupted download then never leaves a truncated file
    # at MODEL_PATH that a later run would mistake for a complete one.
    partial_path = MODEL_PATH.with_name(MODEL_PATH.name + ".part")
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        # content-length may be missing; 0 gives a progress bar without a total
        total = int(response.headers.get("content-length", 0))
        with open(partial_path, "wb") as f:
            with tqdm(total=total, unit="B", unit_scale=True, desc="Piper model") as pbar:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
                    pbar.update(len(chunk))
    # The file is closed at this point, so the rename also works on Windows
    partial_path.replace(MODEL_PATH)
else:
    print("Piper model already downloaded")

In [None]:
# Fetch the openwakeword repository next to the notebook (skipped when the
# folder is already present).
OWW_DIR = SCRIPT_DIR / "openwakeword"

if OWW_DIR.exists():
    print("openwakeword already exists")
else:
    print("Cloning openwakeword...")
    clone_cmd = ["git", "clone", "https://github.com/dscripka/openwakeword"]
    subprocess.run(clone_cmd, cwd=SCRIPT_DIR, check=True)

In [None]:
# Download openwakeword models
OWW_MODELS_DIR = OWW_DIR / "openwakeword" / "resources" / "models"
OWW_MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Target file name -> release asset URL
model_urls = {
    "embedding_model.onnx": "https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/embedding_model.onnx",
    "embedding_model.tflite": "https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/embedding_model.tflite",
    "melspectrogram.onnx": "https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/melspectrogram.onnx",
    "melspectrogram.tflite": "https://github.com/dscripka/openWakeWord/releases/download/v0.5.1/melspectrogram.tflite",
}

for filename, url in model_urls.items():
    filepath = OWW_MODELS_DIR / filename
    if not filepath.exists():
        print(f"Downloading {filename}...")
        # Write to a temporary ".part" file and rename it only after the whole
        # body has been received, so an interrupted download is retried on the
        # next run instead of being treated as an existing, complete file.
        partial_path = filepath.with_name(filename + ".part")
        with requests.get(url, stream=True) as response:
            response.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        # The file is closed at this point, so the rename also works on Windows
        partial_path.replace(filepath)
    else:
        print(f"{filename} already exists")

## 3. Test Wake Word Pronunciation

Before training, verify the TTS pronounces your wake word correctly.

**Tips:**
- If pronunciation is wrong, spell it phonetically with underscores: `"hey_seer_e"` for "hey siri"
- Spell out numbers: `"two"` not `"2"`
- Avoid punctuation except `?` and `!`

In [None]:
# Configure your wake word here!
# This text is spoken by the pronunciation test and used as the training target
# phrase; with spaces replaced by underscores it also becomes the model name.
TARGET_WORD = "Seraphina"  # Change this to your desired wake word

In [None]:
import sys
# Put the piper-sample-generator checkout at the front of the import path so
# `generate_samples` can be imported from it. The membership check keeps the
# entry from being added again when the cell is re-run.
if str(PIPER_DIR) not in sys.path:
    sys.path.insert(0, str(PIPER_DIR))

from generate_samples import generate_samples
from IPython.display import Audio, display

def test_pronunciation(text: str):
    """Synthesize a single clip of ``text`` and play it inline.

    The clip is requested as ``test_generation.wav`` in ``SCRIPT_DIR`` and is
    then embedded in the notebook output with autoplay enabled.

    Args:
        text: Phrase handed to ``generate_samples`` for synthesis.
    """
    clip_name = "test_generation.wav"
    # One sample, one value per synthesis scale
    synthesis_options = dict(
        text=text,
        max_samples=1,
        length_scales=[1.1],
        noise_scales=[0.7],
        noise_scale_ws=[0.7],
        output_dir=str(SCRIPT_DIR),
        batch_size=1,
        auto_reduce_batch_size=True,
        file_names=[clip_name],
    )
    generate_samples(**synthesis_options)

    clip_path = SCRIPT_DIR / clip_name
    display(Audio(str(clip_path), autoplay=True))

test_pronunciation(TARGET_WORD)

## 4. Download Training Data

This downloads:
- Pre-computed openWakeWord features (~16GB) - for negative examples
- Validation set features (~180MB) - for false positive estimation

**Note:** The 16GB download takes a while. You can skip it with `SKIP_LARGE_DOWNLOAD = True` but training quality will be lower.

In [None]:
# When True, the large training-feature download is skipped. If that file is not
# already on disk, the training config is then written with no feature data files.
SKIP_LARGE_DOWNLOAD = False  # Set to True to skip the 16GB download

In [None]:
# Download validation features (small, always download)
VAL_PATH = SCRIPT_DIR / "validation_set_features.npy"

if not VAL_PATH.exists():
    print("Downloading validation features...")
    url = "https://huggingface.co/datasets/davidscripka/openwakeword_features/resolve/main/validation_set_features.npy"
    # Stream into a temporary ".part" file and rename it only once the transfer
    # has finished. An interrupted download then never leaves a truncated .npy
    # at VAL_PATH that a later run would treat as already downloaded.
    partial_path = VAL_PATH.with_name(VAL_PATH.name + ".part")
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        # content-length may be missing; 0 gives a progress bar without a total
        total = int(response.headers.get("content-length", 0))
        with open(partial_path, "wb") as f:
            with tqdm(total=total, unit="B", unit_scale=True, desc="Validation features") as pbar:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
                    pbar.update(len(chunk))
    # The file is closed at this point, so the rename also works on Windows
    partial_path.replace(VAL_PATH)
else:
    print("Validation features already downloaded")

In [None]:
# Download training features (large)
FEATURES_PATH = SCRIPT_DIR / "openwakeword_features_ACAV100M_2000_hrs_16bit.npy"

if SKIP_LARGE_DOWNLOAD:
    print("Skipping large feature download (training quality will be reduced)")
elif not FEATURES_PATH.exists():
    print("Downloading training features (16GB, this will take a while)...")
    url = "https://huggingface.co/datasets/davidscripka/openwakeword_features/resolve/main/openwakeword_features_ACAV100M_2000_hrs_16bit.npy"
    # Stream into a temporary ".part" file and rename it only once the transfer
    # has finished. With a download this large an interruption is likely; without
    # this, the truncated file would sit at FEATURES_PATH and later runs would
    # both skip the download and hand the broken file to training.
    partial_path = FEATURES_PATH.with_name(FEATURES_PATH.name + ".part")
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        # content-length may be missing; 0 gives a progress bar without a total
        total = int(response.headers.get("content-length", 0))
        with open(partial_path, "wb") as f:
            with tqdm(total=total, unit="B", unit_scale=True, desc="Training features") as pbar:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
                    pbar.update(len(chunk))
    # The file is closed at this point, so the rename also works on Windows
    partial_path.replace(FEATURES_PATH)
else:
    print("Training features already downloaded")

## 5. Configure Training

Adjust these parameters:
- `N_SAMPLES`: Number of synthetic examples (1000 is quick, 30000-50000 is better)
- `N_STEPS`: Training steps (10000 is quick, more is better)
- `FALSE_ACTIVATION_PENALTY`: Higher = fewer false activations but may miss quiet/noisy speech

In [None]:
# Number of synthetic examples; written to the training config as "n_samples"
N_SAMPLES = 1000
# Number of training steps; written to the training config as "steps"
N_STEPS = 10000
# Written to the training config as "max_negative_weight"
FALSE_ACTIVATION_PENALTY = 1500

In [None]:
import yaml

# Load default config.
# The encoding is given explicitly because the platform default (typically a
# legacy code page on Windows) is not guaranteed to be UTF-8.
with open(OWW_DIR / "examples" / "custom_model.yml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Model name: the wake word with spaces replaced by underscores
model_name = TARGET_WORD.replace(" ", "_")

# Modify config
config["target_phrase"] = [TARGET_WORD]
config["model_name"] = model_name
config["n_samples"] = N_SAMPLES
# Validation set is a tenth of the training samples, but never fewer than 500
config["n_samples_val"] = max(500, N_SAMPLES // 10)
config["steps"] = N_STEPS
config["target_accuracy"] = 0.5
config["target_recall"] = 0.25
config["output_dir"] = str(SCRIPT_DIR / "my_custom_model")
config["max_negative_weight"] = FALSE_ACTIVATION_PENALTY

# Data paths
config["background_paths"] = []  # Empty - we're using pre-computed features
config["false_positive_validation_data_path"] = str(VAL_PATH)

# Only reference the pre-computed training features when the file is on disk
if FEATURES_PATH.exists():
    config["feature_data_files"] = {"ACAV100M_sample": str(FEATURES_PATH)}
else:
    config["feature_data_files"] = {}

# Save config
CONFIG_PATH = SCRIPT_DIR / "my_model.yaml"
with open(CONFIG_PATH, "w", encoding="utf-8") as f:
    yaml.dump(config, f)

print(f"Config saved to: {CONFIG_PATH}")
print("\nTraining configuration:")
print(f"  Target word: {TARGET_WORD}")
print(f"  Samples: {N_SAMPLES}")
print(f"  Steps: {N_STEPS}")
print(f"  Output: {config['output_dir']}")

## 6. Train the Model

This runs three steps:
1. **Generate clips** - Create synthetic audio of your wake word
2. **Augment clips** - Add noise, reverb, etc. for robustness
3. **Train model** - Train the neural network

With default settings, this takes 30-60 minutes on CPU.

In [None]:
# Training script inside the cloned openwakeword repository
TRAIN_SCRIPT = OWW_DIR / "openwakeword" / "train.py"

banner = "=" * 50
print(banner, "Step 1: Generating training clips", banner, sep="\n")

# Run the script with the same interpreter as this kernel
generate_cmd = [
    sys.executable,
    str(TRAIN_SCRIPT),
    "--training_config", str(CONFIG_PATH),
    "--generate_clips",
]
subprocess.run(generate_cmd, check=True)

In [None]:
banner = "=" * 50
print(banner, "Step 2: Augmenting clips", banner, sep="\n")

# Run the training script with the same interpreter as this kernel
augment_cmd = [
    sys.executable,
    str(TRAIN_SCRIPT),
    "--training_config", str(CONFIG_PATH),
    "--augment_clips",
]
subprocess.run(augment_cmd, check=True)

In [None]:
banner = "=" * 50
print(banner, "Step 3: Training model", banner, sep="\n")

# Run the training script with the same interpreter as this kernel
train_cmd = [
    sys.executable,
    str(TRAIN_SCRIPT),
    "--training_config", str(CONFIG_PATH),
    "--train_model",
]
subprocess.run(train_cmd, check=True)

## 7. Done!

Your trained model is in the `my_custom_model` folder. You'll find:
- `{TARGET_WORD}.onnx` - ONNX format model (spaces in the wake word are replaced with underscores in the file name)
- `{TARGET_WORD}.tflite` - TensorFlow Lite format model (same naming)

Copy the `.onnx` or `.tflite` file to your wakewords folder to use it!

In [None]:
# Directory the training run was configured to write into
OUTPUT_DIR = Path(config["output_dir"])

print("\nTraining complete! Model files:")
# List every entry directly inside the output directory
for model_file in OUTPUT_DIR.glob("*"):
    print(f"  {model_file}")

In [None]:
# Optional: Copy to wakewords folder
import shutil

# File stem of the trained model: the wake word with spaces replaced by underscores
model_name = TARGET_WORD.replace(" ", "_")

# Name the destination folder after the wake word instead of hard-coding it, so
# changing TARGET_WORD does not drop a different model into the "seraphina"
# folder. For the default wake word this still resolves to "seraphina".
WAKEWORD_DIR = SCRIPT_DIR.parent / "wakewords" / model_name.lower()
WAKEWORD_DIR.mkdir(parents=True, exist_ok=True)

copied = []
for ext in [".onnx", ".tflite"]:
    src = OUTPUT_DIR / f"{model_name}{ext}"
    # Only copy the formats that are actually present in the output directory
    if src.exists():
        dst = WAKEWORD_DIR / f"{model_name}{ext}"
        shutil.copy(src, dst)
        copied.append(dst)
        print(f"Copied {src.name} to {dst}")

# Say so explicitly rather than finishing silently when nothing was found
if not copied:
    print(f"No model files named '{model_name}' found in {OUTPUT_DIR}")