# 🇫🇮 Finnish TTS Training - Brev Launchables

**Production-ready training notebook for Fish Speech + LoRA fine-tuning**

## Quick Start

```bash
# 1. SSH into Brev instance
ssh user@instance-ip

# 2. Authenticate with HuggingFace
huggingface-cli login  # Paste your token

# 3. Run setup (if not already done)
cd ~/nvidia-brev-launchables
bash setup.sh

# 4. Start training
jupyter notebook finnish-tts-launchables.ipynb
```

---

## Step 0: Pre-flight Checks

Verify all prerequisites before training.

In [None]:
import os
import sys
from pathlib import Path
import subprocess
import json

# Pre-flight: verify that setup.sh ran, a HuggingFace token is available, a
# CUDA GPU is visible and the Fish Speech checkout exists. A missing GPU is
# fatal (RuntimeError); every other problem is recorded in
# `preflight_warnings` so the closing banner reflects what was actually found
# instead of always claiming success.
print("="*70)
print("PRE-FLIGHT CHECKS")
print("="*70)

preflight_warnings = []

# Check setup.sh was run
setup_state_file = Path.home() / '.finnish-tts-setup-state'
if setup_state_file.exists():
    with open(setup_state_file) as f:
        # Drop blank lines so an empty state file reports 0 steps instead of 1.
        steps = [line for line in f.read().splitlines() if line.strip()]
    print(f"\n‚úÖ Setup completed ({len(steps)} steps)")
    for step in steps:
        print(f"   ‚úì {step}")
else:
    print("\n‚ö†Ô∏è  setup.sh may not have been run completely")
    print("   Run: bash setup.sh")
    preflight_warnings.append("setup.sh state file not found")

# Check HF authentication
print("\n‚úì Checking HuggingFace authentication...")
# The token may come from the HF_TOKEN environment variable or from the
# `token` file under HF_HOME (default: ~/.cache/huggingface).
hf_home = Path(
    os.environ.get('HF_HOME') or (Path.home() / '.cache' / 'huggingface')
).expanduser()
hf_cache = hf_home / 'token'
if os.environ.get('HF_TOKEN') or hf_cache.exists():
    print("  ‚úÖ HuggingFace token found")
else:
    print("  ‚ö†Ô∏è  HuggingFace token not found")
    print("     Run: huggingface-cli login")
    preflight_warnings.append("HuggingFace token not found")

# Check GPU
print("\n‚úì Checking GPU...")
import torch
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1024**3  # bytes -> GiB
    print(f"  ‚úÖ GPU: {gpu_name} ({gpu_mem:.1f}GB)")
else:
    print("  ‚ùå No GPU detected! Aborting.")
    raise RuntimeError("GPU required for training")

# Check Fish Speech
fish_dir = Path.home() / 'fish-speech'
if fish_dir.exists():
    print(f"  ‚úÖ Fish Speech: {fish_dir}")
else:
    print(f"  ‚ö†Ô∏è  Fish Speech not found at {fish_dir}")
    preflight_warnings.append(f"Fish Speech not found at {fish_dir}")

print("\n" + "="*70)
if preflight_warnings:
    # Report what needs attention rather than announcing an unqualified pass.
    print(f"Pre-flight checks finished with {len(preflight_warnings)} warning(s):")
    for warning_text in preflight_warnings:
        print(f"   - {warning_text}")
else:
    print("‚úÖ Pre-flight checks passed!")
print("="*70)

## Step 1: Environment Setup

Configure paths and imports.

In [None]:
# os and sys are imported here as well so this cell also runs on a fresh
# kernel where the pre-flight cell was skipped.
import os
import sys
from pathlib import Path
import torch
import subprocess
from tqdm.auto import tqdm
import json
from datetime import datetime
import glob

# Paths
HOME = Path.home()
REPO_DIR = HOME / 'nvidia-brev-launchables'
FISH_SPEECH_DIR = HOME / 'fish-speech'
WORK_DIR = HOME / 'finnish-tts-training'  # fallback work directory

# Ensure work directory exists, then make it the current directory
WORK_DIR.mkdir(exist_ok=True, parents=True)
os.chdir(WORK_DIR)

# Add Fish Speech to the import path; skip when already present so that
# re-running the cell does not pile up duplicate entries.
if str(FISH_SPEECH_DIR) not in sys.path:
    sys.path.insert(0, str(FISH_SPEECH_DIR))

print(f"Repository: {REPO_DIR}")
print(f"Fish Speech: {FISH_SPEECH_DIR}")
print(f"Work Dir: {WORK_DIR}")
print(f"\n‚úÖ Paths configured")

## Step 2: GPU Auto-Configuration

Detect GPU and set optimal training parameters.

In [None]:
# Step 2: inspect CUDA device 0 and choose batch size, data-loader workers and
# gradient accumulation to match it.
print("=" * 70)
print("GPU AUTO-CONFIGURATION")
print("=" * 70)

gpu_name = torch.cuda.get_device_name(0)
gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3  # bytes -> GiB

print(
    f"\n‚úÖ GPU: {gpu_name}",
    f"‚úÖ VRAM: {gpu_memory:.1f} GB",
    f"‚úÖ CUDA: {torch.version.cuda}",
    f"‚úÖ PyTorch: {torch.__version__}",
    sep="\n",
)

# Known cards get a fixed profile; anything else is sized by VRAM alone.
if 'L40S' in gpu_name:
    BATCH_SIZE, NUM_WORKERS, ACCUMULATE_GRAD = 4, 6, 1
    GPU_CONFIG = "L40S (48GB)"
elif 'A100' in gpu_name and gpu_memory > 70:
    BATCH_SIZE, NUM_WORKERS, ACCUMULATE_GRAD = 8, 8, 1
    GPU_CONFIG = "A100-80GB"
elif 'A100' in gpu_name:
    BATCH_SIZE, NUM_WORKERS, ACCUMULATE_GRAD = 6, 6, 1
    GPU_CONFIG = "A100-40GB"
elif 'H100' in gpu_name:
    BATCH_SIZE, NUM_WORKERS, ACCUMULATE_GRAD = 10, 10, 1
    GPU_CONFIG = "H100-80GB"
else:
    # Conservative defaults for unrecognised hardware
    ACCUMULATE_GRAD = 2
    GPU_CONFIG = f"{gpu_name} ({gpu_memory:.1f}GB)"
    if gpu_memory > 40:
        BATCH_SIZE, NUM_WORKERS = 4, 4
    elif gpu_memory > 20:
        BATCH_SIZE, NUM_WORKERS = 2, 4
    else:
        BATCH_SIZE, NUM_WORKERS = 1, 2

print(
    f"\nüéØ Config: {GPU_CONFIG}",
    f"   Batch size: {BATCH_SIZE}",
    f"   Workers: {NUM_WORKERS}",
    f"   Grad accumulation: {ACCUMULATE_GRAD}",
    sep="\n",
)

# Best-effort nvidia-smi snapshot; any failure is reported and then ignored.
smi_command = [
    'nvidia-smi',
    '--query-gpu=name,memory.total,memory.free,temperature.gpu',
    '--format=csv,noheader',
]
try:
    result = subprocess.run(smi_command, capture_output=True, text=True, timeout=5)
    print(f"\nüìä GPU Stats:\n{result.stdout}")
except Exception as e:
    print(f"\n‚ö†Ô∏è  Could not query GPU: {e}")

print("\n" + "=" * 70)

## Step 3: Dataset Validation

Validate your Finnish audio dataset before training.

In [None]:
import librosa
import soundfile as sf
import numpy as np
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Step 3: locate the dataset folder, then report file counts, WAV/LAB pairing
# and basic audio properties for a small sample of the recordings.

# Candidate dataset folders, probed in order
DATASET_OPTIONS = [
    WORK_DIR / 'data' / 'FinnishSpeaker',
    FISH_SPEECH_DIR / 'data' / 'FinnishSpeaker',
    HOME / 'finnish-tts-training' / 'data' / 'FinnishSpeaker',
]

# First candidate that exists and holds at least one WAV file, else None
DATA_DIR = next(
    (
        candidate
        for candidate in DATASET_OPTIONS
        if candidate.exists() and list(candidate.glob('*.wav'))
    ),
    None,
)

if DATA_DIR is not None:
    print(f"‚úÖ Dataset found at: {DATA_DIR}")
else:
    print("‚ö†Ô∏è  No dataset found!")
    print(f"\nExpected location: {WORK_DIR}/data/FinnishSpeaker/")
    print("\nPlease:")
    print("1. Upload your dataset via SCP:")
    print(f"   scp -r your_data/ user@instance:{WORK_DIR}/data/FinnishSpeaker/")
    print("2. Then re-run this cell")
    # Create the expected folder so the upload target already exists
    DATA_DIR = WORK_DIR / 'data' / 'FinnishSpeaker'
    DATA_DIR.mkdir(parents=True, exist_ok=True)

wav_files = list(DATA_DIR.glob('*.wav')) if DATA_DIR.exists() else []

if not wav_files:
    print(f"\n‚ö†Ô∏è  Awaiting dataset at: {DATA_DIR}")
else:
    print("\n" + "="*70)
    print("DATASET VALIDATION")
    print("="*70)

    lab_files = list(DATA_DIR.glob('*.lab'))
    npy_files = list(DATA_DIR.glob('*.npy'))

    print(f"\nüìä File counts:")
    print(f"   WAV files: {len(wav_files)}")
    print(f"   LAB files: {len(lab_files)}")
    print(f"   NPY files: {len(npy_files)} (VQ tokens)")

    # Pair WAV and LAB files by stem
    wav_stems = {item.stem for item in wav_files}
    lab_stems = {item.stem for item in lab_files}
    missing_lab = wav_stems - lab_stems
    missing_wav = lab_stems - wav_stems

    if missing_lab:
        print(f"\n‚ö†Ô∏è  {len(missing_lab)} WAV files missing LAB")
    if missing_wav:
        print(f"\n‚ö†Ô∏è  {len(missing_wav)} LAB files missing WAV")
    if not (missing_lab or missing_wav):
        print("\n‚úÖ All WAV-LAB pairs match!")

    # Audio validation on at most the first ten files
    sample_files = wav_files[:10]
    print(f"\nüîä Validating audio quality (sampling {len(sample_files)} files)...")

    durations, sample_rates, channels_list = [], [], []

    for clip in tqdm(sample_files, desc="Checking", leave=False):
        try:
            audio, sr = librosa.load(clip, sr=None)  # sr=None keeps the native rate
            durations.append(len(audio) / sr)
            sample_rates.append(sr)
            channels_list.append(sf.info(clip).channels)
        except Exception as e:
            print(f"‚ùå Error: {clip.name}: {e}")

    if durations:
        print(f"   Average duration: {np.mean(durations):.2f}s")
        print(f"   Duration range: {np.min(durations):.2f}s - {np.max(durations):.2f}s")

        sr_counts = Counter(sample_rates)
        print(f"   Sample rates: {dict(sr_counts)}")

        if any(rate != 24000 for rate in sample_rates):
            print("   ‚ö†Ô∏è  Mixed sample rates - should be 24kHz")
        else:
            print("   ‚úÖ All samples at 24kHz (correct for Fish Speech)")

        channel_counts = Counter(channels_list)
        print(f"   Channels: {dict(channel_counts)}")

        if any(count != 1 for count in channels_list):
            print("   ‚ö†Ô∏è  Some samples are stereo - should be mono")
        else:
            print("   ‚úÖ All samples are mono (correct)")

    print("\n" + "="*70)

## Step 4: Download Base Model

Ensure base model is available.

In [None]:
from huggingface_hub import snapshot_download, list_repo_files
import os

# Step 4: make sure the base checkpoint is available locally, downloading it
# from the HuggingFace Hub on first use, then list what is in the folder.

# Fail with an actionable message instead of a bare chdir error when the
# Fish Speech checkout is missing.
if not FISH_SPEECH_DIR.exists():
    raise FileNotFoundError(
        f"Fish Speech not found at {FISH_SPEECH_DIR}. Run: bash setup.sh"
    )
os.chdir(FISH_SPEECH_DIR)

BASE_MODEL_PATH = FISH_SPEECH_DIR / 'checkpoints' / 'openaudio-s1-mini'
BASE_MODEL_PATH.mkdir(parents=True, exist_ok=True)

print("="*70)
print("BASE MODEL DOWNLOAD")
print("="*70)

# model.pth is the file the training step loads, so its presence decides
# whether a download is needed.
model_file = BASE_MODEL_PATH / 'model.pth'

if model_file.exists():
    print(f"\n‚úÖ Base model already exists at: {BASE_MODEL_PATH}")
else:
    print(f"\nüì• Downloading base model...")
    # Only the download itself is guarded, so an unrelated error (for example
    # while listing files) is never reported as a download failure.
    try:
        snapshot_download(
            repo_id="fishaudio/openaudio-s1-mini",
            local_dir=str(BASE_MODEL_PATH),
            local_dir_use_symlinks=False
        )
    except Exception as e:
        print(f"‚ùå Model download failed: {e}")
        print(f"\nTroubleshooting:")
        print(f"1. Check HuggingFace authentication: huggingface-cli login")
        print(f"2. Accept model license: https://huggingface.co/fishaudio/openaudio-s1-mini")
        print(f"3. Try again")
        raise
    print(f"‚úÖ Model downloaded to: {BASE_MODEL_PATH}")

    # A download that completes without model.pth would otherwise only
    # surface later as a confusing training failure.
    if not model_file.exists():
        raise FileNotFoundError(f"Download finished but {model_file} is missing")

print("\nModel files:")
for file in sorted(BASE_MODEL_PATH.glob('*')):
    size = file.stat().st_size / 1024**2  # MB
    print(f"  {file.name:40s} {size:8.1f} MB")

print("\n" + "="*70)

## Step 5: Training Configuration

Configure training parameters.

In [None]:
# Step 5: gather the training hyper-parameters in one dict and print a rough
# time and cost estimate for the run.
TRAINING_CONFIG = dict(
    project='FinnishSpeaker_2000_finetune',
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    max_steps=2000,  # Full training
    val_check_interval=100,  # Validate every 100 steps
    accumulate_grad_batches=ACCUMULATE_GRAD,
    lora_r=8,
    lora_alpha=16,
)

print("=" * 70)
print("TRAINING CONFIGURATION")
print("=" * 70)
print(
    f"\nProject: {TRAINING_CONFIG['project']}",
    f"Batch size: {TRAINING_CONFIG['batch_size']}",
    f"Workers: {TRAINING_CONFIG['num_workers']}",
    f"Max steps: {TRAINING_CONFIG['max_steps']}",
    f"LoRA: r={TRAINING_CONFIG['lora_r']}, alpha={TRAINING_CONFIG['lora_alpha']}",
    sep="\n",
)

# Rule of thumb used here: one validation interval takes about 15 minutes,
# i.e. four intervals per hour.
steps_per_hour = TRAINING_CONFIG['val_check_interval'] * (3600 / (15 * 60))
total_hours = TRAINING_CONFIG['max_steps'] / steps_per_hour
estimated_cost = 1.44 * total_hours  # $1.44/hr for L40S (adjust if needed)

print(f"\n‚è±Ô∏è  Estimated time: {total_hours:.1f} hours")
print(f"üí∞ Estimated cost: ${estimated_cost:.2f} (L40S @ $1.44/hr)")
print(f"\nNote: Actual times vary by GPU and dataset complexity")
print("\n" + "=" * 70)

## Step 6: Start Training

Launch the training process. This will take 1-4 hours depending on GPU.

In [None]:
import os
import subprocess
import sys
import time

# Step 6: run Fish Speech training as a child process and stream its output
# into the notebook. A non-zero exit code is reported but not raised.
os.chdir(FISH_SPEECH_DIR)

print("="*70)
print("üöÄ STARTING TRAINING")
print("="*70)
print(f"\nTimestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"GPU: {gpu_name}")
print(f"Dataset: {DATA_DIR}")
print(f"\nThis will take approximately {total_hours:.1f} hours...\n")

# Build training command. sys.executable is used instead of a bare 'python'
# so training runs in the same interpreter/environment as this kernel.
cmd = [
    sys.executable, 'fish_speech/train.py',
    '--config-name', 'text2semantic_finetune',
    f'project={TRAINING_CONFIG["project"]}',
    f'train_dataset.proto_files={FISH_SPEECH_DIR}/data/protos',
    f'trainer.max_steps={TRAINING_CONFIG["max_steps"]}',
    f'trainer.val_check_interval={TRAINING_CONFIG["val_check_interval"]}',
    f'model.lora_config.r={TRAINING_CONFIG["lora_r"]}',
    f'model.lora_config.lora_alpha={TRAINING_CONFIG["lora_alpha"]}',
    f'data.batch_size={TRAINING_CONFIG["batch_size"]}',
    f'data.num_workers={TRAINING_CONFIG["num_workers"]}',
    f'trainer.accumulate_grad_batches={TRAINING_CONFIG["accumulate_grad_batches"]}',
    'pretrained_ckpt_path=checkpoints/openaudio-s1-mini/model.pth',
]

print("Command:")
print(' '.join(cmd))
print("\n" + "="*70 + "\n")

# Bound before the try so the handlers can tell whether the child was started.
process = None
try:
    process = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # interleave stderr with stdout in one stream
        text=True,
        bufsize=1  # line-buffered so output appears as it is produced
    )

    # Stream output
    for line in process.stdout:
        print(line, end='')

    return_code = process.wait()

    if return_code == 0:
        print(f"\n‚úÖ Training completed successfully!")
    else:
        print(f"\n‚ùå Training failed with return code {return_code}")

except KeyboardInterrupt:
    print("\n‚ö†Ô∏è  Training interrupted by user")
except Exception as e:
    print(f"\n‚ùå Error: {e}")
    raise
finally:
    # Stop and reap the child whenever it is still running (interrupt or
    # error), so no orphaned training job keeps holding the GPU.
    if process is not None and process.poll() is None:
        process.terminate()
        try:
            process.wait(timeout=30)
        except subprocess.TimeoutExpired:
            process.kill()
            process.wait()

print("\n" + "="*70)

## Step 7: Monitor Training (Optional)

Check training progress in real-time.

In [None]:
# Step 7: show the newest checkpoints and the tail of the training log for
# the current project.
project_dir = FISH_SPEECH_DIR / 'results' / TRAINING_CONFIG['project']

if not project_dir.exists():
    print(f"‚ö†Ô∏è  Project directory not found: {project_dir}")
else:
    print(f"Project directory: {project_dir}")

    # Five most recent checkpoints, in sorted filename order
    ckpt_dir = project_dir / 'checkpoints'
    if ckpt_dir.exists():
        ckpts = sorted(ckpt_dir.glob('step_*.ckpt'))
        print(f"\nüì¶ Checkpoints ({len(ckpts)}):")
        for recent in ckpts[-5:]:
            size_mb = recent.stat().st_size / 1024**2
            print(f"  {recent.name:40s} {size_mb:8.1f} MB")

    # Last ten lines of the training log
    log_file = project_dir / 'train.log'
    if log_file.exists():
        print(f"\nüìù Recent logs:")
        with open(log_file) as f:
            tail = f.readlines()[-10:]
        for entry in tail:
            print(entry.strip())

## Step 8: Export Model

Prepare trained model for download.

In [None]:
import tarfile

# Step 8: bundle the checkpoint directory (plus train.log when present) into a
# timestamped .tar.gz under WORK_DIR/exports and print how to download it.
print("=" * 70)
print("MODEL EXPORT")
print("=" * 70)

project_dir = FISH_SPEECH_DIR / 'results' / TRAINING_CONFIG['project']
if project_dir.exists():
    ckpt_dir = project_dir / 'checkpoints'
    if not ckpt_dir.exists():
        print(f"‚ö†Ô∏è  Checkpoint directory not found: {ckpt_dir}")
    else:
        ckpts = sorted(ckpt_dir.glob('step_*.ckpt'))
        if not ckpts:
            print("‚ö†Ô∏è  No checkpoints found")
        else:
            # Last entry in sorted filename order
            latest_ckpt = ckpts[-1]
            print(f"\nüì¶ Latest checkpoint: {latest_ckpt.name}")

            # Destination folder for archives
            export_dir = WORK_DIR / 'exports'
            export_dir.mkdir(exist_ok=True)

            # Timestamped archive name
            stamp = datetime.now().strftime('%Y%m%d-%H%M%S')
            archive_name = f"finnish-tts-{stamp}.tar.gz"
            archive_path = export_dir / archive_name

            # The whole checkpoint folder always goes in; the log only if it exists
            log_file = project_dir / 'train.log'
            members = [(ckpt_dir, 'checkpoints')]
            if log_file.exists():
                members.append((log_file, 'train.log'))

            print(f"\nüì• Creating archive: {archive_name}")
            with tarfile.open(archive_path, 'w:gz') as tar:
                for source, arcname in members:
                    tar.add(source, arcname=arcname)

            size_gb = archive_path.stat().st_size / 1024**3
            print(f"‚úÖ Archive created: {archive_path}")
            print(f"   Size: {size_gb:.2f} GB")
            print(f"\nüì• Download command:")
            print(f"   scp user@instance:{archive_path} .")
else:
    print(f"\n‚ö†Ô∏è  Project not found: {project_dir}")
    print("Please run training first.")

print("\n" + "=" * 70)

## Step 9: Summary

Training workflow complete!

In [None]:
# Step 9: print a recap of the run and the follow-up commands.

# The export step only assigns export_dir when an archive was actually
# created; define it here too (same location) so this cell never fails with
# a NameError.
export_dir = WORK_DIR / 'exports'

# Checkpoint name for the final step, derived from the configured step count
# (9-digit zero padding, e.g. step_000002000.ckpt for 2000 steps).
final_ckpt_name = f"step_{TRAINING_CONFIG['max_steps']:09d}.ckpt"

print("="*70)
print("‚úÖ TRAINING WORKFLOW COMPLETE")
print("="*70)
print(f"\nTimestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"GPU: {gpu_name}")
print(f"Training steps: {TRAINING_CONFIG['max_steps']}")
print(f"LoRA config: r={TRAINING_CONFIG['lora_r']}, alpha={TRAINING_CONFIG['lora_alpha']}")

print("\nüìã Next Steps:")
print("\n1. Download your model (run on your local machine):")
# Use the full remote export path; quoting lets the remote side expand the glob.
print(f"   scp 'user@instance:{export_dir}/*.tar.gz' .")

print("\n2. Extract on your machine:")
print("   tar -xzf finnish-tts-*.tar.gz")

print("\n3. Test inference:")
print("   python fish_speech/tools/llama/generate.py \\")
print(f"     --checkpoint checkpoints/{final_ckpt_name} \\")
print("     --text 'Hyvää huomenta!' \\")
print("     --output output.wav")

print("\nüéØ Deployment:")
print("   - Merge LoRA weights into base model")
print("   - Deploy to production inference server")
print("   - Integrate with WebUI or API")

print("\nüìö Documentation:")
print("   - Deployment: https://github.com/akusingh/nvidia-brev-launchables")
print("   - Fish Speech: https://github.com/fishaudio/fish-speech")

print("\n" + "="*70)
print("Happy Training! üá´üáÆüöÄ")
print("="*70)