# XTTS-v2 Sinhala Fine-tuning on Kaggle

This notebook runs the complete XTTS-v2 fine-tuning pipeline for the Sinhala language:
1. Setup environment
2. Clone repository
3. Download dataset
4. Download XTTS-v2 model
5. Prepare dataset
6. Extend vocabulary for Sinhala
7. (Optional) DVAE fine-tuning
8. GPT fine-tuning

**Repository**: https://github.com/amalshafernando/XTTSv2-sinhala  
**Dataset**: https://www.kaggle.com/datasets/amalshaf/sinhala-tts-dataset

In [None]:
# Cell 1: Install PyTorch
!pip install torch==2.1.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu118

# Verify
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"PyTorch version: {torch.__version__}")

In [None]:
# Cell 2: Set environment variables and verify GPU
import os
import sys

# CRITICAL: these variables must be in place BEFORE any TTS imports
os.environ.update({
    'TRANSFORMERS_NO_TORCHAO_IMPORT': '1',
    'TORCH_ALLOW_UNSAFE_DESERIALIZATION': '1',
    'PYTORCH_CUDA_ALLOC_CONF': 'max_split_size_mb:512',
    'CUDA_VISIBLE_DEVICES': '0',
})

print("‚úÖ Environment variables set")
print(f"Working directory: {os.getcwd()}")
print(f"Python version: {sys.version}")

# Report whether a GPU is visible and, if so, its name and total memory
import torch

cuda_ready = torch.cuda.is_available()
print(f"\nCUDA available: {cuda_ready}")
if cuda_ready:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    total_gpu_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU Memory: {total_gpu_bytes / 1e9:.1f} GB")

In [None]:
# Cell 3: Install TTS and dependencies
!pip install -q TTS==0.22.0
!pip install -q transformers==4.36.0 tokenizers==0.15.0
!pip install -q librosa==0.10.2 soundfile==0.12.1 scipy==1.11.2 pysbd==0.3.4
!pip install -q pandas==1.5.3 scikit-learn==1.3.2 tqdm==4.66.3
!pip install -q einops==0.7.0 unidecode==1.3.8 inflect==7.0.0
!pip install -q coqpit==0.0.16 trainer==0.0.36 mutagen
!pip install -q pypinyin hangul_romanize num2words kagglehub

print("‚úÖ All dependencies installed successfully!")

In [None]:
# Cell 4: Verify critical packages
import trainer
import TTS
import transformers
import librosa

# Print the version of every package the training scripts rely on;
# each entry pairs the label shown to the user with the imported module.
version_report = (
    ("trainer version", trainer),
    ("TTS installed", TTS),
    ("transformers version", transformers),
    ("librosa version", librosa),
)
for label, module in version_report:
    print(f"{label}: {module.__version__}")
print("‚úÖ All packages verified!")

In [None]:
# Cell 5: Clone repository
import os

repo_url = "https://github.com/amalshafernando/XTTSv2-sinhala.git"
repo_name = "XTTSv2-sinhala"

# Clone only if it doesn't exist
if not os.path.exists(repo_name):
    print(f"üîπ Cloning {repo_name}...")
    !git clone {repo_url}
    print("‚úÖ Repository cloned")
else:
    print(f"‚úÖ Repository already exists: {repo_name}")

# Change to repo directory
os.chdir(repo_name)
print(f"‚úÖ Current directory: {os.getcwd()}")

# List contents
print("\nüîπ Repository contents:")
!ls -la

In [None]:
# Cell 6: Download dataset
import pandas as pd
import shutil
import kagglehub
import os

# Download dataset
path = kagglehub.dataset_download("amalshaf/sinhala-tts-dataset")
print(f"Dataset downloaded to: {path}")

# Setup paths
kaggle_dataset_path = f"{path}/sinhala-tts-dataset"
print(f"Kaggle dataset path: {kaggle_dataset_path}")
target_dataset_path = "/kaggle/working/datasets/"
print(f"Target dataset path: {target_dataset_path}")

# Create target directory
os.makedirs(f"{target_dataset_path}/wavs", exist_ok=True)

# Copy audio files if they exist
possible_audio_dirs = [
    f"{kaggle_dataset_path}/wavs",
    f"{kaggle_dataset_path}/wav",
    f"{kaggle_dataset_path}/audio",
    kaggle_dataset_path
]

audio_copied = False
for audio_dir in possible_audio_dirs:
    if os.path.exists(audio_dir):
        # Check if it contains audio files
        audio_files = [f for f in os.listdir(audio_dir) if f.endswith(('.wav', '.mp3', '.flac'))]
        if audio_files:
            print(f"‚úÖ Found audio directory: {audio_dir} ({len(audio_files)} files)")
            if os.path.isdir(audio_dir):
                shutil.copytree(audio_dir, f"{target_dataset_path}/wavs", dirs_exist_ok=True)
            else:
                shutil.copy2(audio_dir, f"{target_dataset_path}/wavs/")
            audio_copied = True
            break

if not audio_copied:
    print("‚ö†Ô∏è Warning: Could not find audio files. Will proceed with metadata only.")

# Convert CSV to XTTS format using prepare_dataset_sinhala.py
print("\nüîπ Converting dataset to XTTS format...")
!python prepare_dataset_sinhala.py --kaggle_path "{kaggle_dataset_path}" --output_path "{target_dataset_path}"

# Verify output
train_meta = f"{target_dataset_path}/metadata_train.csv"
eval_meta = f"{target_dataset_path}/metadata_eval.csv"

if os.path.exists(train_meta) and os.path.exists(eval_meta):
    df_train = pd.read_csv(train_meta, sep='|', header=None, names=['audio_file', 'text', 'speaker_name'])
    df_eval = pd.read_csv(eval_meta, sep='|', header=None, names=['audio_file', 'text', 'speaker_name'])
    print(f"\n‚úÖ Training samples: {len(df_train)}")
    print(f"‚úÖ Validation samples: {len(df_eval)}")
else:
    print("‚ùå Error: Metadata files not created!")

In [None]:
# Cell 7: Download XTTS-v2 model files
import os
import requests
from tqdm import tqdm

# Directory that receives the original XTTS-v2 model files
output_dir = "/kaggle/working/checkpoints/XTTS_v2.0_original_model_files"
os.makedirs(output_dir, exist_ok=True)

print("\n".join(("=" * 80, "DOWNLOADING XTTS-v2 MODEL FILES", "=" * 80)))

# Use Coqui gateway for downloads
base_url = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/"

# Map each local file name to its download URL (insertion order = download order)
files_to_download = {
    model_file: f"{base_url}{model_file}"
    for model_file in ("config.json", "vocab.json", "model.pth", "dvae.pth", "mel_stats.pth")
}

def download_file(url, output_path, timeout=60):
    '''Download `url` to `output_path`, showing a progress bar.

    The data is streamed into a temporary "<output_path>.part" file that is
    renamed to `output_path` only after the download finished, so an
    interrupted or failed download never leaves a truncated file that a later
    "already exists" check would mistake for a complete one.

    Args:
        url: Address of the file to fetch.
        output_path: Destination path of the finished file.
        timeout: Seconds to wait for the server (connect / between reads).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
        OSError: If the file cannot be written or renamed.
    '''
    partial_path = f"{output_path}.part"
    try:
        # The context manager releases the connection even when an error occurs
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Fail loudly instead of saving an HTML error page as a model file
            response.raise_for_status()
            total_size = int(response.headers.get('content-length', 0))

            # total=None lets tqdm show a plain counter when the size is unknown
            with open(partial_path, 'wb') as f, tqdm(
                total=total_size or None,
                unit='B',
                unit_scale=True,
                desc=os.path.basename(output_path),
            ) as pbar:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
                        pbar.update(len(chunk))
        # Publish the file only once it is complete
        os.replace(partial_path, output_path)
    except BaseException:
        # Remove the incomplete file (also on Ctrl-C), then re-raise
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise

# Fetch every model file that is not on disk yet; a failed download is
# reported and the loop carries on with the next file.
for filename, url in files_to_download.items():
    output_path = os.path.join(output_dir, filename)

    if os.path.exists(output_path):
        print(f"‚úÖ {filename} already exists, skipping...")
        continue

    print(f"\nüîπ Downloading {filename}...")
    try:
        download_file(url, output_path)
    except Exception as e:
        print(f"‚ùå Failed to download {filename}: {e}")
    else:
        print(f"‚úÖ {filename} downloaded successfully")

# Verification pass: list each expected file with its size, or flag it as missing
separator = "=" * 80
print(f"\n{separator}")
print("VERIFICATION")
print(separator)

all_downloaded = True
for filename in files_to_download:
    filepath = os.path.join(output_dir, filename)
    if not os.path.exists(filepath):
        print(f"‚ùå {filename}: MISSING!")
        all_downloaded = False
        continue
    size_mb = os.path.getsize(filepath) / (1024 * 1024)
    print(f"‚úÖ {filename}: {size_mb:.1f} MB")

if all_downloaded:
    print(f"\n{separator}")
    print("‚úÖ ALL XTTS-v2 MODEL FILES DOWNLOADED SUCCESSFULLY!")
    print(separator)

In [None]:
# Cell 8: Extend vocabulary for Sinhala (CORRECTED - uses extend_vocab_sinhala.py)
import os
import json
import subprocess
import sys

print("=" * 80)
print("EXTENDING VOCABULARY FOR SINHALA")
print("=" * 80)

# Correct paths
vocab_script = "extend_vocab_sinhala.py"
train_metadata_path = "/kaggle/working/datasets/metadata_train.csv"
output_path = "/kaggle/working/checkpoints/XTTS_v2.0_original_model_files"
language_code = "si"
vocab_size = 15000

# Verify script exists
if not os.path.exists(vocab_script):
    print(f"‚ùå Error: {vocab_script} not found!")
    print("Current directory:", os.getcwd())
    print("Files in current directory:")
    !ls -la
else:
    print(f"‚úÖ Found {vocab_script}")

# Verify metadata exists
if not os.path.exists(train_metadata_path):
    print(f"‚ùå Error: Training metadata not found at {train_metadata_path}")
else:
    print(f"‚úÖ Found training metadata: {train_metadata_path}")

# Run vocabulary extension
print(f"\nüîπ Running vocabulary extension...")
print(f"   Script: {vocab_script}")
print(f"   Metadata: {train_metadata_path}")
print(f"   Output: {output_path}")
print(f"   Language: {language_code}")
print(f"   Vocab size: {vocab_size}")

cmd = [
    sys.executable,
    vocab_script,
    "--metadata_path", train_metadata_path,
    "--output_path", output_path,  # CORRECT: Full path to XTTS_v2.0_original_model_files
    "--language", language_code,
    "--vocab_size", str(vocab_size)
]

print(f"\n[Running] Command: {' '.join(cmd)}")

try:
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=1800)
    print(result.stdout)
    
    if result.returncode != 0:
        print(f"‚ùå Error:")
        print(result.stderr)
        raise RuntimeError("Vocabulary extension failed")
    else:
        print(f"‚úÖ Vocabulary extension SUCCESSFUL")
        
except subprocess.TimeoutExpired:
    print(f"‚ùå Timeout - vocabulary extension took too long")
    raise
except Exception as e:
    print(f"‚ùå Exception: {e}")
    raise

# Verify vocab.json was created/updated
vocab_path = os.path.join(output_path, "vocab.json")
if os.path.exists(vocab_path):
    with open(vocab_path, 'r', encoding='utf-8') as f:
        vocab = json.load(f)
    print(f"\n‚úÖ Extended vocabulary size: {len(vocab)} tokens")
    
    # Check for Sinhala characters in vocab
    sinhala_tokens = [token for token in vocab.keys() if any('\u0D80' <= char <= '\u0DFF' for char in token)]
    print(f"‚úÖ Sinhala-specific tokens: {len(sinhala_tokens)}")
else:
    print(f"‚ùå Vocabulary file not found at {vocab_path}!")

# Verify config.json was updated
config_path = os.path.join(output_path, "config.json")
if os.path.exists(config_path):
    with open(config_path, 'r', encoding='utf-8') as f:
        config = json.load(f)
    
    if 'languages' in config and language_code in config['languages']:
        print(f"‚úÖ Language '{language_code}' added to config.json")
    else:
        print(f"‚ö†Ô∏è Warning: Language '{language_code}' not found in config.json")

print(f"\n{'=' * 80}")
print("‚úÖ VOCABULARY EXTENSION COMPLETE")
print(f"{'=' * 80}")

In [None]:
# Cell 9: (OPTIONAL) DVAE Fine-tuning
import os
import subprocess
import sys

rule = "=" * 80
print(rule)
print("DVAE FINE-TUNING (OPTIONAL)")
print(rule)
print("‚ö†Ô∏è Only run this if you have 20+ hours of training data")
print("‚ö†Ô∏è This step is optional and usually not needed")
print(rule)

# Switch: True runs the DVAE training script, False skips this stage
ENABLE_DVAE_TRAINING = False

if not ENABLE_DVAE_TRAINING:
    print("\n‚ÑπÔ∏è DVAE fine-tuning skipped (recommended for most cases)")
    print("   Set ENABLE_DVAE_TRAINING = True to enable")
else:
    print("\nüîπ Starting DVAE fine-tuning...")

    dvae_script = "train_dvae_xtts.py"
    output_path = "/kaggle/working/checkpoints"
    train_csv_path = "/kaggle/working/datasets/metadata_train.csv"
    eval_csv_path = "/kaggle/working/datasets/metadata_eval.csv"
    language = "si"
    num_epochs = 3
    batch_size = 256
    lr = 5e-6

    # The training script is expected in the current working directory
    if os.path.exists(dvae_script):
        # Build "<flag> <value>" pairs in the order the script is invoked with
        cmd = [sys.executable, dvae_script]
        for flag, value in (
            ("--output_path", output_path),
            ("--train_csv_path", train_csv_path),
            ("--eval_csv_path", eval_csv_path),
            ("--language", language),
            ("--num_epochs", num_epochs),
            ("--batch_size", batch_size),
            ("--lr", lr),
        ):
            cmd += [flag, str(value)]

        print(f"Command: {' '.join(cmd)}")
        print("\n‚ö†Ô∏è This may take several hours...")

        # Output of the child process streams straight into the cell output
        try:
            result = subprocess.run(cmd, text=True)
        except Exception as e:
            print(f"‚ùå Error during DVAE training: {e}")
        else:
            if result.returncode == 0:
                print("\n‚úÖ DVAE fine-tuning completed successfully!")
            else:
                print(f"\n‚ö†Ô∏è DVAE fine-tuning had errors (return code: {result.returncode})")
    else:
        print(f"‚ùå Error: {dvae_script} not found!")

print(f"\n{rule}")

In [None]:
# Cell 10: GPT Fine-tuning
import os
import subprocess
import sys

print("=" * 80)
print("STARTING GPT FINE-TUNING")
print("=" * 80)

# Configuration
gpt_script = "train_gpt_xtts.py"
output_path = "/kaggle/working/checkpoints"
train_metadata = "/kaggle/working/datasets/metadata_train.csv"
eval_metadata = "/kaggle/working/datasets/metadata_eval.csv"
language = "si"

# Training parameters
num_epochs = 5
batch_size = 8
grad_acumm = 4
max_text_length = 400
max_audio_length = 330750
weight_decay = 1e-2
lr = 5e-6
save_step = 50000

# Verify script exists
if not os.path.exists(gpt_script):
    print(f"‚ùå Error: {gpt_script} not found!")
    print("Current directory:", os.getcwd())
    !ls -la
else:
    print(f"‚úÖ Found {gpt_script}")

# Verify metadata files exist
if not os.path.exists(train_metadata):
    print(f"‚ùå Error: Training metadata not found: {train_metadata}")
elif not os.path.exists(eval_metadata):
    print(f"‚ùå Error: Evaluation metadata not found: {eval_metadata}")
else:
    print(f"‚úÖ Training metadata: {train_metadata}")
    print(f"‚úÖ Evaluation metadata: {eval_metadata}")

# Construct metadata string
metadata_string = f"{train_metadata},{eval_metadata},{language}"

print(f"\nüìã Training Configuration:")
print(f"   - Epochs: {num_epochs}")
print(f"   - Batch size: {batch_size}")
print(f"   - Gradient accumulation: {grad_acumm}")
print(f"   - Effective batch size: {batch_size * grad_acumm}")
print(f"   - Learning rate: {lr}")
print(f"   - Max text length: {max_text_length}")
print(f"   - Max audio length: {max_audio_length}")
print(f"   - Save step: {save_step}")

# Run GPT training
cmd = [
    sys.executable,
    gpt_script,
    "--output_path", output_path,
    "--metadatas", metadata_string,
    "--num_epochs", str(num_epochs),
    "--batch_size", str(batch_size),
    "--grad_acumm", str(grad_acumm),
    "--max_text_length", str(max_text_length),
    "--max_audio_length", str(max_audio_length),
    "--weight_decay", str(weight_decay),
    "--lr", str(lr),
    "--save_step", str(save_step)
]

print(f"\n[Running] Command: {' '.join(cmd)}")
print(f"\n‚ö†Ô∏è This may take several hours (4-8 hours depending on dataset size)...")
print(f"‚ö†Ô∏è Make sure Kaggle notebook is set to GPU and has enough time...")

try:
    result = subprocess.run(cmd, text=True)
    
    if result.returncode == 0:
        print(f"\n{'=' * 80}")
        print("‚úÖ GPT TRAINING COMPLETED SUCCESSFULLY!")
        print(f"{'=' * 80}")
        
        # Check for checkpoint
        checkpoint_dir = os.path.join(output_path, "run", "training")
        if os.path.exists(checkpoint_dir):
            checkpoints = [f for f in os.listdir(checkpoint_dir) if f.endswith('.pth')]
            if checkpoints:
                print(f"\n‚úÖ Found {len(checkpoints)} checkpoint(s):")
                for cp in checkpoints:
                    cp_path = os.path.join(checkpoint_dir, cp)
                    size_mb = os.path.getsize(cp_path) / (1024 * 1024)
                    print(f"   - {cp} ({size_mb:.1f} MB)")
    else:
        print(f"\n‚ùå Training failed with return code: {result.returncode}")
except KeyboardInterrupt:
    print(f"\n‚ö†Ô∏è Training interrupted by user")
except Exception as e:
    print(f"\n‚ùå Error during training: {e}")
    import traceback
    traceback.print_exc()

print(f"\n{'=' * 80}")

In [None]:
# Cell 11: Summary and Next Steps
import os

def list_checkpoint_files(root_dir):
    """Return the sorted paths of all '.pth' files at or below `root_dir`.

    The search is recursive so that checkpoints stored in per-run sub-folders
    are found as well. A missing `root_dir` yields an empty list.
    """
    return sorted(
        os.path.join(folder, file_name)
        for folder, _subdirs, file_names in os.walk(root_dir)
        for file_name in file_names
        if file_name.endswith('.pth')
    )


print("\n" + "#" * 80)
print("#" + " " * 78 + "#")
print("#" + " " * 15 + "‚úÖ SINHALA XTTS-v2 FINE-TUNING COMPLETE!" + " " * 26 + "#")
print("#" + " " * 78 + "#")
print("#" * 80)

print("\n‚úÖ COMPLETED PHASES:")
phases = [
    "Environment Setup",
    "Clone Repository",
    "Download Dataset",
    "Download XTTS-v2 Model",
    "Prepare Dataset",
    "Extend Vocabulary (15,000 Sinhala tokens)",
    "GPT Fine-tuning"
]

for i, phase in enumerate(phases, 1):
    print(f"   {i}. ‚úÖ {phase}")

print("\nüìä MODEL SPECIFICATIONS:")
print(f"   Language: Sinhala (‡∑É‡∑í‡∂Ç‡∑Ñ‡∂Ω)")
print(f"   Language Code: si")
print(f"   Tokenization: ByteLevel BPE")
print(f"   Vocabulary: 15,000 tokens")

print("\nüìÅ OUTPUT FILES:")
checkpoint_dir = "/kaggle/working/checkpoints"
model_dir = os.path.join(checkpoint_dir, "XTTS_v2.0_original_model_files")
training_dir = os.path.join(checkpoint_dir, "run", "training")

print(f"   Model Files: {model_dir}")
print(f"   - vocab.json: Extended vocabulary")
print(f"   - config.json: Updated with Sinhala language")
print(f"   Training Checkpoints: {training_dir}")

# Report what is actually on disk, so a run without checkpoints is visible
# here instead of being silently passed over.
checkpoints = list_checkpoint_files(training_dir)
if checkpoints:
    print(f"   - Found {len(checkpoints)} checkpoint(s)")
else:
    print("   - No .pth checkpoints found under this directory")

print("\nüéâ NEXT STEPS:")
print(f"   1. Download checkpoints from Kaggle")
print(f"   2. Use trained model for Sinhala text-to-speech inference")
print(f"   3. Test with Sinhala test texts")

print("\n" + "#" * 80)
print("#" * 80)