# XTTS-v2 Sinhala Fine-tuning on Kaggle

This notebook fine-tunes XTTS-v2 for the Sinhala language, running the complete pipeline end to end.

**Steps:**
1. Environment setup (PyTorch, TTS, dependencies)
2. Clone repository
3. Download dataset
4. Prepare dataset
5. Download XTTS-v2 base model
6. Extend vocabulary for Sinhala
7. Fine-tune GPT model
8. Test inference


In [None]:
# ============================================================================
# CELL 1: Install PyTorch with CUDA support
# ============================================================================

!pip install torch==2.1.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu118

# Verify
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"PyTorch version: {torch.__version__}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")


In [None]:
# ============================================================================
# CELL 2: Set environment variables and verify setup
# ============================================================================

import os
import sys

# ‚ö†Ô∏è CRITICAL: Set these BEFORE any TTS imports
# All overrides are kept in one table and applied in a single step.
env_overrides = {
    'TRANSFORMERS_NO_TORCHAO_IMPORT': '1',
    'TORCH_ALLOW_UNSAFE_DESERIALIZATION': '1',
    'PYTORCH_CUDA_ALLOC_CONF': 'max_split_size_mb:512',
    'CUDA_VISIBLE_DEVICES': '0',
}
os.environ.update(env_overrides)

print("‚úÖ Environment variables set")
# Only the first three settings are echoed; CUDA_VISIBLE_DEVICES is set but not printed.
for var_name in (
    'TRANSFORMERS_NO_TORCHAO_IMPORT',
    'TORCH_ALLOW_UNSAFE_DESERIALIZATION',
    'PYTORCH_CUDA_ALLOC_CONF',
):
    print(f"{var_name} = {os.environ[var_name]}")

# Interpreter and working-directory details
print(f"\nPython version: {sys.version}")
print(f"Working directory: {os.getcwd()}")

# GPU details
import torch

gpu_visible = torch.cuda.is_available()
print(f"\nCUDA available: {gpu_visible}")
if gpu_visible:
    device_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {device_props.total_memory / 1e9:.1f} GB")


In [None]:
# ============================================================================
# CELL 3: Install TTS and all dependencies
# ============================================================================

# Install TTS and related packages
# TTS itself goes in first, pinned to 0.22.0; every later line in this cell
# runs after it, so the order of these installs is significant.
!pip install -q TTS==0.22.0

# CRITICAL FIX: Use transformers 4.36.0 instead of 4.45.2
# NOTE(review): because this runs after the TTS install, it is presumably meant
# to replace whichever transformers/tokenizers versions that install selected.
!pip install -q transformers==4.36.0 tokenizers==0.15.0

# Remaining dependencies. Most are pinned to exact versions; mutagen, pypinyin,
# hangul_romanize, num2words, kagglehub and requests are NOT pinned, so their
# versions can differ between runs.
!pip install -q librosa==0.10.2 soundfile==0.12.1 scipy==1.11.2 pysbd==0.3.4
!pip install -q pandas==1.5.3 scikit-learn==1.3.2 tqdm==4.66.3
!pip install -q einops==0.7.0 unidecode==1.3.8 inflect==7.0.0
!pip install -q coqpit==0.0.16 trainer==0.0.36 mutagen
!pip install -q pypinyin hangul_romanize num2words kagglehub
!pip install -q requests

# This message is printed unconditionally: it does not check whether any of
# the pip commands above actually succeeded.
print("‚úÖ All dependencies installed successfully!")


In [None]:
# ============================================================================
# CELL 4: Verify critical packages
# ============================================================================

import trainer
import TTS
import transformers
import librosa
import tokenizers

# One (label, module) pair per critical package; each line of the report
# is "<label>: <module.__version__>", in this order.
version_report = (
    ("trainer version", trainer),
    ("TTS installed", TTS),
    ("transformers version", transformers),
    ("tokenizers version", tokenizers),
    ("librosa version", librosa),
)
for label, module in version_report:
    print(f"{label}: {module.__version__}")
print("‚úÖ All packages verified!")


In [None]:
# ============================================================================
# CELL 5: Clone repository
# ============================================================================

import os

repo_url = "https://github.com/amalshafernando/XTTSv2-sinhala.git"
repo_name = "XTTSv2-sinhala"

# Clone only if it doesn't exist
if not os.path.exists(repo_name):
    print(f"üîπ Cloning {repo_name}...")
    !git clone {repo_url}
    print("‚úÖ Repository cloned")
else:
    print(f"‚úÖ Repository already exists: {repo_name}")

# Change to repo directory
os.chdir(repo_name)
print(f"‚úÖ Current directory: {os.getcwd()}")

# List contents
print("\nüîπ Repository contents:")
!ls -la | head -20


In [None]:
# ============================================================================
# CELL 6: Download Sinhala TTS dataset
# ============================================================================

import kagglehub
import os

# Download dataset
path = kagglehub.dataset_download("amalshaf/sinhala-tts-dataset")
print(f"‚úÖ Dataset downloaded to: {path}")

# Setup paths
kaggle_dataset_path = f"{path}/sinhala-tts-dataset"
print(f"üìÅ Kaggle dataset path: {kaggle_dataset_path}")

# Verify dataset structure
if os.path.exists(kaggle_dataset_path):
    print(f"\nüìÇ Dataset contents:")
    !ls -lh {kaggle_dataset_path}
    
    # Check for metadata files
    metadata_train = f"{kaggle_dataset_path}/metadata_train.csv"
    metadata_eval = f"{kaggle_dataset_path}/metadata_eval.csv"
    
    if os.path.exists(metadata_train):
        print(f"\n‚úÖ Found: metadata_train.csv")
    if os.path.exists(metadata_eval):
        print(f"‚úÖ Found: metadata_eval.csv")
    
    # Check for audio directory
    audio_dirs = ["wav", "wavs", "audio", "audio_files"]
    for audio_dir in audio_dirs:
        audio_path = os.path.join(kaggle_dataset_path, audio_dir)
        if os.path.exists(audio_path):
            print(f"‚úÖ Found audio directory: {audio_dir}")
            break
else:
    print(f"‚ùå Dataset path not found: {kaggle_dataset_path}")


In [None]:
# ============================================================================
# CELL 7: Prepare dataset using prepare_dataset_sinhala.py
# ============================================================================

import os
import sys

print("=" * 80)
print("PREPARING DATASET FOR XTTS-v2")
print("=" * 80)

# Get dataset path from previous cell
kaggle_dataset_path = "/kaggle/input/sinhala-tts-dataset/sinhala-tts-dataset"
output_dataset_path = "/kaggle/working/datasets"

# Create output directory
os.makedirs(output_dataset_path, exist_ok=True)

# Run dataset preparation script
!python prepare_dataset_sinhala.py \
    --kaggle_path {kaggle_dataset_path} \
    --output_path {output_dataset_path}

print("\n" + "=" * 80)
print("‚úÖ DATASET PREPARATION COMPLETED")
print("=" * 80)

# Verify output files
train_metadata = f"{output_dataset_path}/metadata_train.csv"
eval_metadata = f"{output_dataset_path}/metadata_eval.csv"

if os.path.exists(train_metadata):
    import pandas as pd
    df_train = pd.read_csv(train_metadata, sep='|', header=None)
    print(f"\n‚úÖ Training samples: {len(df_train)}")
    
if os.path.exists(eval_metadata):
    df_eval = pd.read_csv(eval_metadata, sep='|', header=None)
    print(f"‚úÖ Evaluation samples: {len(df_eval)}")


In [None]:
# ============================================================================
# CELL 8: Download XTTS-v2 base model files
# ============================================================================

import os
import requests
from tqdm import tqdm

# Destination folder for the base-model files (created if absent)
output_dir = "/kaggle/working/checkpoints/XTTS_v2.0_original_model_files"
os.makedirs(output_dir, exist_ok=True)

banner = "=" * 80
print(banner)
print("DOWNLOADING XTTS-v2 MODEL FILES")
print(banner)

# Every required file lives directly under this Hugging Face URL prefix,
# so the download table maps each file name to prefix + name.
base_url = "https://huggingface.co/coqui/XTTS-v2/resolve/main/"

required_files = ("config.json", "vocab.json", "model.pth", "dvae.pth", "mel_stats.pth")
files_to_download = {name: f"{base_url}{name}" for name in required_files}

def download_file(url, output_path, chunk_size=8192, timeout=60):
    '''Download file with progress bar.

    The body is streamed into "<output_path>.part" and only renamed to
    ``output_path`` once the whole response has been written, so an
    interrupted or failed download never leaves a truncated file at the
    final location.

    Args:
        url: Address to fetch.
        output_path: Final destination of the downloaded file.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: Seconds to wait for the connection / for data before
            giving up (passed straight to ``requests.get``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
        OSError: If the file cannot be written or renamed.
    '''
    partial_path = f"{output_path}.part"
    try:
        # Using the response as a context manager releases the connection
        # even when an error interrupts the transfer.
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Without this, an HTML error page would be saved as the model file.
            response.raise_for_status()
            total_size = int(response.headers.get('content-length', 0))

            # total=None lets tqdm show a plain byte counter when the server
            # does not announce a length.
            with open(partial_path, 'wb') as f, tqdm(
                total=total_size or None,
                unit='B',
                unit_scale=True,
                desc=os.path.basename(output_path),
            ) as pbar:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
                        pbar.update(len(chunk))

        # Publish the finished file in one step.
        os.replace(partial_path, output_path)
    finally:
        # After a successful rename the partial file is gone; anything still
        # here is the residue of a failed download.
        if os.path.exists(partial_path):
            os.remove(partial_path)

# Download each file (files already on disk are kept and skipped)
for filename, url in files_to_download.items():
    output_path = os.path.join(output_dir, filename)

    if os.path.exists(output_path):
        size_mb = os.path.getsize(output_path) / (1024 * 1024)
        print(f"‚úÖ {filename} already exists ({size_mb:.1f} MB), skipping...")
    else:
        print(f"\nüîπ Downloading {filename}...")
        try:
            download_file(url, output_path)
            print(f"‚úÖ {filename} downloaded successfully")
        except Exception as e:
            print(f"‚ùå Failed to download {filename}: {e}")
            # Drop whatever was written before the failure. Otherwise the
            # truncated file would be reported as present below and skipped
            # as "already exists" on the next run.
            if os.path.exists(output_path):
                os.remove(output_path)

# Verify all files downloaded
print(f"\n{'=' * 80}")
print("VERIFICATION")
print(f"{'=' * 80}")

all_downloaded = True
for filename in files_to_download.keys():
    filepath = os.path.join(output_dir, filename)
    if os.path.exists(filepath):
        size_mb = os.path.getsize(filepath) / (1024 * 1024)
        print(f"‚úÖ {filename}: {size_mb:.1f} MB")
    else:
        print(f"‚ùå {filename}: MISSING!")
        all_downloaded = False

if all_downloaded:
    print(f"\n{'=' * 80}")
    print("‚úÖ ALL XTTS-v2 MODEL FILES DOWNLOADED SUCCESSFULLY!")
    print(f"{'=' * 80}")


In [None]:
# ============================================================================
# CELL 9: Extend vocabulary for Sinhala language
# ============================================================================

import os
import json

print("=" * 80)
print("EXTENDING VOCABULARY FOR SINHALA")
print("=" * 80)

# Paths
metadata_path = "/kaggle/working/datasets/metadata_train.csv"
output_path = "/kaggle/working/checkpoints/XTTS_v2.0_original_model_files"

# Verify paths exist
if not os.path.exists(metadata_path):
    print(f"‚ùå Error: Metadata file not found: {metadata_path}")
else:
    print(f"‚úÖ Metadata file found: {metadata_path}")

if not os.path.exists(output_path):
    print(f"‚ùå Error: Output path not found: {output_path}")
else:
    print(f"‚úÖ Output path exists: {output_path}")

# Run extend_vocab_sinhala.py
print(f"\nüîπ Running extend_vocab_sinhala.py...")
!python extend_vocab_sinhala.py \
    --metadata_path {metadata_path} \
    --output_path {output_path} \
    --language si \
    --vocab_size 15000

print("\n‚úÖ Vocabulary extension completed!")

# Verify the extended vocab
vocab_path = os.path.join(output_path, "vocab.json")
if os.path.exists(vocab_path):
    with open(vocab_path, 'r', encoding='utf-8') as f:
        vocab = json.load(f)
    print(f"\n‚úÖ Extended vocabulary size: {len(vocab):,} tokens")
    
    # Check for Sinhala characters in vocab
    sinhala_tokens = [token for token in vocab.keys() if any('\u0D80' <= char <= '\u0DFF' for char in token)]
    print(f"‚úÖ Sinhala-specific tokens: {len(sinhala_tokens)}")
    
    # Verify config.json was updated
    config_path = os.path.join(output_path, "config.json")
    if os.path.exists(config_path):
        with open(config_path, 'r', encoding='utf-8') as f:
            config = json.load(f)
        if 'language_ids' in config and 'si' in config['language_ids']:
            print(f"‚úÖ Sinhala language (si) added to config.json")
            print(f"   Language ID: {config['language_ids']['si']}")
else:
    print(f"\n‚ùå Vocabulary file not found at: {vocab_path}")
    
print("\n" + "=" * 80)


In [None]:
# ============================================================================
# CELL 10: Run complete training pipeline using kaggle_train_sinhala.py
# ============================================================================

import os
import sys

print("=" * 80)
print("STARTING COMPLETE TRAINING PIPELINE")
print("=" * 80)

# Verify we're in the repo directory
if not os.path.exists("kaggle_train_sinhala.py"):
    print("‚ùå Error: kaggle_train_sinhala.py not found in current directory")
    print(f"Current directory: {os.getcwd()}")
    print("\nTrying to find it...")
    !find . -name "kaggle_train_sinhala.py" -type f
else:
    print("‚úÖ Found kaggle_train_sinhala.py")
    
    # Run the complete training pipeline
    print("\nüöÄ Starting training pipeline...")
    print("This will run all phases:")
    print("  1. Setup verification")
    print("  2. Dataset preparation")
    print("  3. Model download")
    print("  4. Vocabulary extension")
    print("  5. GPT fine-tuning")
    print("\n‚ö†Ô∏è This may take several hours...")
    
    !python kaggle_train_sinhala.py
    
    print("\n" + "=" * 80)
    print("‚úÖ TRAINING PIPELINE COMPLETED!")
    print("=" * 80)


In [None]:
# ============================================================================
# CELL 11: Alternative - Run GPT training directly (if pipeline fails)
# ============================================================================

# Uncomment this cell only if kaggle_train_sinhala.py fails
# This runs GPT training directly with all required parameters

import os

# Banner announcing that this cell is an optional fallback
divider = "=" * 80
for banner_line in (
    divider,
    "ALTERNATIVE: DIRECT GPT TRAINING",
    divider,
    "‚ö†Ô∏è Only use this if the pipeline in Cell 10 failed",
    divider,
):
    print(banner_line)

# The training command is parked inside a bare string literal, so nothing in
# it is executed; remove the surrounding triple quotes to run it.
# Uncomment below to run directly:
"""
!CUDA_VISIBLE_DEVICES=0 python train_gpt_xtts.py \
    --output_path /kaggle/working/checkpoints/ \
    --metadatas /kaggle/working/datasets/metadata_train.csv,/kaggle/working/datasets/metadata_eval.csv,si \
    --num_epochs 5 \
    --batch_size 8 \
    --grad_acumm 4 \
    --max_text_length 400 \
    --max_audio_length 330750 \
    --weight_decay 1e-2 \
    --lr 5e-6 \
    --save_step 50000

print("\n‚úÖ GPT training completed!")
"""

print("‚ÑπÔ∏è This cell is commented out. Uncomment to use if needed.")


In [None]:
# ============================================================================
# CELL 12: Verify training output and find best model
# ============================================================================

import os
import glob

divider = "=" * 80
print(divider)
print("VERIFYING TRAINING OUTPUT")
print(divider)

# Root folder that is searched for training runs
checkpoint_dir = "/kaggle/working/checkpoints"

# Every entry under it whose name starts with GPT_XTTS_FT
model_dirs = glob.glob(f"{checkpoint_dir}/GPT_XTTS_FT*")

if not model_dirs:
    print(f"\n‚ö†Ô∏è No training output found in {checkpoint_dir}")
    print("   Training may still be in progress or may have failed")
else:
    print(f"\n‚úÖ Found {len(model_dirs)} training output(s):")
    for model_dir in model_dirs:
        print(f"\nüìÅ {model_dir}")

        # Report the best checkpoint (with its size) when it is present
        best_model = os.path.join(model_dir, "best_model.pth")
        if os.path.exists(best_model):
            size_mb = os.path.getsize(best_model) / (1024 * 1024)
            print(f"   ‚úÖ best_model.pth ({size_mb:.1f} MB)")

        # Report the run's config when it is present
        config_file = os.path.join(model_dir, "config.json")
        if os.path.exists(config_file):
            print(f"   ‚úÖ config.json")

        # Always print the entry count; itemise only short listings (<= 10)
        entries = os.listdir(model_dir)
        entry_count = len(entries)
        print(f"   üìÑ Total files: {entry_count}")
        if entry_count <= 10:
            for entry in entries:
                print(f"      - {entry}")

print("\n" + divider)


In [None]:
# ============================================================================
# CELL 13: Test inference with trained model
# ============================================================================

import os
import glob

print("=" * 80)
print("TESTING INFERENCE")
print("=" * 80)

# Find the best model
checkpoint_dir = "/kaggle/working/checkpoints"
model_dirs = glob.glob(f"{checkpoint_dir}/GPT_XTTS_FT*")

if not model_dirs:
    print("‚ùå No trained model found. Please complete training first.")
else:
    model_dir = model_dirs[0]  # Use the first one
    best_model = os.path.join(model_dir, "best_model.pth")
    config_file = os.path.join(model_dir, "config.json")
    vocab_file = "/kaggle/working/checkpoints/XTTS_v2.0_original_model_files/vocab.json"
    
    if not os.path.exists(best_model):
        print(f"‚ùå Best model not found: {best_model}")
    elif not os.path.exists(config_file):
        print(f"‚ùå Config file not found: {config_file}")
    elif not os.path.exists(vocab_file):
        print(f"‚ùå Vocab file not found: {vocab_file}")
    else:
        print(f"‚úÖ Found trained model: {model_dir}")
        print(f"\nüìù Test Sinhala texts:")
        test_texts = [
            "‡∂±‡∑í‡∂ª‡∂±‡∑ä‡∂≠‡∂ª‡∂∫‡∑í ‡∂â‡∂≠‡∑è ‡∑Ä‡∑ê‡∂Ø‡∂ú‡∂≠‡∑ä",  # "Always very important"
            "‡∑Å‡∑ä‚Äç‡∂ª‡∑ì ‡∂Ω‡∂Ç‡∂ö‡∑è ‡∂î‡∂∂‡∑ö ‡∂ã‡∂≠‡∑î‡∂ª‡∑î‡∂Ø‡∑ô‡∑É‡∑í‡∂±‡∑ä",  # "Sri Lanka from your north"
            "‡∑É‡∑í‡∂Ç‡∑Ñ‡∂Ω ‡∂∑‡∑è‡∑Ç‡∑è‡∑Ä ‡∂Ö‡∂¥‡∂ú‡∑ö ‡∂¢‡∑è‡∂≠‡∑í‡∂ö ‡∂∑‡∑è‡∑Ç‡∑è‡∑Ä‡∂∫‡∑í",  # "Sinhala is our national language"
        ]
        
        for i, text in enumerate(test_texts, 1):
            print(f"   {i}. {text}")
        
        print(f"\nüîπ To test inference, use:")
        print(f"   python inference_sinhala.py \\")
        print(f"     --checkpoint_path {best_model} \\")
        print(f"     --config_path {config_file} \\")
        print(f"     --vocab_path {vocab_file} \\")
        print(f"     --text \"‡∂±‡∑í‡∂ª‡∂±‡∑ä‡∂≠‡∂ª‡∂∫‡∑í ‡∂â‡∂≠‡∑è ‡∑Ä‡∑ê‡∂Ø‡∂ú‡∂≠‡∑ä\" \\")
        print(f"     --reference_audio <path_to_reference_audio.wav> \\")
        print(f"     --output_path output.wav")
        
        # Try to find a reference audio file
        reference_audio = "/kaggle/working/datasets/wavs"
        if os.path.exists(reference_audio):
            audio_files = [f for f in os.listdir(reference_audio) if f.endswith('.wav')]
            if audio_files:
                ref_audio_path = os.path.join(reference_audio, audio_files[0])
                print(f"\n‚úÖ Found reference audio: {ref_audio_path}")
                print(f"\nüîπ Running inference test...")
                
                # Run inference
                output_audio = "/kaggle/working/test_output.wav"
                !python inference_sinhala.py \
                    --checkpoint_path {best_model} \
                    --config_path {config_file} \
                    --vocab_path {vocab_file} \
                    --text "‡∂±‡∑í‡∂ª‡∂±‡∑ä‡∂≠‡∂ª‡∂∫‡∑í ‡∂â‡∂≠‡∑è ‡∑Ä‡∑ê‡∂Ø‡∂ú‡∂≠‡∑ä" \
                    --reference_audio {ref_audio_path} \
                    --output_path {output_audio}
                
                if os.path.exists(output_audio):
                    size_mb = os.path.getsize(output_audio) / (1024 * 1024)
                    print(f"\n‚úÖ Inference successful!")
                    print(f"   Output: {output_audio} ({size_mb:.2f} MB)")
                else:
                    print(f"\n‚ö†Ô∏è Inference may have failed - output file not found")

print("\n" + "=" * 80)


In [None]:
# ============================================================================
# CELL 14: Summary and next steps
# ============================================================================

# The whole summary is static text, so it is laid out as a list of lines
# and emitted in order, one print per line.
rule = "=" * 80
summary_lines = [
    rule,
    "TRAINING SUMMARY",
    rule,
    # Where the artefacts were written
    "\n‚úÖ Training pipeline completed!",
    "\nüìÅ Output locations:",
    "   - Trained model: /kaggle/working/checkpoints/GPT_XTTS_FT-*/",
    "   - Base model files: /kaggle/working/checkpoints/XTTS_v2.0_original_model_files/",
    "   - Dataset: /kaggle/working/datasets/",
    # How to use the result
    "\nüöÄ To use the trained model:",
    "   1. Download the checkpoint directory from Kaggle",
    "   2. Use inference_sinhala.py to generate Sinhala speech",
    "   3. Provide any Sinhala text and a reference audio file",
    # Copy-and-paste example
    "\nüìù Example inference command:",
    "   python inference_sinhala.py \\",
    "     --checkpoint_path checkpoints/GPT_XTTS_FT-*/best_model.pth \\",
    "     --config_path checkpoints/GPT_XTTS_FT-*/config.json \\",
    "     --vocab_path checkpoints/XTTS_v2.0_original_model_files/vocab.json \\",
    "     --text \"‡∂±‡∑í‡∂ª‡∂±‡∑ä‡∂≠‡∂ª‡∂∫‡∑í ‡∂â‡∂≠‡∑è ‡∑Ä‡∑ê‡∂Ø‡∂ú‡∂≠‡∑ä\" \\",
    "     --reference_audio reference.wav \\",
    "     --output_path output.wav",
    "\n" + rule,
    "‚úÖ ALL DONE!",
    rule,
]
for summary_line in summary_lines:
    print(summary_line)
