In [2]:
import pandas as pd
import shutil
import kagglehub
import os

# Download dataset
path = kagglehub.dataset_download("amalshaf/sinhala-tts-dataset")
print(f"Dataset downloaded to: {path}")

# Setup paths
kaggle_dataset_path = f"{path}/sinhala-tts-dataset"
print(f"kaggle dataset path: {kaggle_dataset_path}")


Dataset downloaded to: /kaggle/input/sinhala-tts-dataset
kaggle dataset path: /kaggle/input/sinhala-tts-dataset/sinhala-tts-dataset


In [3]:
# SINHALA XTTS-v2 KAGGLE NOTEBOOK - FIXED VERSION
# Step-by-step notebook with proper error handling
# Copy cells one by one into Kaggle notebook

################################################################################
# CELL 1: SETUP AND ENVIRONMENT VERIFICATION
################################################################################

print("="*100)
print("PHASE 1: KAGGLE SETUP & ENVIRONMENT VERIFICATION")
print("="*100)

import os
import sys
import subprocess
import gc
import torch
import torchaudio
import shutil
from pathlib import Path
from datetime import datetime

print(f"\nPyTorch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"  GPU: {torch.cuda.get_device_name(0)}")
    print(f"  Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

# Set environment variables for GPU memory management
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
os.environ["CUDA_LAUNCH_BLOCKING"] = "0"

# Define key paths for Kaggle
KAGGLE_INPUT_PATH = f"{path}/sinhala-tts-dataset"
KAGGLE_WORKING_PATH = "/kaggle/working"
REPO_DIR = os.path.join(KAGGLE_WORKING_PATH, "XTTSv2-sinhala")
CHECKPOINTS_DIR = os.path.join(KAGGLE_WORKING_PATH, "checkpoints")
DATASET_OUTPUT_DIR = os.path.join(KAGGLE_WORKING_PATH, "datasets")
OUTPUT_DIR = os.path.join(KAGGLE_WORKING_PATH, "output")

# Create directories
for dir_path in [CHECKPOINTS_DIR, DATASET_OUTPUT_DIR, OUTPUT_DIR]:
    os.makedirs(dir_path, exist_ok=True)

os.chdir(KAGGLE_WORKING_PATH)

print(f"\nWorking directory: {os.getcwd()}")
print(f"Dataset path: {KAGGLE_INPUT_PATH}")

# Verify dataset access
if os.path.exists(KAGGLE_INPUT_PATH):
    print(f"\nDataset found! Contents:")
    for item in os.listdir(KAGGLE_INPUT_PATH):
        item_path = os.path.join(KAGGLE_INPUT_PATH, item)
        if os.path.isfile(item_path):
            size_mb = os.path.getsize(item_path) / (1024*1024)
            print(f"    {item} ({size_mb:.1f} MB)")
        else:
            item_count = len(os.listdir(item_path))
            print(f"    {item}/ ({item_count} items)")
else:
    print(f"\nERROR: Dataset not found at {KAGGLE_INPUT_PATH}")
    print("Make sure to add 'sinhala-tts-dataset' in notebook settings!")


PHASE 1: KAGGLE SETUP & ENVIRONMENT VERIFICATION

PyTorch Version: 2.6.0+cu124
CUDA Available: True
  GPU: Tesla P100-PCIE-16GB
  Memory: 15.89 GB

Working directory: /kaggle/working
Dataset path: /kaggle/input/sinhala-tts-dataset/sinhala-tts-dataset

Dataset found! Contents:
    metadata_eval.csv (0.0 MB)
    metadata_train.csv (0.2 MB)
    wavs/ (1251 items)


In [4]:
################################################################################
# CELL 2: CLONE REPOSITORY
################################################################################

print("\n" + "="*100)
print("PHASE 2: CLONE REPOSITORY")
print("="*100)

REPO_URL = "https://github.com/amalshafernando/XTTSv2-sinhala.git"

if os.path.exists(REPO_DIR):
    print(f"\nRepository already exists at {REPO_DIR}")
else:
    print(f"\nCloning from: {REPO_URL}")
    result = subprocess.run(
        ["git", "clone", REPO_URL, REPO_DIR],
        capture_output=True,
        text=True
    )
    
    if result.returncode == 0:
        print("Repository cloned successfully")
    else:
        print(f"ERROR cloning repository:")
        print(result.stderr)
        sys.exit(1)

sys.path.insert(0, REPO_DIR)
print(f"Repository added to Python path")

# List files in repo
repo_files = os.listdir(REPO_DIR)
print(f"\nRepository files ({len(repo_files)}):")
for f in sorted(repo_files)[:15]:
    print(f"  {f}")



PHASE 2: CLONE REPOSITORY

Cloning from: https://github.com/amalshafernando/XTTSv2-sinhala.git
Repository cloned successfully
Repository added to Python path

Repository files (18):
  .git
  .gitignore
  README.md
  TTS
  config_sinhala.py
  download_checkpoint.py
  extend_vocab_sinhala.py
  inference_sinhala.py
  kaggle_train_sinhala.py
  kagglebook.ipynb
  prepare_dataset_sinhala.py
  recipes
  requirements.txt
  sinhala_tts_complete_kaggle_notebook.ipynb
  train_dvae_xtts.py


In [5]:
################################################################################
# CELL 3: INSTALL DEPENDENCIES
################################################################################

print("\n" + "="*100)
print("PHASE 3: INSTALL DEPENDENCIES")
print("="*100)

packages = ["TTS", "tokenizers", "transformers", "pandas", "tqdm", "librosa"]

print("\nInstalling packages (this may take a few minutes)...")
for package in packages:
    print(f"  {package}...", end=" ", flush=True)
    result = subprocess.run(
        [sys.executable, "-m", "pip", "install", "-q", package],
        capture_output=True
    )
    if result.returncode == 0:
        print("OK")
    else:
        print("(skipped - may already exist)")

print("\nAll dependencies installed")




PHASE 3: INSTALL DEPENDENCIES

Installing packages (this may take a few minutes)...
  TTS... OK
  tokenizers... OK
  transformers... OK
  pandas... OK
  tqdm... OK
  librosa... OK

All dependencies installed


In [6]:
################################################################################
# CELL 4: LOAD CONFIGURATION
################################################################################

print("\n" + "="*100)
print("PHASE 4: LOAD CONFIGURATION")
print("="*100)

try:
    sys.path.insert(0, REPO_DIR)
    from config_sinhala import (
        LANGUAGE_CODE, EXTENDED_VOCAB_SIZE,
        BATCH_SIZE, GRADIENT_ACCUMULATION, 
        LEARNING_RATE, WEIGHT_DECAY, NUM_EPOCHS, 
        SAVE_STEP, MAX_TEXT_LENGTH, MAX_AUDIO_LENGTH
    )
    print("\nConfiguration loaded successfully")
except ImportError as e:
    print(f"\nERROR importing config_sinhala: {e}")
    sys.exit(1)

print(f"\nConfiguration Parameters:")
print(f"  Language: {LANGUAGE_CODE}")
print(f"  Vocab Size: {EXTENDED_VOCAB_SIZE}")
print(f"  Batch Size: {BATCH_SIZE}")
print(f"  Gradient Accumulation: {GRADIENT_ACCUMULATION}")
print(f"  Learning Rate: {LEARNING_RATE}")
print(f"  Num Epochs: {NUM_EPOCHS}")
print(f"  Max Text Length: {MAX_TEXT_LENGTH}")
print(f"  Max Audio Length: {MAX_AUDIO_LENGTH}")




PHASE 4: LOAD CONFIGURATION

Configuration loaded successfully

Configuration Parameters:
  Language: si
  Vocab Size: 15000
  Batch Size: 8
  Gradient Accumulation: 4
  Learning Rate: 5e-06
  Num Epochs: 5
  Max Text Length: 400
  Max Audio Length: 330750


In [7]:
################################################################################
# CELL 5: VERIFY DATASET FORMAT
################################################################################

print("\n" + "="*100)
print("PHASE 5: VERIFY DATASET FORMAT (CRITICAL)")
print("="*100)

import pandas as pd

metadata_train_path = os.path.join(KAGGLE_INPUT_PATH, "metadata_train.csv")
metadata_eval_path = os.path.join(KAGGLE_INPUT_PATH, "metadata_eval.csv")

print(f"\nChecking CSV files...")

# Read train metadata
try:
    df_train = pd.read_csv(metadata_train_path)
    print(f"\n[Train CSV]")
    print(f"  Columns: {list(df_train.columns)}")
    print(f"  Rows: {len(df_train)}")
    print(f"\n  First row data:")
    for col in df_train.columns:
        value = df_train.iloc[0][col]
        print(f"    {col}: {value}")
except Exception as e:
    print(f"ERROR reading train CSV: {e}")

# Read eval metadata
try:
    df_eval = pd.read_csv(metadata_eval_path)
    print(f"\n[Eval CSV]")
    print(f"  Columns: {list(df_eval.columns)}")
    print(f"  Rows: {len(df_eval)}")
except Exception as e:
    print(f"ERROR reading eval CSV: {e}")

# Find audio directory
print(f"\nSearching for audio files...")
audio_dir = None
possible_dirs = [
    os.path.join(KAGGLE_INPUT_PATH, "wav"),
    os.path.join(KAGGLE_INPUT_PATH, "wavs"),
    os.path.join(KAGGLE_INPUT_PATH, "audio"),
    KAGGLE_INPUT_PATH
]

for dir_path in possible_dirs:
    if os.path.exists(dir_path):
        wav_files = [f for f in os.listdir(dir_path) if f.endswith(('.wav', '.mp3', '.flac'))]
        if wav_files:
            audio_dir = dir_path
            print(f"  Found {len(wav_files)} audio files in: {os.path.basename(dir_path)}")
            print(f"  Sample: {wav_files[0]}")
            break

if not audio_dir:
    print(f"  WARNING: Could not find audio files")



PHASE 5: VERIFY DATASET FORMAT (CRITICAL)

Checking CSV files...

[Train CSV]
  Columns: ['audio_file_path|transcript|speaker_id']
  Rows: 1000

  First row data:
    audio_file_path|transcript|speaker_id: wavs/sin_2282_sin_2282_8427486285.wav|පොළොන්නරුවේ ගල් විහාරයේ හිඳි පිළිමය මෙවැන්නකි.|sin_2282

[Eval CSV]
  Columns: ['audio_file_path|transcript|speaker_id']
  Rows: 251

Searching for audio files...
  Found 1251 audio files in: wavs
  Sample: sin_3531_sin_3531_8500473878.wav


In [8]:
################################################################################
# CELL 6: PREPARE DATASET
################################################################################

print("\n" + "="*100)
print("PHASE 6: PREPARE DATASET")
print("="*100)

print(f"\nInput: {KAGGLE_INPUT_PATH}")
print(f"Output: {DATASET_OUTPUT_DIR}")

# Create output directories
os.makedirs(os.path.join(DATASET_OUTPUT_DIR, "wavs"), exist_ok=True)

prepare_script = os.path.join(REPO_DIR, "prepare_dataset_sinhala.py")

if not os.path.exists(prepare_script):
    print(f"ERROR: prepare_dataset_sinhala.py not found")
else:
    cmd = [
        sys.executable,
        prepare_script,
        f"--kaggle_path={KAGGLE_INPUT_PATH}",
        f"--output_path={DATASET_OUTPUT_DIR}"
    ]
    
    print(f"\nRunning dataset preparation...\n")
    result = subprocess.run(cmd, capture_output=True, text=True)
    
    print(result.stdout)
    if result.stderr:
        print(f"Warnings/Errors:\n{result.stderr}")
    
    if result.returncode != 0:
        print(f"\nReturn code: {result.returncode}")

# Verify output
print(f"\nDataset output structure:")
if os.path.exists(DATASET_OUTPUT_DIR):
    for item in os.listdir(DATASET_OUTPUT_DIR):
        item_path = os.path.join(DATASET_OUTPUT_DIR, item)
        if os.path.isfile(item_path):
            size = os.path.getsize(item_path)
            print(f"  {item} ({size} bytes)")
        else:
            file_count = len(os.listdir(item_path))
            print(f"  {item}/ ({file_count} files)")




PHASE 6: PREPARE DATASET

Input: /kaggle/input/sinhala-tts-dataset/sinhala-tts-dataset
Output: /kaggle/working/datasets

Running dataset preparation...


CONVERTING DATASET TO XTTS-v2 FORMAT

[2/3] Reading metadata files
    Train: /kaggle/input/sinhala-tts-dataset/sinhala-tts-dataset/metadata_train.csv
    Eval: /kaggle/input/sinhala-tts-dataset/sinhala-tts-dataset/metadata_eval.csv
    Train samples: 1000
    Eval samples: 251

❌ ERROR: train metadata missing required columns: {'transcript', 'speaker_id', 'audio_file_path'}

Traceback (most recent call last):
  File "/kaggle/working/XTTSv2-sinhala/prepare_dataset_sinhala.py", line 257, in main
    convert_metadata(args.kaggle_path, args.output_path)
  File "/kaggle/working/XTTSv2-sinhala/prepare_dataset_sinhala.py", line 104, in convert_metadata
    raise ValueError(f"{df_name} metadata missing required columns: {missing_cols}")
ValueError: train metadata missing required columns: {'transcript', 'speaker_id', 'audio_file_path'}


Re

In [None]:
################################################################################
# CELL 7: DOWNLOAD CHECKPOINTS
################################################################################

print("\n" + "="*100)
print("PHASE 7: DOWNLOAD PRETRAINED CHECKPOINTS")
print("="*100)

download_script = os.path.join(REPO_DIR, "download_checkpoint.py")

if not os.path.exists(download_script):
    print(f"ERROR: download_checkpoint.py not found")
else:
    cmd = [
        sys.executable,
        download_script,
        f"--output_path={CHECKPOINTS_DIR}"
    ]
    
    print(f"\nDownloading XTTS-v2 pretrained model...")
    print(f"This may take 5-10 minutes...\n")
    
    result = subprocess.run(cmd, capture_output=True, text=True)
    print(result.stdout)
    if result.stderr:
        print(f"Output: {result.stderr}")

# Verify checkpoints
print(f"\nCheckpoints downloaded:")
if os.path.exists(CHECKPOINTS_DIR):
    for item in os.listdir(CHECKPOINTS_DIR)[:10]:
        print(f"  {item}")


In [None]:
################################################################################
# CELL 8: EXTEND VOCABULARY
################################################################################

print("\n" + "="*100)
print("PHASE 8: EXTEND VOCABULARY FOR SINHALA")
print("="*100)

metadata_train = os.path.join(DATASET_OUTPUT_DIR, "metadata_train.csv")

if not os.path.exists(metadata_train):
    print(f"ERROR: metadata_train.csv not found")
    print(f"Dataset preparation may have failed")
else:
    extend_script = os.path.join(REPO_DIR, "extend_vocab_sinhala.py")
    
    if not os.path.exists(extend_script):
        print(f"ERROR: extend_vocab_sinhala.py not found")
    else:
        cmd = [
            sys.executable,
            extend_script,
            f"--output_path={CHECKPOINTS_DIR}",
            f"--metadata_path={metadata_train}",
            f"--language={LANGUAGE_CODE}",
            f"--extended_vocab_size={EXTENDED_VOCAB_SIZE}"
        ]
        
        print(f"\nExtending vocabulary for Sinhala...")
        print(f"This may take 2-5 minutes...\n")
        
        result = subprocess.run(cmd, capture_output=True, text=True)
        print(result.stdout)
        if result.stderr:
            print(f"Output: {result.stderr}")

# Clear GPU memory
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("\nGPU memory cleared")


In [None]:

################################################################################
# CELL 9: FINE-TUNE GPT MODEL
################################################################################

print("\n" + "="*100)
print("PHASE 9: FINE-TUNE GPT MODEL (MAIN TRAINING)")
print("="*100)

metadata_train = os.path.join(DATASET_OUTPUT_DIR, "metadata_train.csv")
metadata_eval = os.path.join(DATASET_OUTPUT_DIR, "metadata_eval.csv")

if not os.path.exists(metadata_train) or not os.path.exists(metadata_eval):
    print(f"ERROR: Metadata files not found")
else:
    train_script = os.path.join(REPO_DIR, "train_gpt_xtts.py")
    
    if not os.path.exists(train_script):
        print(f"ERROR: train_gpt_xtts.py not found")
    else:
        # Format metadatas parameter
        metadatas_param = f"{metadata_train},{metadata_eval},{LANGUAGE_CODE}"
        
        cmd = [
            sys.executable,
            train_script,
            f"--output_path={CHECKPOINTS_DIR}",
            f"--metadatas={metadatas_param}",
            f"--num_epochs={NUM_EPOCHS}",
            f"--batch_size={BATCH_SIZE}",
            f"--grad_acumm={GRADIENT_ACCUMULATION}",
            f"--max_text_length={MAX_TEXT_LENGTH}",
            f"--max_audio_length={MAX_AUDIO_LENGTH}",
            f"--weight_decay={WEIGHT_DECAY}",
            f"--lr={LEARNING_RATE}",
            f"--save_step={SAVE_STEP}"
        ]
        
        print(f"\nSTARTING GPT FINE-TUNING")
        print(f"Training Parameters:")
        print(f"  Epochs: {NUM_EPOCHS}")
        print(f"  Batch Size: {BATCH_SIZE}")
        print(f"  Gradient Accumulation: {GRADIENT_ACCUMULATION}")
        print(f"  Learning Rate: {LEARNING_RATE}")
        print(f"\nThis will take 2-4 hours. Please wait...\n")
        
        result = subprocess.run(cmd, capture_output=True, text=True)
        print(result.stdout)
        if result.stderr:
            print(f"Training output:\n{result.stderr}")



In [None]:
################################################################################
# CELL 10: TEST INFERENCE
################################################################################

print("\n" + "="*100)
print("PHASE 10: TEST INFERENCE WITH FINE-TUNED MODEL")
print("="*100)

# Find fine-tuned checkpoint
gpt_dirs = [d for d in os.listdir(CHECKPOINTS_DIR) if 'GPT_XTTS_FT' in d]

if not gpt_dirs:
    print(f"\nNo fine-tuned GPT checkpoint found")
else:
    latest_checkpoint = sorted(gpt_dirs)[-1]
    checkpoint_path = os.path.join(CHECKPOINTS_DIR, latest_checkpoint)
    
    print(f"\nFine-tuned checkpoint found: {latest_checkpoint}")
    print(f"Path: {checkpoint_path}")
    
    # List model files
    model_files = os.listdir(checkpoint_path)
    print(f"\nCheckpoint contents:")
    for file in model_files[:10]:
        print(f"  {file}")
    if len(model_files) > 10:
        print(f"  ... and {len(model_files) - 10} more files")


In [None]:
################################################################################
# CELL 11: PACKAGE RESULTS
################################################################################

print("\n" + "="*100)
print("PHASE 11: PACKAGE RESULTS FOR DOWNLOAD")
print("="*100)

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_package = os.path.join(OUTPUT_DIR, f"sinhala-xtts-finetuned_{timestamp}")
os.makedirs(output_package, exist_ok=True)

print(f"\nPackaging results...")

# Copy fine-tuned models
gpt_dirs = [d for d in os.listdir(CHECKPOINTS_DIR) if 'GPT_XTTS_FT' in d]
for checkpoint_dir in gpt_dirs:
    src = os.path.join(CHECKPOINTS_DIR, checkpoint_dir)
    dst = os.path.join(output_package, checkpoint_dir)
    print(f"  Copying {checkpoint_dir}...")
    if os.path.exists(dst):
        shutil.rmtree(dst)
    shutil.copytree(src, dst)

# Copy inference script
for script_name in ["inference_sinhala.py", "config_sinhala.py"]:
    src = os.path.join(REPO_DIR, script_name)
    if os.path.exists(src):
        dst = os.path.join(output_package, script_name)
        shutil.copy2(src, dst)
        print(f"  Copied {script_name}")

print(f"\nResults packaged at: {output_package}")
print(f"\nContents:")
for f in os.listdir(output_package):
    print(f"  {f}")



In [None]:
################################################################################
# CELL 12: SUMMARY
################################################################################

print("\n" + "="*100)
print("TRAINING COMPLETE!")
print("="*100)

print("""
Your Sinhala XTTS-v2 model has been successfully fine-tuned!

Completed Phases:
  1. Environment setup
  2. Repository clone
  3. Dependencies install
  4. Configuration load
  5. Dataset verification
  6. Dataset preparation
  7. Checkpoint download
  8. Vocabulary extension
  9. GPT fine-tuning
  10. Inference testing
  11. Results packaging

Output Location:
  /kaggle/working/output/sinhala-xtts-finetuned_TIMESTAMP/

Download these files:
  - Fine-tuned model checkpoint
  - inference_sinhala.py
  - config_sinhala.py

Next Steps:
  1. Download the output folder
  2. Use inference_sinhala.py to generate Sinhala speech
  3. Integrate into your application
  4. Fine-tune further with more data if needed

Thank you!
""")
