<a href="https://colab.research.google.com/github/arhammxo/mealMedic/blob/main/tts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git clone https://github.com/jaywalnut310/vits.git

In [None]:
# `!cd` runs in a temporary subshell, so it would not change the notebook's
# working directory; the `%cd` magic makes the change persist across cells.
%cd /content/vits

In [None]:
!pip install -r /content/vits/requirements.txt

In [None]:
%cd /content/vits/monotonic_align

In [None]:
!ls

In [None]:
%mkdir monotonic_align

In [None]:
!python setup.py build_ext --inplace

In [None]:
!wget https://indic-asr-public.objectstore.e2enetworks.net/svarah.tar

In [None]:
!tar -xvf svarah.tar

In [None]:
%cd svarah
!ls

In [None]:
import json
import random
from typing import List, Dict
import os

def read_json_lines(file_path: str) -> List[Dict]:
    """Parse a JSON Lines file into a list of decoded objects.

    Args:
        file_path: Path of the UTF-8 encoded file holding one JSON document
            per line.

    Returns:
        The decoded value of every non-blank line, in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        # Whitespace-only lines carry no record and are ignored.
        return [json.loads(raw) for raw in handle if raw.strip()]

def format_line(entry: Dict) -> str:
    """Render one manifest entry as a filelist row.

    Args:
        entry: Mapping that provides the ``audio_filepath`` and ``text`` keys.

    Returns:
        The two values joined by a pipe: ``<audio_filepath>|<text>``.
    """
    wav_path = entry['audio_filepath']
    transcript = entry['text']
    return "{}|{}".format(wav_path, transcript)

def split_and_save_data(data: List[Dict], output_dir: str, train_ratio=0.8, test_ratio=0.1, val_ratio=0.1, seed=None):
    """Shuffle the entries and write train/test/val filelists.

    Three files (``train.txt``, ``test.txt``, ``val.txt``) are written into
    ``output_dir``, one ``format_line`` row per entry, and a one-line summary
    is printed for each file.

    Args:
        data: Manifest entries to split. The list itself is left untouched;
            a shuffled copy is split instead.
        output_dir: Directory receiving the three files (created if missing).
        train_ratio: Fraction of entries placed in the training split.
        test_ratio: Fraction of entries placed in the test split.
        val_ratio: Accepted for interface compatibility but not used in the
            computation: the validation split always receives whatever
            remains after the train and test splits have been taken.
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` generator is used.
    """
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Shuffle a copy so the caller's list keeps its original order.
    shuffled = list(data)
    if seed is None:
        random.shuffle(shuffled)
    else:
        random.Random(seed).shuffle(shuffled)

    # Split boundaries; both counts are floored, so rounding leftovers end up
    # in the validation split.
    total = len(shuffled)
    train_end = int(total * train_ratio)
    test_end = train_end + int(total * test_ratio)

    splits = {
        'train.txt': shuffled[:train_end],
        'test.txt': shuffled[train_end:test_end],
        'val.txt': shuffled[test_end:],
    }

    for filename, split_data in splits.items():
        output_path = os.path.join(output_dir, filename)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.writelines(format_line(entry) + '\n' for entry in split_data)

        # Guard the percentage against an empty dataset (total == 0).
        share = len(split_data) / total * 100 if total else 0.0
        print(f"{filename}: {len(split_data)} entries ({share:.1f}%)")

def main(input_file: str = 'svarah_manifest.json', output_dir: str = 'splits'):
    """Read a JSON Lines manifest and write train/test/val filelists.

    Args:
        input_file: Path of the JSON Lines manifest to read.
        output_dir: Directory that receives the generated split files.
    """
    # Read data
    print("Reading data...")
    data = read_json_lines(input_file)
    print(f"Total entries: {len(data)}")

    # Split and save data
    print("\nSplitting and saving data...")
    split_and_save_data(data, output_dir)

    # Report the directory that was actually used rather than a fixed name.
    print(f"\nDone! Files have been saved in the '{output_dir}' directory.")

if __name__ == "__main__":
    main()

In [None]:
%cd /content/vits
!ls

In [None]:
!apt-get install espeak -y

In [None]:
!pip install espeakng
!pip install --upgrade espeakng

# After installing, restart the runtime

In [None]:
!python preprocess.py --text_index 1 --filelists /content/vits/monotonic_align/svarah/splits/train.txt /content/vits/monotonic_align/svarah/splits/val.txt /content/vits/monotonic_align/svarah/splits/test.txt

In [None]:
%%writefile conf.py

import json
import argparse
from pathlib import Path

def create_vits_config(
    train_filelist: str,
    val_filelist: str,
    output_path: str,
    sampling_rate: int = 22050,
    batch_size: int = 32,
    n_speakers: int = 0,
    epochs: int = 20000,
    is_cleaned_text: bool = True,
    language_cleaners: "list | None" = None
):
    """
    Create a VITS configuration file based on input parameters.

    The configuration is written to ``output_path`` as JSON (parent
    directories are created when missing) and also returned.

    Args:
        train_filelist: Path to training filelist
        val_filelist: Path to validation filelist
        output_path: Where to save the config file
        sampling_rate: Audio sampling rate (must be positive)
        batch_size: Training batch size
        n_speakers: Number of speakers (0 for single speaker)
        epochs: Number of training epochs
        is_cleaned_text: Whether text has been cleaned
        language_cleaners: List of text cleaners to apply; defaults to
            ``["english_cleaners2"]`` when omitted

    Returns:
        The configuration dictionary that was written to disk.

    Raises:
        ValueError: If ``sampling_rate`` is not positive.
    """
    if sampling_rate <= 0:
        raise ValueError(f"sampling_rate must be positive, got {sampling_rate}")

    # Build a fresh list on every call so the returned config never shares
    # state with a default argument or with the caller's own list.
    if language_cleaners is None:
        cleaners = ["english_cleaners2"]
    elif isinstance(language_cleaners, str):
        cleaners = [language_cleaners]
    else:
        cleaners = list(language_cleaners)

    # The decoder upsamples each spectrogram frame by the product of these
    # rates, so the STFT hop length has to equal that product for generated
    # and reference waveform segments to have the same length.
    upsample_rates = [8, 8, 2, 2]
    hop_length = 1
    for rate in upsample_rates:
        hop_length *= rate

    # Keep the training segment at roughly the same duration for other
    # sampling rates, rounded down to a whole number of frames.
    scaled_segment = int(8192 * (sampling_rate / 22050))
    segment_size = max(hop_length, scaled_segment - scaled_segment % hop_length)

    config = {
        "train": {
            "log_interval": 200,
            "eval_interval": 1000,
            "seed": 1234,
            "epochs": epochs,
            "learning_rate": 2e-4,
            "betas": [0.8, 0.99],
            "eps": 1e-9,
            "batch_size": batch_size,
            "fp16_run": True,
            "lr_decay": 0.999875,
            "segment_size": segment_size,
            "init_lr_ratio": 1,
            "warmup_epochs": 0,
            "c_mel": 45,
            "c_kl": 1.0
        },
        "data": {
            "training_files": train_filelist,
            "validation_files": val_filelist,
            "text_cleaners": cleaners,
            "max_wav_value": 32768.0,
            "sampling_rate": sampling_rate,
            "filter_length": 1024,
            "hop_length": hop_length,
            "win_length": 1024,
            "n_mel_channels": 80,
            "mel_fmin": 0.0,
            "mel_fmax": None,
            "add_blank": True,
            "n_speakers": n_speakers,
            "cleaned_text": is_cleaned_text
        },
        "model": {
            "inter_channels": 192,
            "hidden_channels": 192,
            "filter_channels": 768,
            "n_heads": 2,
            "n_layers": 6,
            "kernel_size": 3,
            "p_dropout": 0.1,
            "resblock": "1",
            "resblock_kernel_sizes": [3,7,11],
            "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
            "upsample_rates": upsample_rates,
            "upsample_initial_channel": 512,
            "upsample_kernel_sizes": [16,16,4,4],
            "n_layers_q": 3,
            "use_spectral_norm": False
        }
    }

    # Create output directory if it doesn't exist
    output_dir = Path(output_path).parent
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save the configuration
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(config, f, indent=2)

    print(f"Configuration file saved to: {output_path}")
    return config

if __name__ == "__main__":
    # Command-line front end: collect the options and forward them to
    # create_vits_config.
    cli = argparse.ArgumentParser(description="Generate VITS configuration file")

    # Mandatory path options.
    for flag, description in (
        ("--train_filelist", "Path to training filelist"),
        ("--val_filelist", "Path to validation filelist"),
        ("--output", "Output path for config file"),
    ):
        cli.add_argument(flag, required=True, help=description)

    # Optional integer settings with their defaults.
    for flag, fallback, description in (
        ("--sampling_rate", 22050, "Audio sampling rate"),
        ("--batch_size", 32, "Training batch size"),
        ("--n_speakers", 0, "Number of speakers (0 for single speaker)"),
        ("--epochs", 20000, "Number of training epochs"),
    ):
        cli.add_argument(flag, type=int, default=fallback, help=description)

    # Passing this flag stores False in is_cleaned_text (default is True).
    cli.add_argument(
        "--no_cleaned_text",
        action="store_false",
        dest="is_cleaned_text",
        help="Set this flag if text is not cleaned",
    )
    cli.add_argument(
        "--language_cleaners",
        nargs="+",
        default=["english_cleaners2"],
        help="List of text cleaners to apply",
    )

    # Every parsed option maps onto a keyword of create_vits_config; only
    # --output is named differently (output_path).
    settings = dict(vars(cli.parse_args()))
    settings["output_path"] = settings.pop("output")
    create_vits_config(**settings)

In [None]:
# The generated config sets "cleaned_text": true by default, so it must point
# at the "*.cleaned" filelists written by preprocess.py, not the raw ones.
# NOTE(review): confirm the dataset's audio sampling rate and pass
# --sampling_rate if it differs from the 22050 default.
!python conf.py \
    --train_filelist "/content/vits/monotonic_align/svarah/splits/train.txt.cleaned" \
    --val_filelist "/content/vits/monotonic_align/svarah/splits/val.txt.cleaned" \
    --output "configs/your_config.json"

In [None]:
!python train.py -c configs/your_config.json -m your_model_name