In [None]:

!pip install datasets transformers torch torchaudio soundfile librosa

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupt

In [None]:
# Mount Google Drive at /content/drive (Colab only). The project directory
# used throughout this notebook (DRIVE_DIR) lives under this mount point, so
# this cell has to run before anything reads from or writes to it.
from google.colab import drive
drive.mount('/content/drive')

# Import required libraries
from datasets import load_dataset
import torch
import torchaudio
import json
import os
import numpy as np
from pathlib import Path
import soundfile as sf

Mounted at /content/drive


In [None]:

# Import required libraries
from datasets import load_dataset
import torch
import torchaudio
import json
import os
import numpy as np
import soundfile as sf

In [None]:
# Project root inside the mounted Google Drive, as a pathlib.Path.
# NOTE(review): a later cell rebinds DRIVE_DIR to a plain str holding the same
# location, so the value the functions below see is that str.
DRIVE_DIR = Path("/content/drive/My Drive/VITS_Bangla_779")

In [None]:
# Define base directory in Google Drive.
# NOTE: an earlier cell binds DRIVE_DIR to a pathlib.Path for the same
# location; this assignment rebinds it to a plain str.
DRIVE_DIR = "/content/drive/My Drive/VITS_Bangla_779"

def setup_environment(base_dir=None):
    """Create the project directory tree and select the torch device.

    Args:
        base_dir: Root directory under which the project folders are created.
            Defaults to the module-level ``DRIVE_DIR``.

    Returns:
        torch.device: ``cuda`` when ``torch.cuda.is_available()`` is true,
        otherwise ``cpu``.
    """
    root = DRIVE_DIR if base_dir is None else base_dir

    # Create the project root and every sub-directory; exist_ok makes the
    # call safe to repeat when the cell is re-run.
    os.makedirs(root, exist_ok=True)
    for subdir in ("dataset_779", "bn_vits_output", "filelists"):
        os.makedirs(os.path.join(root, subdir), exist_ok=True)

    # Setup CUDA: enable the cuDNN benchmark flag and prefer the GPU.
    torch.backends.cudnn.benchmark = True
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    return device

In [None]:
def load_and_filter_dataset(speaker_id="779", dataset_name="intelsense/openslr-bangla"):
    """Load a dataset and keep only the rows belonging to one speaker.

    Args:
        speaker_id: Speaker to keep. It is compared as a string against
            ``str(row["speaker_id"])``, so ``779`` and ``"779"`` are
            equivalent. Defaults to ``"779"``.
        dataset_name: Identifier passed to ``load_dataset``.

    Returns:
        The object returned by ``dataset.filter`` (indexable by split name).

    Raises:
        ValueError: If the filtered ``train`` split contains no rows.
    """
    wanted = str(speaker_id)
    dataset = load_dataset(dataset_name)

    # Compare string forms so integer and string speaker IDs both match.
    filtered_dataset = dataset.filter(
        lambda x: str(x["speaker_id"]) == wanted
    )

    filtered_count = len(filtered_dataset['train'])
    print(f"Total samples for speaker {wanted}: {filtered_count}")

    if filtered_count == 0:
        raise ValueError(f"No samples found for speaker ID {wanted}")

    return filtered_dataset


In [None]:
def save_audio_file(audio_data, output_path, target_sr=22050):
    """Write one audio sample to ``output_path``, resampling when needed.

    Args:
        audio_data: Mapping with an ``'array'`` entry (the samples; a NumPy
            array is required when resampling, since ``torch.from_numpy`` is
            used) and a ``'sampling_rate'`` entry.
        output_path: Destination path handed to ``sf.write``.
        target_sr: Sampling rate of the written file. Defaults to 22050 Hz.

    Returns:
        ``output_path`` unchanged, so callers can collect the saved paths.

    Raises:
        Exception: Any failure is logged with ``print`` and re-raised.
    """
    try:
        array = audio_data['array']
        sampling_rate = audio_data['sampling_rate']

        # Convert to the target rate if needed
        if sampling_rate != target_sr:
            print(f"Resampling from {sampling_rate} Hz to {target_sr} Hz")
            resampler = torchaudio.transforms.Resample(sampling_rate, target_sr)
            tensor_audio = torch.from_numpy(array).float()
            # Give 1-D input a leading dimension before resampling; the
            # squeeze below removes it again.
            if len(tensor_audio.shape) == 1:
                tensor_audio = tensor_audio.unsqueeze(0)
            tensor_audio = resampler(tensor_audio)
            array = tensor_audio.squeeze().numpy()
            sampling_rate = target_sr

        # Save with the (possibly updated) sampling rate
        sf.write(output_path, array, sampling_rate)
        print(f"Successfully saved audio to {output_path}")
        return output_path
    except Exception as e:
        print(f"Error saving audio file: {str(e)}")
        raise

In [None]:
def _write_filelist(filelist_path, entries):
    """Write one ``path|text`` line per (path, text) entry, UTF-8 encoded."""
    with open(filelist_path, "w", encoding="utf-8") as f:
        for path, text in entries:
            # The filelist is line-based: fold any line breaks inside a
            # transcript into spaces so each entry stays on a single line.
            single_line = " ".join(str(text).splitlines())
            f.write(f"{path}|{single_line}\n")


def prepare_dataset_files(dataset):
    """Export every training sample to disk and write train/val filelists.

    Each item of ``dataset['train']`` is expected to provide ``'audio'``
    (passed to ``save_audio_file``) and ``'text'``. Audio is written to
    ``<DRIVE_DIR>/dataset_779/audio_<idx>.wav``. Samples that fail are logged
    and skipped. The saved samples are split 90/10 in dataset order (no
    shuffling) into ``<DRIVE_DIR>/filelists/train.txt`` and ``val.txt``.

    Args:
        dataset: Mapping-like object with a ``'train'`` split.
    """
    print("Preparing dataset files...")

    dataset_dir = os.path.join(DRIVE_DIR, "dataset_779")
    filelist_dir = os.path.join(DRIVE_DIR, "filelists")
    # Create the output folders here so this function does not depend on
    # setup_environment() having been called first.
    os.makedirs(dataset_dir, exist_ok=True)
    os.makedirs(filelist_dir, exist_ok=True)

    samples = dataset['train']
    # (saved wav path, transcript) pairs. Keeping both values in one tuple
    # means a sample that fails half-way can never leave paths and texts
    # misaligned.
    entries = []

    print(f"Processing {len(samples)} samples...")
    for idx, item in enumerate(samples):
        try:
            output_path = os.path.join(dataset_dir, f"audio_{idx:04d}.wav")
            saved_path = save_audio_file(item['audio'], output_path)
            entries.append((saved_path, item['text']))

            if idx % 10 == 0:  # Progress update
                print(f"Processed {idx + 1} files...")
        except Exception as e:
            print(f"Error processing sample {idx}: {str(e)}")
            continue

    # Split into train/validation (90/10), preserving dataset order
    train_size = int(0.9 * len(entries))
    train_entries = entries[:train_size]
    val_entries = entries[train_size:]

    _write_filelist(os.path.join(filelist_dir, "train.txt"), train_entries)
    _write_filelist(os.path.join(filelist_dir, "val.txt"), val_entries)

    print(f"Created training set with {len(train_entries)} samples")
    print(f"Created validation set with {len(val_entries)} samples")


In [None]:
def create_config():
    """Assemble the fine-tuning settings, store them as JSON, and return them.

    The settings are grouped into ``train``, ``data`` and ``model`` sections.
    They are written to ``finetune_ben.json`` inside ``DRIVE_DIR``, and the
    same dictionary is returned to the caller.
    """
    # Optimisation schedule
    train_section = {
        "batch_size": 2,
        "epochs": 1000,
        "learning_rate": 2e-4,
        "save_every": 1000,
        "validation_every": 1000,
        "segment_size": 8192,
    }

    # Filelist locations and audio parameters
    data_section = {
        "training_files": os.path.join(DRIVE_DIR, "filelists/train.txt"),
        "validation_files": os.path.join(DRIVE_DIR, "filelists/val.txt"),
        "text_cleaners": ["bangla_cleaners"],
        "sampling_rate": 22050,
        "filter_length": 1024,
        "hop_length": 256,
        "win_length": 1024,
        "max_wav_value": 32768.0,
    }

    # Base checkpoint and channel sizes
    model_section = {
        "base_model": "facebook/mms-tts-ben",
        "speaker_id": 779,
        "inter_channels": 192,
        "hidden_channels": 192,
        "filter_channels": 768,
    }

    settings = {
        "train": train_section,
        "data": data_section,
        "model": model_section,
    }

    # Persist next to the other project artefacts
    config_path = os.path.join(DRIVE_DIR, "finetune_ben.json")
    serialized = json.dumps(settings, indent=2)
    with open(config_path, "w") as handle:
        handle.write(serialized)

    print(f"Created configuration file: {config_path}")
    return settings


In [None]:
def _run_and_echo(command):
    """Run ``command`` (an argv list), print its output, return its exit code.

    Returns ``None`` when the executable could not be started at all, so the
    caller can treat the step as best-effort instead of crashing.
    """
    import subprocess  # local import: only this driver cell shells out

    try:
        completed = subprocess.run(command, capture_output=True, text=True)
    except OSError as exc:
        print(f"Could not run {command[0]}: {exc}")
        return None
    # Echo stdout followed by stderr so the notebook shows the tool's output.
    print(completed.stdout + completed.stderr, end="")
    return completed.returncode


def main():
    """Run the data-preparation pipeline end to end.

    Steps: create the project folders, clone the VITS retraining repository
    when it is not present in the working directory, load and filter the
    dataset, export audio and filelists, write the fine-tuning config, and
    list the resulting project directory.

    Raises:
        Exception: Any error from the pipeline steps is logged and re-raised.
            A failed ``git clone`` is only reported as a warning because none
            of the following steps in this function read from the repository.
    """
    try:
        # Setup environment
        print("Setting up environment...")
        setup_environment()

        # Clone VITS repository if needed
        if not os.path.exists("VitsRetrain-bangla"):
            clone_status = _run_and_echo([
                "git", "clone",
                "https://github.com/Intelsense-Excellence/VitsRetrain-bangla.git",
            ])
            if clone_status != 0:
                print(
                    "Warning: could not clone VitsRetrain-bangla "
                    f"(exit status: {clone_status}); continuing without it."
                )

        # Load and filter dataset
        print("Loading and filtering dataset...")
        dataset = load_and_filter_dataset()

        # Prepare dataset files
        print("Preparing dataset files...")
        prepare_dataset_files(dataset)

        # Create configuration
        print("Creating configuration...")
        create_config()

        print(f"\nAll files saved to Google Drive at: {DRIVE_DIR}")
        print("\nContents of the project directory:")
        _run_and_echo(["ls", "-R", str(DRIVE_DIR)])

    except Exception as e:
        print(f"Error occurred: {str(e)}")
        raise

if __name__ == "__main__":
    main()

Setting up environment...
Using device: cuda
Cloning into 'VitsRetrain-bangla'...
fatal: could not read Username for 'https://github.com': No such device or address
Loading and filtering dataset...
Total samples for speaker 779: 204
Preparing dataset files...
Preparing dataset files...
Processing 204 samples...
Resampling from 48000 Hz to 22050 Hz
Successfully saved audio to /content/drive/My Drive/VITS_Bangla_779/dataset_779/audio_0000.wav
Processed 1 files...
Resampling from 48000 Hz to 22050 Hz
Successfully saved audio to /content/drive/My Drive/VITS_Bangla_779/dataset_779/audio_0001.wav
Resampling from 48000 Hz to 22050 Hz
Successfully saved audio to /content/drive/My Drive/VITS_Bangla_779/dataset_779/audio_0002.wav
Resampling from 48000 Hz to 22050 Hz
Successfully saved audio to /content/drive/My Drive/VITS_Bangla_779/dataset_779/audio_0003.wav
Resampling from 48000 Hz to 22050 Hz
Successfully saved audio to /content/drive/My Drive/VITS_Bangla_779/dataset_779/audio_0004.wav
Resamp

In [None]:
def new_file(this, that):
    """Return True when ``this`` is not contained in ``that``.

    NOTE(review): the original cell had no parameter list and referred to the
    unbound names ``this`` and ``that``; they are taken as arguments here.
    Presumably ``this`` is a candidate file and ``that`` a collection of
    already-known files - confirm the intended use.
    """
    return this not in that