<a href="https://colab.research.google.com/github/aqeelabpro/BitNet/blob/main/stt_tts_engine_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Prerequisites
#### This cell installs everything needed for scraping, audio generation, and deep learning training.

In [None]:
# Install libraries for scraping, audio processing, and AI training
!pip install wikipedia-api parler-tts soundfile pandas tqdm accelerate datasets
!pip install git+https://github.com/huggingface/parler-tts.git

# 2. Scraping & JSON Generation
#### This cell crawls Wikipedia to build the `texts.json` dataset.

In [None]:
import wikipediaapi, json, re, os

# --- Crawl settings ---
SEED_TOPIC = "Artificial intelligence"  # article the crawl starts from
TARGET_SENTENCES = 2_000                # sentences to collect (~3 hours of audio)
OUTPUT_FILE = "texts.json"              # JSON file the collected sentences are saved to

# Client for the English-language Wikipedia API.
wiki = wikipediaapi.Wikipedia(
    language="en",
    user_agent="TTSDataCollector/1.0",
)

def clean_text(text):
    """Normalise a raw sentence for use as a TTS transcript.

    Removes numeric citation markers such as ``[12]``, removes parenthesised
    asides (including nested ones) together with the whitespace in front of
    them, and collapses every run of whitespace into a single space.

    Args:
        text: Raw sentence text.

    Returns:
        The cleaned sentence with no leading or trailing whitespace.
    """
    text = re.sub(r'\[\d+\]', '', text)  # Remove citations
    # Strip innermost parentheticals repeatedly so nested ones disappear
    # completely. The leading \s* avoids leaving "word , next" behind.
    previous = None
    while previous != text:
        previous = text
        text = re.sub(r'\s*\([^()]*\)', '', text)
    return ' '.join(text.split())

from collections import deque

tasks = []               # collected {"file", "text"} records, in order
visited = set()          # topics already looked up (whether or not they exist)
seen_sentences = set()   # cleaned sentences already collected (exact-duplicate filter)
queue = deque([SEED_TOPIC])  # breadth-first frontier of topics to fetch

print(f"🚀 Collecting {TARGET_SENTENCES} sentences...")
while len(tasks) < TARGET_SENTENCES and queue:
    topic = queue.popleft()
    if topic in visited:
        continue
    # Mark before the existence check so missing pages are not fetched again.
    visited.add(topic)
    page = wiki.page(topic)
    if not page.exists():
        continue

    # Split line by line first: section headings and paragraph breaks are
    # separated by newlines, which the sentence regex (spaces only) would
    # otherwise glue onto the neighbouring sentence.
    sentences = [
        s
        for line in page.text.splitlines()
        for s in re.split(r'(?<=[.!?]) +', line)
    ]
    for s in sentences:
        cleaned = clean_text(s)
        if cleaned in seen_sentences:
            continue
        # Keep sentences of 10-25 words that start with a letter or digit.
        if 10 <= len(cleaned.split()) <= 25 and cleaned[0].isalnum():
            seen_sentences.add(cleaned)
            tasks.append({"file": f"audio_{len(tasks):04d}.wav", "text": cleaned})
        if len(tasks) >= TARGET_SENTENCES:
            break
    # Follow only the first 10 links of each page to keep the crawl bounded.
    queue.extend(list(page.links.keys())[:10])

with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
    json.dump(tasks, f, indent=2)
print(f"✅ Saved {len(tasks)} sentences to {OUTPUT_FILE}")

# 3. Audio Generation (Resumable)
#### This cell generates the .wav files. If it stops, just run it again; it skips existing files.

In [None]:
import torch
import soundfile as sf
import json
import os
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
from tqdm.notebook import tqdm

# -------- Setup --------
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Half precision is only used on GPU; on CPU fall back to float32, where
# float16 kernels are slow or unavailable.
model_dtype = torch.float16 if device != "cpu" else torch.float32

# Load model with eager attention (required)
model = ParlerTTSForConditionalGeneration.from_pretrained(
    "parler-tts/parler-tts-mini-v1",
    attn_implementation="eager",
    torch_dtype=model_dtype
).to(device)

# Optional: compile model for faster inference (PyTorch 2+)
# NOTE(review): torch.compile is lazy, so compilation errors usually surface on
# the first forward pass rather than here -- confirm this guard catches what is
# intended, and that generate() actually runs the compiled forward.
try:
    model = torch.compile(model, mode="default")
except Exception as e:
    print("Model compilation skipped or failed:", e)

tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-v1")

# -------- Voice Description --------
voice_description = "A male speaker delivers a very clear and steady speech at a moderate pace in a quiet environment."
# Tokenise the description once and place every resulting tensor on the target
# device up front, which avoids device-mismatch warnings during generation.
desc_inputs = {
    name: tensor.to(device)
    for name, tensor in tokenizer(voice_description, return_tensors="pt", padding=True).items()
}

# -------- Load Dataset --------
# Sentence list produced by the scraping step: one {"file", "text"} record each.
with open("texts.json", "r") as dataset_file:
    tasks = json.load(dataset_file)

# Output directory for the generated .wav files.
os.makedirs("audio", exist_ok=True)

# -------- Load Progress Log --------
# progress.txt holds one finished filename per line, written by earlier runs.
progress_file = "progress.txt"
if os.path.exists(progress_file):
    with open(progress_file, "r") as log:
        completed = {entry.strip() for entry in log}
else:
    completed = set()

print(f"Starting generation for {len(tasks)} items...")

# -------- Generation Loop with Detailed Progress --------
# Upper bound on generated audio-codec frames per sentence.
# NOTE(review): the codec config printed by this notebook reports 86 frames/s,
# so the previous cap of 600 was about 7 s of audio and could cut off sentences
# of up to 25 words. 1290 is about 15 s; generation still stops early at
# end-of-sequence. Confirm the frame rate before tuning.
MAX_NEW_TOKENS = 1290

for item in tqdm(tasks, desc="Overall Progress", unit="file"):
    filename = item["file"]
    path = os.path.join("audio", filename)

    # Skip if already complete. Files found on disk are recorded as well so
    # that the final summary counts them.
    if filename in completed or os.path.exists(path):
        completed.add(filename)
        continue

    # Tokenize text
    prompt_inputs = tokenizer(item["text"], return_tensors="pt", padding=True)
    prompt_inputs = {k: v.to(device) for k, v in prompt_inputs.items()}

    try:
        # Detailed per-file progress
        with tqdm(total=1, desc=f"Generating {filename}", leave=False) as file_bar:
            with torch.inference_mode():
                generation = model.generate(
                    input_ids=desc_inputs["input_ids"],
                    attention_mask=desc_inputs["attention_mask"],
                    prompt_input_ids=prompt_inputs["input_ids"],
                    prompt_attention_mask=prompt_inputs["attention_mask"],
                    max_new_tokens=MAX_NEW_TOKENS,
                    do_sample=True,
                    temperature=1.0
                )
                file_bar.update(1)

        # Save audio. The model may run in float16, which soundfile cannot
        # write, so cast to float32 before converting to NumPy.
        audio_arr = generation.to(torch.float32).cpu().numpy().squeeze()
        # Write to a temporary name and rename atomically: an interrupted run
        # must never leave a truncated .wav that a later run would skip.
        tmp_path = path + ".part"
        sf.write(tmp_path, audio_arr, model.config.sampling_rate, format="WAV")
        os.replace(tmp_path, path)

        # Log progress
        with open(progress_file, "a") as pf:
            pf.write(filename + "\n")
        completed.add(filename)

    except Exception as e:
        # Best effort: report the failure and continue with the next sentence.
        print(f"Error generating {filename}: {e}")

# Count only entries that belong to the current task list.
generated_count = sum(1 for item in tasks if item["file"] in completed)

print("\n✅ Audio generation complete!")
print(f"Total files generated: {generated_count} / {len(tasks)}")


  "_name_or_path": "google/flan-t5-large",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2816,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "transformers_version": "4.46.1",
  "use_cache": true,
  "vocab_size": 32128
}

  "_name_or_path": "parler-tts/dac_44khZ_8kbps",
  "architectures": [
    "DACModel"
  ],
  "codebook_size": 1024,
  "frame_rate": 86,
  "latent_dim": 1024,
  "model_bitrate": 8,
  "model_type": "dac_on_the_hub",


Starting generation for 2000 items...


Overall Progress:   0%|          | 0/2000 [00:00<?, ?file/s]

Generating audio_0000.wav:   0%|          | 0/1 [00:00<?, ?it/s]

# 4. Training (Fine-Tuning)
#### This final step converts the JSON into a CSV training manifest (`train.csv`) and launches fine-tuning.

In [None]:
import pandas as pd

# Prepare manifest
with open('texts.json', 'r') as f:
    df = pd.DataFrame(json.load(f))
df['audio_path'] = df['file'].apply(lambda x: os.path.abspath(f"audio/{x}"))
df['description'] = voice_description
df.rename(columns={'text': 'transcription'}).to_csv("train.csv", index=False)

# Start Fine-Tuning
!python parler-tts/training/run_parler_tts_training.py \
    --model_name_or_path "parler-tts/parler-tts-mini-v1" \
    --train_file "train.csv" \
    --text_column_name "transcription" \
    --description_column_name "description" \
    --audio_column_name "audio_path" \
    --output_dir "./my_custom_model" \
    --per_device_train_batch_size 2 \
    --gradient_accumulation_steps 4 \
    --max_steps 1000 \
    --learning_rate 2e-5 \
    --fp16 True