# Creating Calibration Dataset for IndicVoices (All 22 Languages)

This notebook loads the IndicVoices dataset for all 22 Indian languages, selects a demographically diverse subset of samples per language, preprocesses the audio into mel-spectrograms, and saves the resulting calibration dataset for post-training quantization (PTQ) of the Indic Conformer model.

In [None]:
# Download the uv installer script from astral.sh and pipe it to `sh`.
# curl flags: -L follow redirects, -sS silent but still show errors, -f fail on HTTP errors.
!curl -LsSf https://astral.sh/uv/install.sh | sh

# Install the packages this notebook relies on through uv's pip interface.
# NOTE(review): assumes the freshly installed `uv` binary is already on PATH for this shell — confirm.
!uv pip install datasets torchaudio numpy pandas pyarrow huggingface-hub torchcodec torch

In [None]:
import os
import numpy as np
from datasets import load_dataset
import torchaudio
from torchaudio.transforms import MelSpectrogram
from huggingface_hub import login

# Log in to the Hugging Face Hub with the token exported as HF_TOKEN;
# when no token is present, tell the user where to configure it instead.
hf_token = os.getenv('HF_TOKEN')
if not hf_token:
    print("HF_TOKEN not found in environment. Please set it in Colab secrets.")
else:
    login(hf_token)

In [None]:
# Dataset configuration name -> short language code, for all 22 languages.
# Insertion order is the order in which the languages are processed.
languages = dict(
    assamese="as",
    bengali="bn",
    bodo="brx",
    dogri="doi",
    gujarati="gu",
    hindi="hi",
    kannada="kn",
    konkani="kok",
    kashmiri="ks",
    maithili="mai",
    malayalam="ml",
    manipuri="mni",
    marathi="mr",
    nepali="ne",
    odia="or",
    punjabi="pa",
    sanskrit="sa",
    santali="sat",
    sindhi="sd",
    tamil="ta",
    telugu="te",
    urdu="ur",
)

In [None]:
from collections import Counter, defaultdict
import pandas as pd
from datasets import load_dataset

# ==================== CONFIG ====================
# Per-language sample budget and the cap on clips taken from one speaker.
SAMPLES_PER_LANG = 64
MAX_PER_SPEAKER = 3

# Demographic grid: 4 age groups × 2 genders × 8 samples = 64.
AGE_GROUPS = ["18-30", "30-45", "45-60", "60+"]
SAMPLES_PER_AGE_GENDER = 8

# Target number of samples per recording scenario (8 + 46 + 10 = 64).
SCENARIO_TARGET = dict(read=8, extempore=46, conversation=10)

# Slight overfill tolerated by the fallback pass:
# per scarce scenario, and per age-gender cell respectively.
FALLBACK_EXTRA_SCENARIO = 2
FALLBACK_EXTRA_AGE = 2
# ===============================================

# Build the calibration set: for every language, stream the "valid" split,
# collect metadata for usable clips, pick up to SAMPLES_PER_LANG of them under
# speaker / gender / age / scenario quotas, and finally write all selected rows
# to a single parquet file.


def _diversity_score(row, counters):
    """Return how much *row* helps diversity given the current selection.

    The score is evaluated against the live counters, so it changes as rows
    are accepted. Higher is better; the weights rank the quotas by importance
    (speaker > gender = age-gender > scenario > new district).
    """
    gender = row["gender"]
    scenario = row["scenario"]
    score = 0
    if counters["speaker"][row["speaker_id"]] < MAX_PER_SPEAKER:
        score += 100
    if counters["gender"][gender] < SAMPLES_PER_LANG // 2:
        score += 50
    if counters["age_gender"][row["age_group"]][gender] < SAMPLES_PER_AGE_GENDER:
        score += 50
    if counters["scenario"][scenario] < SCENARIO_TARGET[scenario]:
        score += 30
    if row["district"] not in counters["district"]:
        score += 20
    return score


def _within_strict_caps(row, counters):
    """Return True if accepting *row* keeps every primary quota intact."""
    gender = row["gender"]
    scenario = row["scenario"]
    return (
        counters["speaker"][row["speaker_id"]] < MAX_PER_SPEAKER
        and counters["gender"][gender] < SAMPLES_PER_LANG // 2
        and counters["age_gender"][row["age_group"]][gender] < SAMPLES_PER_AGE_GENDER
        and counters["scenario"][scenario] < SCENARIO_TARGET[scenario]
    )


def _within_relaxed_caps(row, counters):
    """Return True if accepting *row* stays inside the fallback (relaxed) quotas."""
    gender = row["gender"]
    age = row["age_group"]
    scenario = row["scenario"]
    if counters["speaker"][row["speaker_id"]] >= MAX_PER_SPEAKER + 1:
        return False
    if counters["gender"][gender] >= (SAMPLES_PER_LANG // 2) + 4:
        return False
    # The "60+" group is exempt from the age-gender cap in the fallback pass.
    if age != "60+" and counters["age_gender"][age][gender] >= SAMPLES_PER_AGE_GENDER + FALLBACK_EXTRA_AGE:
        return False
    extra = FALLBACK_EXTRA_SCENARIO if scenario in ("read", "conversation") else 10
    return counters["scenario"][scenario] < SCENARIO_TARGET[scenario] + extra


def _greedy_fill(candidates, selected, counters, seen_paths, within_caps):
    """Append rows to *selected* until it is full or no candidate qualifies.

    Each round re-scores the still-unused candidates that satisfy
    *within_caps* and accepts the best one. Ties go to the earliest candidate
    in stream order. *selected*, *counters* and *seen_paths* are updated in
    place.
    """
    while len(selected) < SAMPLES_PER_LANG:
        eligible = [
            row for row in candidates
            if row["audio_filepath"] not in seen_paths and within_caps(row, counters)
        ]
        if not eligible:
            break
        best = max(eligible, key=lambda row: _diversity_score(row, counters))

        selected.append(best)
        seen_paths.add(best["audio_filepath"])
        counters["speaker"][best["speaker_id"]] += 1
        counters["gender"][best["gender"]] += 1
        counters["age_gender"][best["age_group"]][best["gender"]] += 1
        counters["scenario"][best["scenario"]] += 1
        counters["district"].add(best["district"])


calibration_rows = []
seen_paths = set()  # global dedupe across languages

for lang_name, lang_code in languages.items():
    print(f"\nProcessing {lang_name} ({lang_code}) ...")

    ds = load_dataset("ai4bharat/IndicVoices", lang_name, split="valid", streaming=True)
    ds = ds.filter(lambda x: 3.0 <= x["duration"] <= 15.0)

    # Per-language selection state; reset for every language.
    counters = {
        "speaker": defaultdict(int),
        "gender": Counter(),
        "age_gender": defaultdict(Counter),
        "scenario": Counter(),
        "district": set()
    }
    candidates = []

    # Pass 1: collect metadata for every usable clip, in stream order.
    for example in ds:
        # NOTE(review): this assumes the audio column is exposed as a dict with
        # a "path" key; confirm this holds for the installed `datasets` version,
        # otherwise every example is skipped here.
        audio_dict = example.get("audio_filepath")
        if not isinstance(audio_dict, dict) or not audio_dict.get("path"):
            continue
        audio_path = audio_dict["path"]

        # Skip if already used in a previous language (defensive)
        if audio_path in seen_paths:
            continue

        speaker = example.get("speaker_id") or "unknown"
        gender = str(example.get("gender", "")).strip().lower()
        if gender not in ("male", "female"):
            continue

        age = example.get("age_group")
        if age not in AGE_GROUPS:
            continue

        district = example.get("district") or "unknown"
        # Missing or unrecognised scenarios are counted as "extempore".
        scenario = (example.get("scenario") or "extempore").strip().lower()
        if scenario not in SCENARIO_TARGET:
            scenario = "extempore"

        candidates.append({
            "audio_filepath": audio_path,
            "duration": float(example["duration"]),
            "lang": lang_code,
            "speaker_id": speaker,
            "gender": gender,
            "age_group": age,
            "district": district,
            "scenario": scenario
        })

    # Primary selection (strict caps). Scores are evaluated at selection time,
    # so the new-district bonus actually steers which clip is picked next.
    selected = []
    _greedy_fill(candidates, selected, counters, seen_paths, _within_strict_caps)

    # Fallback pass — relaxed quotas
    if len(selected) < SAMPLES_PER_LANG:
        remaining = SAMPLES_PER_LANG - len(selected)
        print(f"  Fallback pass: adding {remaining} samples (relaxed quotas) for {lang_name}")
        _greedy_fill(candidates, selected, counters, seen_paths, _within_relaxed_caps)

    if len(selected) < SAMPLES_PER_LANG:
        print(f"  Warning: Only got {len(selected)}/{SAMPLES_PER_LANG} for {lang_name}")

    calibration_rows.extend(selected)

    # Stats — computed from the rows directly so that a language with zero
    # selected samples reports empty stats instead of raising KeyError.
    age_counts = dict(Counter(row["age_group"] for row in selected).most_common())
    speaker_count = len({row["speaker_id"] for row in selected})
    print(f"  Final → {len(selected)} samples")
    print(f"    Gender: {dict(counters['gender'])}")
    print(f"    Age   : {age_counts}")
    print(f"    Scenario: {dict(counters['scenario'])}")
    print(f"    Districts: {len(counters['district'])} | Speakers: {speaker_count}")

# Save
expected_total = SAMPLES_PER_LANG * len(languages)
final_df = pd.DataFrame(calibration_rows)
final_df.to_parquet("indicvoices_calibration_1408.parquet", compression="snappy", index=False)
print(f"\nDone! Saved {len(final_df)} samples (expected {expected_total}) → indicvoices_calibration_1408.parquet")