In [2]:
%pwd

'/itf-fi-ml/home/arunps/Projects/speaker-type-classifier/notebooks'

In [3]:
import os
# Move from the notebooks/ folder up to the project root exactly once per kernel.
# A bare relative chdir is not idempotent: re-running the cell would climb one
# more directory each time, so a sentinel in the kernel namespace guards it.
if "_CHDIR_TO_ROOT_DONE" not in globals():
    os.chdir("../")  # Navigate to project root
    _CHDIR_TO_ROOT_DONE = True

In [4]:
%pwd

'/itf-fi-ml/home/arunps/Projects/speaker-type-classifier'

### Paths + HF cache on scratch + v1 folders

In [5]:
from pathlib import Path
import os

# --- Where things live -------------------------------------------------------
SCRATCH = Path("/scratch/users/arunps")
PROJECT = "speaker-type-classifier"
DATASET_VERSION = "v1"

# Layout: <scratch>/<project>/data_hf/<stage>/<dataset version>
DATA_ROOT = SCRATCH / PROJECT / "data_hf"
RAW_DIR, EXPORT_DIR, MANIFEST_DIR, REPORTS_DIR = (
    DATA_ROOT / stage / DATASET_VERSION
    for stage in ("raw", "export", "manifest", "reports")
)

# Make sure every stage directory exists (safe to re-run).
for p in (RAW_DIR, EXPORT_DIR, MANIFEST_DIR, REPORTS_DIR):
    p.mkdir(parents=True, exist_ok=True)

# --- Hugging Face caches on scratch (avoid filling $HOME) ---------------------
hf_cache_root = SCRATCH / ".cache" / "huggingface"
os.environ["HF_HOME"] = str(hf_cache_root)
for env_name, subdir in (
    ("HF_DATASETS_CACHE", "datasets"),
    ("TRANSFORMERS_CACHE", "transformers"),
):
    os.environ[env_name] = str(hf_cache_root / subdir)

# --- One export folder per target class ---------------------------------------
CLASS_DIRS = {
    class_name: EXPORT_DIR / class_name
    for class_name in ("adult_male", "adult_female", "child", "background")
}
for d in CLASS_DIRS.values():
    d.mkdir(parents=True, exist_ok=True)

# Echo the resolved configuration so the run is self-describing.
print("DATASET_VERSION:", DATASET_VERSION)
print("RAW_DIR:", RAW_DIR)
print("EXPORT_DIR:", EXPORT_DIR)
print("MANIFEST_DIR:", MANIFEST_DIR)
print("HF_HOME:", os.environ["HF_HOME"])

DATASET_VERSION: v1
RAW_DIR: /scratch/users/arunps/speaker-type-classifier/data_hf/raw/v1
EXPORT_DIR: /scratch/users/arunps/speaker-type-classifier/data_hf/export/v1
MANIFEST_DIR: /scratch/users/arunps/speaker-type-classifier/data_hf/manifest/v1
HF_HOME: /scratch/users/arunps/.cache/huggingface


In [7]:
from dotenv import load_dotenv
import os
from huggingface_hub import login

# Load settings from a local .env file, if one is present, so HF_TOKEN can be
# supplied without hard-coding it in the notebook.
load_dotenv()

# Authenticate using whatever HF_TOKEN resolves to (None when it is unset).
login(token=os.environ.get("HF_TOKEN"))
print("Logged in to Hugging Face Hub")


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Logged in to Hugging Face Hub


### Create unified class folders for v1

In [8]:
from pathlib import Path

# Map each target class name to its folder under EXPORT_DIR.
class_dirs = {
    class_name: EXPORT_DIR / class_name
    for class_name in ("adult_male", "adult_female", "child", "background")
}

# Create the folders; exist_ok keeps the cell safe to re-run.
for d in class_dirs.values():
    d.mkdir(parents=True, exist_ok=True)

class_dirs

{'adult_male': PosixPath('/scratch/users/arunps/speaker-type-classifier/data_hf/export/v1/adult_male'),
 'adult_female': PosixPath('/scratch/users/arunps/speaker-type-classifier/data_hf/export/v1/adult_female'),
 'child': PosixPath('/scratch/users/arunps/speaker-type-classifier/data_hf/export/v1/child'),
 'background': PosixPath('/scratch/users/arunps/speaker-type-classifier/data_hf/export/v1/background')}

### Audio write helper: mono + 16kHz WAV

In [9]:
import numpy as np
import soundfile as sf
import librosa

TARGET_SR = 16000  # sample rate (Hz) used for every exported WAV


def to_mono(x: np.ndarray) -> np.ndarray:
    """Collapse a multi-channel signal to a single channel.

    A 1-D array is returned unchanged. Otherwise the second axis is treated as
    the channel axis: a single channel is sliced out as-is, several channels
    are averaged.
    """
    if x.ndim != 1:
        n_channels = x.shape[1]
        if n_channels <= 1:
            # Only one channel: drop the axis without touching the samples.
            return x[:, 0]
        # Several channels: average them sample by sample.
        return x.mean(axis=1)
    return x

def normalize_and_write_wav(out_path, audio_np, sr):
    """Write audio as a mono file sampled at TARGET_SR.

    Parameters
    ----------
    out_path : path-like
        Destination file; passed to ``soundfile.write`` as a string.
    audio_np : array-like
        Audio samples. 2-D input is down-mixed by ``to_mono``.
    sr : int
        Sample rate of ``audio_np``.

    Returns
    -------
    The ``out_path`` argument, unchanged.

    Notes
    -----
    When resampling is needed, integer PCM input is first converted to
    float32 because ``librosa.resample`` only accepts floating-point audio
    (it raises on integer arrays). Signed integers are scaled to [-1, 1);
    any other non-float dtype is cast to float32 without rescaling. Audio
    that is already at TARGET_SR is written exactly as received.
    """
    audio_np = to_mono(np.asarray(audio_np))

    if sr != TARGET_SR:
        dtype = audio_np.dtype
        if np.issubdtype(dtype, np.signedinteger):
            # e.g. int16 -> divide by 32768 so full scale maps onto [-1, 1)
            full_scale = float(np.iinfo(dtype).max) + 1.0
            audio_np = audio_np.astype(np.float32) / full_scale
        elif not np.issubdtype(dtype, np.floating):
            audio_np = audio_np.astype(np.float32)
        audio_np = librosa.resample(audio_np, orig_sr=sr, target_sr=TARGET_SR)

    sf.write(str(out_path), audio_np, TARGET_SR)
    return out_path


### Generic inspector helpers (configs, splits, columns)

In [19]:
from datasets import get_dataset_config_names, load_dataset_builder
from huggingface_hub import HfApi

def inspect_hf_dataset(repo_id: str, max_configs: int = 5):
    """Print configs, splits and feature columns of a Hugging Face dataset.

    Config names are fetched first; if that fails the error is printed and the
    default config is inspected instead. For at most ``max_configs`` configs
    the dataset builder is loaded and its split names and features are
    printed. A failure for one config is reported and does not stop the rest.
    """
    print("\n" + "=" * 90)
    print("DATASET:", repo_id)

    # Step 1: which configs does the repo expose?
    try:
        configs = get_dataset_config_names(repo_id)
    except Exception as err:
        print("Could not fetch config names:", repr(err))
        configs = []

    if not configs:
        print("Configs: (none / default)")
        selected = [None]  # None stands for "load with the default config"
    else:
        selected = configs[:max_configs]
        more_marker = "..." if len(configs) > max_configs else ""
        print(f"Configs ({len(configs)}):", selected, more_marker)

    # Step 2: splits + features for each selected config
    for cfg in selected:
        shown_name = cfg if cfg else "(default)"
        try:
            if cfg:
                builder = load_dataset_builder(repo_id, cfg)
            else:
                builder = load_dataset_builder(repo_id)

            info = builder.info
            split_info = info.splits
            split_names = list(split_info.keys()) if split_info else []
            print("\n--- Config:", shown_name)
            print("Splits:", split_names)

            features = info.features
            if features is not None:
                print("Columns/features:")
                for column, feature_type in features.items():
                    print(f"  - {column}: {feature_type}")
            else:
                print("Features: None")
        except Exception as err:
            print("\n--- Config:", shown_name)
            print("Builder inspection failed:", repr(err))


def inspect_repo_files(repo_id: str, repo_type: str = "dataset", max_files: int = 40):
    """Print the file listing of a Hub repo, capped at ``max_files`` entries.

    Handy for datasets that are plain files/shards (e.g. WebDataset tar
    shards). Prints the total file count, the first ``max_files`` paths, and
    a trailing " ..." line when the listing was cut short.
    """
    print("\n" + "=" * 90)
    print("REPO FILE LIST:", repo_id)

    listing = HfApi().list_repo_files(repo_id=repo_id, repo_type=repo_type)
    total = len(listing)
    print(f"Total files: {total}")

    for path in listing[:max_files]:
        print(" ", path)

    # Signal that more files exist than were shown.
    if total > max_files:
        print(" ...")


### Inspect Vaani (columns + splits + configs)


In [20]:
# Print splits and feature columns for up to 10 configs of the Vaani dataset.
inspect_hf_dataset("ARTPARK-IISc/Vaani", max_configs=10)



DATASET: ARTPARK-IISc/Vaani


Resolving data files:   0%|          | 0/56 [00:00<?, ?it/s]

Configs (170): ['AndhraPradesh_Anantpur', 'AndhraPradesh_Annamaya', 'AndhraPradesh_Chittoor', 'AndhraPradesh_Guntur', 'AndhraPradesh_Krishna', 'AndhraPradesh_Manyam', 'AndhraPradesh_SriSatyaSai', 'AndhraPradesh_Srikakulam', 'AndhraPradesh_Vishakapattanam', 'ArunachalPradesh_Longding'] ...


Resolving data files:   0%|          | 0/56 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/56 [00:00<?, ?it/s]


--- Config: AndhraPradesh_Anantpur
Splits: ['train']
Columns/features:
  - audio: Audio(sampling_rate=None, decode=True, num_channels=None, stream_index=None)
  - language: Value('string')
  - speakerID: Value('float64')
  - languagesKnown: Value('string')
  - gender: Value('string')
  - state: Value('string')
  - district: Value('string')
  - pincode: Value('int64')
  - stay(years): Value('string')
  - isTranscriptionAvailable: Value('string')
  - transcript: Value('string')
  - referenceImage: Value('string')


Resolving data files:   0%|          | 0/56 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]


--- Config: AndhraPradesh_Annamaya
Splits: ['train']
Columns/features:
  - audio: Audio(sampling_rate=None, decode=True, num_channels=None, stream_index=None)
  - language: Value('string')
  - speakerID: Value('float64')
  - languagesKnown: Value('string')
  - gender: Value('string')
  - state: Value('string')
  - district: Value('string')
  - pincode: Value('int64')
  - stay(years): Value('string')
  - isTranscriptionAvailable: Value('string')
  - transcript: Value('string')
  - referenceImage: Value('string')


Resolving data files:   0%|          | 0/56 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/64 [00:00<?, ?it/s]


--- Config: AndhraPradesh_Chittoor
Splits: ['train']
Columns/features:
  - audio: Audio(sampling_rate=None, decode=True, num_channels=None, stream_index=None)
  - language: Value('string')
  - speakerID: Value('float64')
  - languagesKnown: Value('string')
  - gender: Value('string')
  - state: Value('string')
  - district: Value('string')
  - pincode: Value('int64')
  - stay(years): Value('string')
  - isTranscriptionAvailable: Value('string')
  - transcript: Value('string')
  - referenceImage: Value('string')


Resolving data files:   0%|          | 0/56 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/50 [00:00<?, ?it/s]


--- Config: AndhraPradesh_Guntur
Splits: ['train']
Columns/features:
  - audio: Audio(sampling_rate=None, decode=True, num_channels=None, stream_index=None)
  - language: Value('string')
  - speakerID: Value('float64')
  - languagesKnown: Value('string')
  - gender: Value('string')
  - state: Value('string')
  - district: Value('string')
  - pincode: Value('int64')
  - stay(years): Value('string')
  - isTranscriptionAvailable: Value('string')
  - transcript: Value('string')
  - referenceImage: Value('string')


Resolving data files:   0%|          | 0/56 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/43 [00:00<?, ?it/s]


--- Config: AndhraPradesh_Krishna
Splits: ['train']
Columns/features:
  - audio: Audio(sampling_rate=None, decode=True, num_channels=None, stream_index=None)
  - language: Value('string')
  - speakerID: Value('float64')
  - languagesKnown: Value('string')
  - gender: Value('string')
  - state: Value('string')
  - district: Value('string')
  - pincode: Value('int64')
  - stay(years): Value('string')
  - isTranscriptionAvailable: Value('string')
  - transcript: Value('string')
  - referenceImage: Value('string')


Resolving data files:   0%|          | 0/56 [00:00<?, ?it/s]


--- Config: AndhraPradesh_Manyam
Splits: ['train']
Columns/features:
  - audio: Audio(sampling_rate=None, decode=True, num_channels=None, stream_index=None)
  - language: Value('string')
  - speakerID: Value('float64')
  - languagesKnown: Value('string')
  - gender: Value('string')
  - state: Value('string')
  - district: Value('string')
  - pincode: Value('int64')
  - stay(years): Value('string')
  - isTranscriptionAvailable: Value('string')
  - transcript: Value('string')
  - referenceImage: Value('string')


Resolving data files:   0%|          | 0/56 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/51 [00:00<?, ?it/s]


--- Config: AndhraPradesh_SriSatyaSai
Splits: ['train']
Columns/features:
  - audio: Audio(sampling_rate=None, decode=True, num_channels=None, stream_index=None)
  - language: Value('string')
  - speakerID: Value('float64')
  - languagesKnown: Value('string')
  - gender: Value('string')
  - state: Value('string')
  - district: Value('string')
  - pincode: Value('int64')
  - stay(years): Value('string')
  - isTranscriptionAvailable: Value('string')
  - transcript: Value('string')
  - referenceImage: Value('string')


Resolving data files:   0%|          | 0/56 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/50 [00:00<?, ?it/s]


--- Config: AndhraPradesh_Srikakulam
Splits: ['train']
Columns/features:
  - audio: Audio(sampling_rate=None, decode=True, num_channels=None, stream_index=None)
  - language: Value('string')
  - speakerID: Value('float64')
  - languagesKnown: Value('string')
  - gender: Value('string')
  - state: Value('string')
  - district: Value('string')
  - pincode: Value('int64')
  - stay(years): Value('string')
  - isTranscriptionAvailable: Value('string')
  - transcript: Value('string')
  - referenceImage: Value('string')


Resolving data files:   0%|          | 0/56 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]


--- Config: AndhraPradesh_Vishakapattanam
Splits: ['train']
Columns/features:
  - audio: Audio(sampling_rate=None, decode=True, num_channels=None, stream_index=None)
  - language: Value('string')
  - speakerID: Value('float64')
  - languagesKnown: Value('string')
  - gender: Value('string')
  - state: Value('string')
  - district: Value('string')
  - pincode: Value('int64')
  - stay(years): Value('string')
  - isTranscriptionAvailable: Value('string')
  - transcript: Value('string')
  - referenceImage: Value('string')


Resolving data files:   0%|          | 0/56 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/49 [00:00<?, ?it/s]


--- Config: ArunachalPradesh_Longding
Splits: ['train']
Columns/features:
  - audio: Audio(sampling_rate=None, decode=True, num_channels=None, stream_index=None)
  - language: Value('string')
  - speakerID: Value('float64')
  - languagesKnown: Value('string')
  - gender: Value('string')
  - state: Value('string')
  - district: Value('string')
  - pincode: Value('int64')
  - stay(years): Value('string')
  - isTranscriptionAvailable: Value('string')
  - transcript: Value('string')
  - referenceImage: Value('string')


### Inspect ChildMandarin (columns + splits)

In [21]:
# Print splits and feature columns for up to 10 configs of ChildMandarin.
inspect_hf_dataset("BAAI/ChildMandarin", max_configs=10)



DATASET: BAAI/ChildMandarin
Configs (1): ['default'] 

--- Config: default
Splits: ['train', 'validation', 'test']
Columns/features:
  - json: {'accent': Value('string'), 'age': Value('int64'), 'device': Value('string'), 'gender': Value('string'), 'id': Value('string'), 'location': Value('string'), 'speaker_id': Value('string'), 'text': Value('string')}
  - wav: Audio(sampling_rate=None, decode=True, num_channels=None, stream_index=None)
  - __key__: Value('string')
  - __url__: Value('string')


### Inspect AudioSet WDS (file-based repo + splits)

In [22]:
# List up to 80 files of the AudioSet WDS repo to see how its shards are laid out.
inspect_repo_files("confit/audioset-16khz-wds", repo_type="dataset", max_files=80)



REPO FILE LIST: confit/audioset-16khz-wds
Total files: 735
  .gitattributes
  20k/test/shard-00000.tar
  20k/test/shard-00001.tar
  20k/test/shard-00002.tar
  20k/test/shard-00003.tar
  20k/test/shard-00004.tar
  20k/test/shard-00005.tar
  20k/train/shard-00000.tar
  20k/train/shard-00001.tar
  20k/train/shard-00002.tar
  20k/train/shard-00003.tar
  20k/train/shard-00004.tar
  20k/train/shard-00005.tar
  20k/train/shard-00006.tar
  2m/test/shard-00000.tar
  2m/test/shard-00001.tar
  2m/test/shard-00002.tar
  2m/test/shard-00003.tar
  2m/test/shard-00004.tar
  2m/test/shard-00005.tar
  2m/train/shard-00000.tar
  2m/train/shard-00001.tar
  2m/train/shard-00002.tar
  2m/train/shard-00003.tar
  2m/train/shard-00004.tar
  2m/train/shard-00005.tar
  2m/train/shard-00006.tar
  2m/train/shard-00007.tar
  2m/train/shard-00008.tar
  2m/train/shard-00009.tar
  2m/train/shard-00010.tar
  2m/train/shard-00011.tar
  2m/train/shard-00012.tar
  2m/train/shard-00013.tar
  2m/train/shard-00014.tar
  2m

In [23]:
from huggingface_hub import snapshot_download
from pathlib import Path

# Keep the download under the versioned raw-data directory defined in the
# paths cell rather than repeating the absolute path, so it stays in sync
# with SCRATCH / PROJECT / DATASET_VERSION.
AUD_ROOT = RAW_DIR / "audioset_wds_one"

# The single shard pulled for inspection (path relative to the repo root).
AUD_SHARD_RELPATH = "20k/train/shard-00000.tar"

snapshot_download(
    repo_id="confit/audioset-16khz-wds",
    repo_type="dataset",
    local_dir=str(AUD_ROOT),
    allow_patterns=[
        AUD_SHARD_RELPATH,
        "README.md",
        "dataset_infos.json",
    ],
)

# allow_patterns that match nothing download nothing without raising, so
# confirm the shard really arrived before later cells try to open it.
aud_shard_file = AUD_ROOT / AUD_SHARD_RELPATH
if not aud_shard_file.is_file():
    raise FileNotFoundError(f"Expected shard was not downloaded: {aud_shard_file}")

print("Saved to:", AUD_ROOT)
print("Shard:", aud_shard_file)


Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Saved to: /scratch/users/arunps/speaker-type-classifier/data_hf/raw/v1/audioset_wds_one
Shard: /scratch/users/arunps/speaker-type-classifier/data_hf/raw/v1/audioset_wds_one/20k/train/shard-00000.tar


In [31]:
import re

# Label fragments that mark a clip as containing human voice (speech, singing,
# laughter, crying, babies/children). Each stem also covers its common
# inflections, because the labels are matched as whole words: e.g. "Babbling"
# or "Children playing" would slip past the bare stems "babble" / "child".
EXCLUDE_PATTERNS = [
    r"\bspeech\b",
    r"\btalking\b",
    r"\bconversations?\b",
    r"\bwhisper(ing)?\b",
    r"\bvocal(ization|ise|ize|)\b",
    r"\bvoices?\b",
    r"\bsinging\b",
    r"\bchoir\b",
    r"\blaugh(ter|ing)?\b",
    r"\bgiggl(e|ing)\b",
    r"\bcry(ing)?\b",
    r"\bbab(y|ies)\b",
    r"\binfants?\b",
    r"\bchild(ren)?\b",
    r"\bkids?\b",
    r"\btoddlers?\b",
    r"\bbabbl(e|ing)\b",
]

# One case-insensitive alternation over all patterns.
exclude_re = re.compile("|".join(EXCLUDE_PATTERNS), re.IGNORECASE)

def is_background_label_list(label_list):
    """Return True when none of the labels matches an exclusion pattern.

    Parameters
    ----------
    label_list : iterable of labels, a single label string, or None
        Labels are converted with ``str`` before matching. A bare string is
        treated as one label (iterating it would test single characters and
        never match). ``None`` is treated like an empty list.

    Returns
    -------
    bool
        True if no label matches ``exclude_re``; an empty label list
        therefore also returns True.
    """
    if label_list is None:
        label_list = []
    elif isinstance(label_list, str):
        label_list = [label_list]

    text = " | ".join(str(label) for label in label_list)
    return exclude_re.search(text) is None


In [32]:
import webdataset as wds
import json
from itertools import islice

# Open the one downloaded shard and apply webdataset's default decoding.
shard_path = str(AUD_ROOT / "20k/train/shard-00000.tar")
ds = wds.WebDataset(shard_path).decode()

# Running tallies over the inspected samples.
bg = 0
non_bg = 0

# Look at the first 50 samples only; this is an inspection pass.
for i, sample in enumerate(islice(ds, 50)):
    print(f"\n--- sample {i} keys:", list(sample.keys()))

    # The metadata may already be a dict, or still be JSON text / bytes.
    raw = sample["json"]
    if isinstance(raw, (bytes, bytearray)):
        raw = raw.decode("utf-8", errors="replace")
    meta = raw if isinstance(raw, dict) else json.loads(raw)

    print("json keys:", list(meta.keys()))

    # Accept either key name and normalise a single label to a list.
    labels = meta.get("label") or meta.get("labels") or []
    if isinstance(labels, str):
        labels = [labels]

    if not is_background_label_list(labels):
        tag = "EXCLUDED"
        non_bg += 1
    else:
        tag = "BACKGROUND"
        bg += 1

    print(f"{tag} → label (first 10):", labels[:10], "... len =", len(labels))

print("\nSummary")
print("Background candidates:", bg)
print("Excluded (speech/child/etc):", non_bg)



--- sample 0 keys: ['__key__', '__url__', 'wav', '__local_path__', 'json']
json keys: ['id', 'label', 'label_id']
BACKGROUND → label (first 10): ['Clarinet'] ... len = 1

--- sample 1 keys: ['__key__', '__url__', 'wav', '__local_path__', 'json']
json keys: ['id', 'label', 'label_id']
BACKGROUND → label (first 10): ['Tabla', 'Folk music', 'Music', 'Classical music', 'Flute'] ... len = 5

--- sample 2 keys: ['__key__', '__url__', 'wav', '__local_path__', 'json']
json keys: ['id', 'label', 'label_id']
EXCLUDED → label (first 10): ['Singing', 'Music', 'Salsa music'] ... len = 3

--- sample 3 keys: ['__key__', '__url__', 'wav', '__local_path__', 'json']
json keys: ['id', 'label', 'label_id']
BACKGROUND → label (first 10): ['Telephone'] ... len = 1

--- sample 4 keys: ['__key__', '__url__', 'wav', '__local_path__', 'json']
json keys: ['id', 'label', 'label_id']
BACKGROUND → label (first 10): ['Brass instrument', 'Trumpet'] ... len = 2

--- sample 5 keys: ['__key__', '__url__', 'wav', '__loc