# Read Parquet Files

This notebook reads and explores the three types of parquet files produced by the HindiBabyNet pipeline:

| File | Stage | Description |
|------|-------|-------------|
| `recordings.parquet` | Data Ingestion | Metadata for each raw WAV recording |
| `*_audio_manifest.parquet` | Audio Preparation | Segment mapping from source recordings to combined audio |
| `*_segments.parquet` | Speaker Classification | Per-segment speaker predictions |

In [1]:
from pathlib import Path
import pandas as pd

ARTIFACTS_DIR = Path("../artifacts/runs")
print(f"Artifacts root: {ARTIFACTS_DIR.resolve()}")
print(f"Available runs: {sorted(p.name for p in ARTIFACTS_DIR.iterdir())}")

Artifacts root: /itf-fi-ml/home/arunps/Projects/HindiBabyNet/artifacts/runs
Available runs: ['.gitkeep', '20260209_163255', '20260209_163812', '20260210_130337', '20260210_131458', '20260210_131852', '20260210_132648', '20260210_145617', '20260210_163159', '20260210_163202', '20260210_163227', '20260210_172217', '20260210_172220', '20260210_173908', '20260211_104943', '20260211_104949', '20260211_110614', '20260211_133515', '20260211_133518', '20260211_135201', '20260212_132214', '20260212_132222', '20260212_133925', '20260212_134445', '20260212_134452']


## 1. Select a run

Set `RUN_ID` to the timestamp folder you want to inspect, or leave as `"latest"` to auto-pick the most recent run.

In [2]:
RUN_ID = "latest"  # e.g. "20260212_134445" or "latest"

if RUN_ID == "latest":
    RUN_ID = sorted(p.name for p in ARTIFACTS_DIR.iterdir())[-1]

run_dir = ARTIFACTS_DIR / RUN_ID
print(f"Selected run: {run_dir}")
print(f"Stages: {[p.name for p in run_dir.iterdir()]}")

Selected run: ../artifacts/runs/20260212_134452
Stages: ['speaker_classification']


## 2. Helper: discover & load parquets

In [3]:
def find_parquets(root: Path) -> list[Path]:
    """Recursively find all .parquet files under `root`."""
    return sorted(root.rglob("*.parquet"))


def load_parquets(root: Path) -> dict[str, pd.DataFrame]:
    """Load every parquet under `root` into a {stem: DataFrame} dict."""
    return {p.stem: pd.read_parquet(p) for p in find_parquets(root)}


# Show what's available in the selected run
parquet_files = find_parquets(run_dir)
for p in parquet_files:
    print(p.relative_to(ARTIFACTS_DIR))

20260212_134452/speaker_classification/ZOIB270124_segments.parquet


## 3. Recordings (`data_ingestion/recordings.parquet`)

One row per raw WAV file with duration, sample-rate, format, and file-size metadata.

In [4]:
recordings_path = next(
    (p for p in parquet_files if p.name == "recordings.parquet"), None
)

if recordings_path:
    recordings = pd.read_parquet(recordings_path)
    print(f"Shape: {recordings.shape}")
    display(recordings.head())
    display(recordings.describe())
else:
    # Fall back: look across all runs for the latest recordings.parquet
    all_rec = sorted(ARTIFACTS_DIR.rglob("recordings.parquet"))
    if all_rec:
        recordings = pd.read_parquet(all_rec[-1])
        print(f"Loaded from: {all_rec[-1]}")
        print(f"Shape: {recordings.shape}")
        display(recordings.head())
        display(recordings.describe())
    else:
        print("No recordings.parquet found.")

Loaded from: ../artifacts/runs/20260212_134445/data_ingestion/recordings.parquet
Shape: (111, 11)


Unnamed: 0,participant_id,session_date,recording_id,path,duration_sec,sample_rate,channels,frames,subtype,format,size_bytes
0,ABAN141223,20250216,1739683525,/scratch/users/arunps/hindibabynet/audio_raw/R...,17940.02,16000,1,287040320,PCM_16,WAV,574081152
1,ABAN141223,20250216,1739701628,/scratch/users/arunps/hindibabynet/audio_raw/R...,7331.1,16000,1,117297600,PCM_16,WAV,234595712
2,ADGA210923,20250216,1739685429,/scratch/users/arunps/hindibabynet/audio_raw/R...,17940.02,16000,1,287040320,PCM_16,WAV,574081152
3,ADGA210923,20250216,1739703674,/scratch/users/arunps/hindibabynet/audio_raw/R...,5878.4,16000,1,94054400,PCM_16,WAV,188109312
4,AHKV290824,20250323,1742709081,/scratch/users/arunps/hindibabynet/audio_raw/R...,17940.02,16000,1,287040320,PCM_16,WAV,574081152


Unnamed: 0,duration_sec,sample_rate,channels,frames,size_bytes
count,111.0,111.0,111.0,111.0,111.0
mean,11211.920541,16000.0,1.0,179390700.0,358782000.0
std,6643.092994,0.0,0.0,106289500.0,212579000.0
min,1.04,16000.0,1.0,16640.0,33792.0
25%,6025.72,16000.0,1.0,96411520.0,192823600.0
50%,11543.34,16000.0,1.0,184693400.0,369387400.0
75%,17940.02,16000.0,1.0,287040300.0,574081200.0
max,17940.02,16000.0,1.0,287040300.0,574081200.0


## 4. Audio Manifest (`audio_preparation/*_audio_manifest.parquet`)

Maps source recordings to their combined/processed audio files.

In [5]:
manifest_files = sorted(ARTIFACTS_DIR.rglob("*_audio_manifest.parquet"))
print(f"Found {len(manifest_files)} audio manifest file(s)")

if manifest_files:
    manifests = pd.concat(
        [pd.read_parquet(f) for f in manifest_files],
        ignore_index=True,
    )
    print(f"Combined shape: {manifests.shape}")
    display(manifests.head())
    display(manifests.dtypes)

Found 229 audio manifest file(s)
Combined shape: (562, 11)


Unnamed: 0,source_index,source_path,source_recording_id,combined_start_sec,combined_end_sec,source_duration_sec,sample_rate,channels,participant_id,recording_id,combined_raw_path
0,0,/scratch/users/arunps/hindibabynet/audio_raw/R...,1739683525,0.0,17940.02,17940.02,16000,1,ABAN141223,ABAN141223,/scratch/users/arunps/hindibabynet/audio_proce...
1,1,/scratch/users/arunps/hindibabynet/audio_raw/R...,1739701628,17940.02,25271.12,7331.1,16000,1,ABAN141223,ABAN141223,/scratch/users/arunps/hindibabynet/audio_proce...
2,0,/scratch/users/arunps/hindibabynet/audio_raw/R...,1739685429,0.0,17940.02,17940.02,16000,1,ADGA210923,ADGA210923,/scratch/users/arunps/hindibabynet/audio_proce...
3,1,/scratch/users/arunps/hindibabynet/audio_raw/R...,1739703674,17940.02,23818.42,5878.4,16000,1,ADGA210923,ADGA210923,/scratch/users/arunps/hindibabynet/audio_proce...
4,0,/scratch/users/arunps/hindibabynet/audio_raw/R...,1739702694,0.0,7448.34,7448.34,16000,1,1739702694,1739702694,


source_index             int64
source_path             object
source_recording_id     object
combined_start_sec     float64
combined_end_sec       float64
source_duration_sec    float64
sample_rate              int64
channels                 int64
participant_id          object
recording_id            object
combined_raw_path       object
dtype: object

## 5. Segments (`speaker_classification/*_segments.parquet`)

Per-segment speaker classification results with probabilities for adult-male, adult-female, child, and background.

In [6]:
segment_files = sorted(ARTIFACTS_DIR.rglob("*_segments.parquet"))
print(f"Found {len(segment_files)} segment file(s)")

if segment_files:
    segments = pd.concat(
        [
            pd.read_parquet(f).assign(file=f.stem.replace("_segments", ""))
            for f in segment_files
        ],
        ignore_index=True,
    )
    print(f"Combined shape: {segments.shape}")
    display(segments.head())
    display(segments.describe())

Found 72 segment file(s)
Combined shape: (416694, 15)


Unnamed: 0,_merge_group,chunk_id,speaker_id_local,start_sec,end_sec,n_merged,duration_sec,n_windows,probs_adult_male,probs_adult_female,probs_child,probs_background,predicted_class,predicted_confidence,file
0,48,0,SPEAKER_02,0.132219,4.485969,3,4.35375,8,0.008513,0.142738,0.339449,0.5093,background,0.5093,ABAN141223
1,49,0,SPEAKER_02,5.498469,117.09,15,111.591531,223,0.006136,0.213114,0.24539,0.53536,background,0.53536,ABAN141223
2,1,0,SPEAKER_00,16.652844,16.973469,1,0.320625,1,0.000498,0.006875,0.136892,0.855735,background,0.855735,ABAN141223
3,181,0,SPEAKER_03,85.620969,86.295969,1,0.675,1,0.004964,0.608248,0.114383,0.272405,adult_female,0.608248,ABAN141223
4,182,0,SPEAKER_03,91.409094,92.084094,1,0.675,1,0.002387,0.997238,9.3e-05,0.000282,adult_female,0.997238,ABAN141223


Unnamed: 0,_merge_group,chunk_id,start_sec,end_sec,n_merged,duration_sec,n_windows,probs_adult_male,probs_adult_female,probs_child,probs_background,predicted_confidence
count,416694.0,416694.0,416694.0,416694.0,416694.0,416694.0,416694.0,416694.0,416694.0,416694.0,416694.0,416694.0
mean,3508.152311,16.54276,15168.411443,15170.704222,1.389262,2.292779,4.23788,0.044977,0.19715,0.2970522,0.460821,0.714164
std,2719.875311,13.426303,11950.674033,11950.690031,1.065656,3.421928,6.769273,0.143918,0.288765,0.2789528,0.308983,0.180303
min,1.0,0.0,0.030969,0.334719,1.0,0.200281,1.0,3e-06,4e-06,3.350371e-07,1e-06,0.259652
25%,1447.0,7.0,6552.651437,6554.655344,1.0,0.658125,1.0,0.001082,0.004475,0.05328809,0.185398,0.560518
50%,2943.0,14.0,13225.995656,13227.910969,1.0,1.2825,2.0,0.004455,0.03712,0.2156228,0.443388,0.708591
75%,4908.0,22.0,20457.206125,20459.231125,1.0,2.59875,5.0,0.016294,0.289702,0.4749274,0.72583,0.878773
max,15700.0,73.0,65362.290344,65363.16,55.0,144.264375,288.0,0.999972,0.999883,0.9998043,0.999885,0.999972


## 6. Quick Summary

In [7]:
if "segments" in dir():
    print("Speaker class distribution:")
    display(segments["predicted_class"].value_counts())

    print("\nMean confidence by class:")
    display(
        segments.groupby("predicted_class")["predicted_confidence"]
        .mean()
        .sort_values(ascending=False)
    )

    print(f"\nTotal segments duration: {segments['duration_sec'].sum():.1f}s")

Speaker class distribution:


predicted_class
background      208359
child           111541
adult_female     82498
adult_male       14296
Name: count, dtype: int64


Mean confidence by class:


predicted_class
adult_male      0.727879
background      0.725185
adult_female    0.718236
child           0.688808
Name: predicted_confidence, dtype: float64


Total segments duration: 955387.1s


## 7. Load a specific parquet by path

Use this cell to load any arbitrary parquet file.

In [None]:
# Set the path to any parquet file 
PARQUET_PATH = ""  # e.g. "../artifacts/runs/20260212_134452/speaker_classification/ZOIB270124_segments.parquet"

if PARQUET_PATH:
    df = pd.read_parquet(PARQUET_PATH)
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    display(df.head(10))
    display(df.dtypes)
else:
    print("Set PARQUET_PATH above to load a specific file.")