In [None]:
import time

import matplotlib.pyplot as plt
from datasets import Audio, load_dataset
from IPython.display import Audio as IPythonAudio
from IPython.display import display


def play_sample(sample: dict) -> None:
    """Play the audio of a sample inline in the notebook.

    Args:
        sample:
            A dataset row whose "audio" entry holds the waveform under "array"
            and, when available, its sampling rate under "sampling_rate".
    """
    audio = sample["audio"]
    # Play back at the rate stored with the waveform rather than assuming 16 kHz,
    # so playback speed/pitch stays correct for audio decoded at another rate.
    try:
        rate = audio["sampling_rate"]
    except KeyError:
        rate = 16_000
    display(IPythonAudio(audio["array"], rate=rate))


# Apply matplotlib's "ggplot" style sheet to the plots produced in this notebook
plt.style.use("ggplot")

In [None]:
# Loading from the Hub can fail transiently, so retry -- but only a bounded number
# of times and with a growing pause, so a permanent failure (bad dataset name,
# missing credentials, no network) surfaces as an error instead of looping forever.
MAX_LOAD_ATTEMPTS = 10
for attempt in range(1, MAX_LOAD_ATTEMPTS + 1):
    try:
        coral = load_dataset("alexandrainst/coral", name="read_aloud", split="train")
        break
    except Exception as e:
        if attempt == MAX_LOAD_ATTEMPTS:
            # Out of attempts: re-raise the last error with its original traceback
            raise
        print(f"Encountered error: {e}. Retrying...")
        # Exponential backoff, capped at 30 seconds between attempts
        time.sleep(min(2**attempt, 30))

# Resample the audio column to 16 kHz when samples are accessed
coral = coral.cast_column("audio", Audio(sampling_rate=16_000))

In [None]:
# Look at the first sample of the loaded split to see which fields a row contains

coral[0]

In [None]:
# Plot the WER distribution of approved vs rejected samples

approved_samples = coral.filter(lambda example: example["validated"] == "approved")
rejected_samples = coral.filter(lambda example: example["validated"] == "rejected")

fig, ax = plt.subplots()
for label, samples in (("approved", approved_samples), ("rejected", rejected_samples)):
    # `range=(0, 1)` gives both histograms the same 50 bin edges over the plotted
    # window, so the bars are directly comparable. Without it each histogram bins
    # over its own min/max, and a few samples with WER > 1 would stretch the bins
    # and leave only a handful inside the visible [0, 1] interval.
    # Note: values outside [0, 1] are excluded, so each density integrates to 1
    # over the visible range.
    ax.hist(
        samples["asr_wer"],
        bins=50,
        range=(0, 1),
        label=label,
        alpha=0.5,
        density=True,
    )
ax.set_xlim(0, 1)
ax.set_xlabel("ASR word error rate")
ax.set_ylabel("Density")
ax.set_title("WER distribution of approved vs rejected samples")
ax.legend()
fig.savefig("wer-distribution-approved-rejected.png", dpi=200)
plt.show()

In [None]:
# Listen to the 100 samples with the highest WER, printing each one's WER and text

ranked_by_wer = coral.sort("asr_wer", reverse=True)
worst_samples = ranked_by_wer.select(range(100))
for sample in worst_samples:
    sample_wer = sample["asr_wer"]
    print(f"WER: {sample_wer:.0%}")
    sample_text = sample["text"]
    print(f"Text: {sample_text!r}")
    play_sample(sample)
    print()