# Unit 2. A gentle introduction to audio applications

## Audio classification pipeline

In [1]:
from datasets import load_dataset
from datasets import Audio

# Load the train split of the `en-AU` configuration of MINDS-14 and, in the
# same expression, re-cast the "audio" column so samples are decoded at 16 kHz.
minds = load_dataset(
    "PolyAI/minds14",
    name="en-AU",
    split="train",
).cast_column("audio", Audio(sampling_rate=16_000))

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import pipeline

# Build an audio-classification pipeline around a specific Hub checkpoint
# (its name suggests it was fine-tuned on MINDS-14 -- see the model card).
classifier = pipeline(
    task="audio-classification",
    model="anton-l/xtreme_s_xlsr_300m_minds14",
)

Downloading pytorch_model.bin: 100%|██████████| 1.26G/1.26G [04:28<00:00, 4.71MB/s]
Downloading (…)rocessor_config.json: 100%|██████████| 212/212 [00:00<00:00, 839kB/s]


In [4]:
# Take the first record of the dataset as a worked example and display it:
# it carries the decoded audio plus its transcription and label fields.
example = minds[0]
example

{'path': '/home/mpp/.cache/huggingface/datasets/downloads/extracted/9ab7eb46ae068511cd333afda67d68200178ff794ae6e73c1c1966ecb33eacac/en-AU~PAY_BILL/response_4.wav',
 'audio': {'path': '/home/mpp/.cache/huggingface/datasets/downloads/extracted/9ab7eb46ae068511cd333afda67d68200178ff794ae6e73c1c1966ecb33eacac/en-AU~PAY_BILL/response_4.wav',
  'array': array([2.36119668e-05, 1.92324660e-04, 2.19284790e-04, ...,
         9.40907281e-04, 1.16613181e-03, 7.20883254e-04]),
  'sampling_rate': 16000},
 'transcription': 'I would like to pay my electricity bill using my card can you please assist',
 'english_transcription': 'I would like to pay my electricity bill using my card can you please assist',
 'intent_class': 13,
 'lang_id': 2}

In [5]:
# Feed the raw waveform (a NumPy array) to the classifier; the result is a
# list of {score, label} candidates.
waveform = example["audio"]["array"]
classifier(waveform)

[{'score': 0.9625310301780701, 'label': 'pay_bill'},
 {'score': 0.028672782704234123, 'label': 'freeze'},
 {'score': 0.003349803853780031, 'label': 'card_issues'},
 {'score': 0.002005806425586343, 'label': 'abroad'},
 {'score': 0.0008484335266984999, 'label': 'high_value_payment'}]

In [6]:
# The dataset stores the intent as an integer id; the column's feature object
# exposes `int2str` to map that id back to its string label, so the ground
# truth can be compared with the classifier's prediction.
intent_feature = minds.features["intent_class"]
id2label = intent_feature.int2str
id2label(example["intent_class"])

'pay_bill'

## Automatic speech recognition with a pipeline

In [7]:
from transformers import pipeline

# Name the checkpoint explicitly instead of relying on the task default.
# This is the model the pipeline falls back to when none is given, but
# spelling it out avoids the "No model was supplied ... not recommended"
# warning and keeps the notebook reproducible if the library's default for
# this task ever changes.
asr = pipeline(
    "automatic-speech-recognition",
    model="facebook/wav2vec2-base-960h",
)

No model was supplied, defaulted to facebook/wav2vec2-base-960h and revision 55bb623 (https://huggingface.co/facebook/wav2vec2-base-960h).
Using a pipeline without specifying a model name and revision in production is not recommended.
Downloading (…)lve/main/config.json: 100%|██████████| 1.60k/1.60k [00:00<00:00, 2.07MB/s]
Downloading model.safetensors: 100%|██████████| 378M/378M [01:14<00:00, 5.04MB/s] 
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Downloading (…)okenizer_config.json: 100%|██████████| 163/163 [00:00<00:00, 431kB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 291/291 [00:00<00:00, 344kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 85.0/85.0 [00:00<00:00, 286kB/s]
Downloading (…)rocessor_config.json: 100%|██████████| 159

In [8]:
# Re-fetch the first record (the same `minds[0]` used in the classification
# section) so this section can be followed without scrolling back up.
example = minds[0]
example

{'path': '/home/mpp/.cache/huggingface/datasets/downloads/extracted/9ab7eb46ae068511cd333afda67d68200178ff794ae6e73c1c1966ecb33eacac/en-AU~PAY_BILL/response_4.wav',
 'audio': {'path': '/home/mpp/.cache/huggingface/datasets/downloads/extracted/9ab7eb46ae068511cd333afda67d68200178ff794ae6e73c1c1966ecb33eacac/en-AU~PAY_BILL/response_4.wav',
  'array': array([2.36119668e-05, 1.92324660e-04, 2.19284790e-04, ...,
         9.40907281e-04, 1.16613181e-03, 7.20883254e-04]),
  'sampling_rate': 16000},
 'transcription': 'I would like to pay my electricity bill using my card can you please assist',
 'english_transcription': 'I would like to pay my electricity bill using my card can you please assist',
 'intent_class': 13,
 'lang_id': 2}

In [9]:
# Transcribe the raw waveform; the pipeline returns a dict with a "text" key.
speech = example["audio"]["array"]
asr(speech)

{'text': 'I WOULD LIKE TO PAY MY ELECTRICITY BILL USING MY CAD CAN YOU PLEASE ASSIST'}

In [10]:
# Show the reference transcript stored with the sample, for a side-by-side
# comparison with the text the ASR pipeline produced.
example["english_transcription"]

'I would like to pay my electricity bill using my card can you please assist'