In [6]:
from datasets import load_dataset
from datasets import Audio
from transformers import pipeline

In [7]:
minds = load_dataset("PolyAI/minds14", name="en-AU", split="train")
minds = minds.cast_column("audio", Audio(sampling_rate=16_000))

## Audio Classification

In [8]:
classifier = pipeline(
    "audio-classification",
    model="anton-l/xtreme_s_xlsr_300m_minds14",
)

Device set to use cpu


In [9]:
classifier

<transformers.pipelines.audio_classification.AudioClassificationPipeline at 0x1dbf63382d0>

In [10]:
example = minds[0]

In [11]:
classifier(example["audio"]["array"])

[{'score': 0.9625311493873596, 'label': 'pay_bill'},
 {'score': 0.02867273800075054, 'label': 'freeze'},
 {'score': 0.003349794540554285, 'label': 'card_issues'},
 {'score': 0.002005802933126688, 'label': 'abroad'},
 {'score': 0.0008484324789606035, 'label': 'high_value_payment'},
 {'score': 0.000736794900149107, 'label': 'direct_debit'},
 {'score': 0.0004056991310790181, 'label': 'latest_transactions'},
 {'score': 0.0003397076216060668, 'label': 'joint_account'},
 {'score': 0.00033127880305983126, 'label': 'address'},
 {'score': 0.0003288650477770716, 'label': 'balance'},
 {'score': 0.00014877492503728718, 'label': 'app_error'},
 {'score': 0.00014772488793823868, 'label': 'atm_limit'},
 {'score': 8.815681940177456e-05, 'label': 'cash_deposit'},
 {'score': 6.512475374620408e-05, 'label': 'business_loan'}]

In [12]:
id2label = minds.features["intent_class"].int2str
id2label(example["intent_class"])

'pay_bill'

## Speech Recognition (STT)

Automatic speech recognition: transform audio clips into text by transcribing them automatically. You can get a text representation of a recording of someone speaking, like “How are you doing today?“. Rather useful for note taking!

In [13]:
asr = pipeline("automatic-speech-recognition")

No model was supplied, defaulted to facebook/wav2vec2-base-960h and revision 22aad52 (https://huggingface.co/facebook/wav2vec2-base-960h).
Using a pipeline without specifying a model name and revision in production is not recommended.
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


In [14]:
asr(example["audio"]["array"])

{'text': 'I WOULD LIKE TO PAY MY ELECTRICITY BILL USING MY CAD CAN YOU PLEASE ASSIST'}

In [15]:
example["english_transcription"]

'I would like to pay my electricity bill using my card can you please assist'

In [16]:
import transformers
print(transformers.__version__)

4.51.3


## Text to Speech

In [17]:
pipe = pipeline("text-to-speech", model="suno/bark-small")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cpu


In [18]:
from IPython.display import Audio

In [None]:
text = "Ladybugs have had important roles in culture and religion, being associated with luck, love, fertility and prophecy. "
output = pipe(text)
Audio(output["audio"], rate=output["sampling_rate"])

## Music Generation

In [None]:
music_pipe = pipeline("text-to-audio", model="facebook/musicgen-small")

In [None]:
text = "90s rock song with electric guitar and heavy drums"

In [None]:
forward_params = {"max_new_tokens": 512}

output = music_pipe(text, forward_params=forward_params)
Audio(output["audio"][0], rate=output["sampling_rate"])