## Zero-Shot Audio Classification

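Zero-shot audio classification assigns an audio clip to one of a set of candidate text labels that you choose at inference time, using a pretrained model that was not trained on those specific labels.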

In [2]:
#install the needed libraries
!pip install datasets
!pip install soundfile
!pip install librosa



In [3]:
# load_dataset fetches datasets (here, an audio dataset) from the Hugging Face Hub
from datasets import load_dataset


In [4]:
# ESC-50 is a collection of different sounds, each about 5 seconds long.
# The split expression keeps only the first 10 rows of the train split.
dataset = load_dataset("ashraq/esc50", split="train[0:10]")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/345 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading metadata:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/387M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/387M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [5]:
# Take the first record of the dataset as the sample to classify.
audio_sample = dataset[0]

In [6]:
# Display the full record: metadata fields plus the 'audio' dict (array and sampling rate).
audio_sample

{'filename': '1-100032-A-0.wav',
 'fold': 1,
 'target': 0,
 'category': 'dog',
 'esc10': True,
 'src_file': 100032,
 'take': 'A',
 'audio': {'path': None,
  'array': array([0., 0., 0., ..., 0., 0., 0.]),
  'sampling_rate': 44100}}

In [7]:
from IPython.display import Audio as IPythonAudio

# Render an inline player for the clip, using the sampling rate stored with the sample.
IPythonAudio(
    data=audio_sample['audio']['array'],
    rate=audio_sample['audio']['sampling_rate'],
)


In [8]:
from transformers import pipeline

# Zero-shot audio classification needs a pretrained CLAP model;
# "laion/clap-htsat-unfused" is available on the Hugging Face Hub.
zero_shot_classifier = pipeline(
    task='zero-shot-audio-classification',
    model="laion/clap-htsat-unfused",
)

config.json:   0%|          | 0.00/5.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/615M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

In [9]:
# Sampling means measuring the value of a continuous signal at fixed time steps.
# The sampling rate is the number of samples gathered in one second.
# For example, 16,000 Hz is typical for human speech recordings; 192,000 Hz is used for high-resolution audio.

# Get the sampling rate that the model's feature extractor expects.
zero_shot_classifier.feature_extractor.sampling_rate

48000

In [10]:
# Also check the sampling rate of our audio sample, to compare it with the model's.
audio_sample['audio']['sampling_rate']

44100

In [11]:
# Cast the whole dataset's 'audio' column to the model's sampling rate
# using the datasets library, so samples read afterwards match the model.

from datasets import Audio

# Read the target rate from the model's feature extractor instead of hard-coding it,
# so the cast stays correct if a different checkpoint is used.
model_sampling_rate = zero_shot_classifier.feature_extractor.sampling_rate
dataset = dataset.cast_column('audio', Audio(sampling_rate=model_sampling_rate))

In [12]:
# Re-read the first sample so it reflects the cast 'audio' column.
audio_sample = dataset[0]

# The sample now has the same sampling rate as the model; make sure the dataset is prepared this way.
audio_sample

{'filename': '1-100032-A-0.wav',
 'fold': 1,
 'target': 0,
 'category': 'dog',
 'esc10': True,
 'src_file': 100032,
 'take': 'A',
 'audio': {'path': None,
  'array': array([0., 0., 0., ..., 0., 0., 0.]),
  'sampling_rate': 48000}}

In [13]:
# Candidate text labels that the audio sample will be compared against.
candidate_labels = [
    "Sound of a dog",
    "Sound of vacuum cleaner",
]



In [14]:
# Score the audio sample against each candidate label; the result lists the
# labels with their scores, showing which label is most similar to the clip.
# NOTE(review): the pipeline presumably wraps each label in a hypothesis template
# before scoring, so the "Sound of ..." prefix may be redundant — confirm against the transformers docs.
zero_shot_classifier(
    audio_sample['audio']['array'],
    candidate_labels=candidate_labels,
)

[{'score': 0.9985589385032654, 'label': 'Sound of a dog'},
 {'score': 0.0014411048032343388, 'label': 'Sound of vacuum cleaner'}]

In [15]:
# A second set of candidate labels to score the same sample against.
candidate_labels = [
    "Sound of a child crying",
    "Sound of vacuum cleaner",
    "Sound of a bird singing",
    "Sound of an airplane",
]

In [16]:
# Score the same sample against the current candidate labels.
# The sample's own category ('dog') is not among these candidates, so the
# top-scoring label is only the best of the options offered, not a correct answer.
zero_shot_classifier(
    audio_sample['audio']['array'],
    candidate_labels=candidate_labels,
)

[{'score': 0.6172533631324768, 'label': 'Sound of a bird singing'},
 {'score': 0.21602550148963928, 'label': 'Sound of vacuum cleaner'},
 {'score': 0.12547214329242706, 'label': 'Sound of an airplane'},
 {'score': 0.04124903678894043, 'label': 'Sound of a child crying'}]