# Audio Classification

# Libraries

Balancing, torch, torchaudio, and transformers can be tricky! Here are the versions used for this notebook:

## Library and Versions

In [1]:
import torch, transformers, torchaudio
print("These are the versions used for this notebook, but watch the lecture for an important note on this")
print(torch.__version__)
print(torchaudio.__version__)
print(transformers.__version__)


These are the versions used for this notebook, but watch the lecture for an important note on this
2.3.0+cpu
2.3.0+cpu
4.44.2


In [2]:
from transformers import AutoFeatureExtractor, ASTForAudioClassification

In [3]:
feature_extractor = AutoFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

In [4]:
import librosa
audio_path = 'example.mp3'
y, sr = librosa.load(audio_path, sr=None)

## Sampling Rate Issues

Recall that most ML models are trained on 16 kHz sampling rate, you will run into issues if you try to force your own sampling rate:

In [5]:
# ERROR!

# You can leave the sampling_rate column if your sampling rate doesnt match 16000,
# the model will automatically upsample / downsample to 16000 Hz

# result = feature_extractor(y,sampling_rate=sr)

In [6]:
result = feature_extractor(y,return_tensors="pt")

It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


In [7]:
result

{'input_values': tensor([[[ 0.0770, -0.2676,  0.1092,  ..., -1.2776, -1.2776, -1.2776],
         [ 0.0846, -0.2771,  0.0997,  ..., -1.2776, -1.2776, -1.2776],
         [-0.2939, -0.3674,  0.0095,  ..., -1.2776, -1.2776, -1.2776],
         ...,
         [ 0.2184, -0.0845,  0.2923,  ..., -1.2776, -1.2776, -1.2776],
         [ 0.1963, -0.1293,  0.2475,  ..., -1.2776, -1.2776, -1.2776],
         [-0.0509, -0.4521, -0.0752,  ..., -1.2776, -1.2776, -1.2776]]])}

In [8]:
model = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

In [9]:
prediction_logits = model(result['input_values']).logits

In [10]:
prediction_logits

tensor([[ -5.9081,  -9.2253,  -9.4442,  -9.4990, -10.3686,  -9.8169, -10.5372,
          -9.9140, -11.0119, -11.1294, -10.8732, -11.3651, -12.3890, -11.9322,
         -10.1947, -10.5203, -10.9214, -11.4615, -11.2874, -10.9701, -11.3812,
         -11.1792, -10.5879, -10.7083,  -9.4158, -11.2141, -10.9959,  -8.1345,
          -8.3063, -12.1890,  -8.9141,  -8.1373,  -9.5586, -10.1567, -10.2660,
         -11.6801, -10.3482, -10.8475, -10.4001, -10.8535, -11.0698,  -9.5788,
         -12.0086, -10.2645, -10.1869, -10.8037, -10.6147, -10.8702, -11.1634,
         -11.6032, -12.3197, -11.4073, -10.7273, -10.6266, -10.7189, -10.2814,
         -12.3984, -10.6907, -11.4477, -10.8105, -12.4599, -10.8371, -11.6523,
         -11.1757,  -8.7801,  -9.1707, -11.2029, -10.2863, -11.8267,  -9.5817,
         -10.8891, -11.8293,  -7.8608,  -8.4752,  -9.2617, -10.5787, -10.6426,
         -10.5211,  -9.6369, -11.0325,  -9.8193,  -9.8407, -11.3109, -10.0103,
          -9.1413, -11.3183,  -9.6363, -10.5181, -10

In [11]:
predicted_class_ids = torch.argmax(prediction_logits, dim=-1).item()

In [12]:
predicted_class_ids

137

In [13]:
predicted_label = model.config.id2label[predicted_class_ids]

In [14]:
predicted_label

'Music'

In [15]:
# model.config.id2label

## Pipeline for Audio Classification

In [16]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("audio-classification", model="MIT/ast-finetuned-audioset-10-10-0.4593")

In [17]:
pipe.model

ASTForAudioClassification(
  (audio_spectrogram_transformer): ASTModel(
    (embeddings): ASTEmbeddings(
      (patch_embeddings): ASTPatchEmbeddings(
        (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ASTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ASTLayer(
          (attention): ASTSdpaAttention(
            (attention): ASTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ASTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ASTIntermediate(
       

In [18]:
pipe('example.mp3')

  waveform = torch.from_numpy(waveform).unsqueeze(0)


[{'score': 0.48486775159835815, 'label': 'Music'},
 {'score': 0.1913110315799713, 'label': 'Violin, fiddle'},
 {'score': 0.08519726246595383, 'label': 'Musical instrument'},
 {'score': 0.04692425578832626, 'label': 'Bowed string instrument'},
 {'score': 0.04536106064915657, 'label': 'Orchestra'}]

In [19]:
len(pipe.model.config.id2label)

527