Reference

- [AudioSet](https://research.google.com/audioset/)

More model ideas

- Other (smaller) versions of AST

- https://huggingface.co/topel/ConvNeXt-Tiny-AT

- https://huggingface.co/search/full-text?q=audioset&p=1&type=model

- https://paperswithcode.com/paper/efficient-large-scale-audio-tagging-via

- https://paperswithcode.com/paper/dynamic-convolutional-neural-networks-as

- https://paperswithcode.com/paper/panns-large-scale-pretrained-audio-neural-1

# Imports, installs, etc.

In [1]:
!pip install -qq transformers

In [2]:
import requests
import sys
import time

import numpy as np

from tqdm.notebook import tqdm

import torch
import torchaudio

In [3]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(DEVICE)

cuda


In [4]:
sys.path.append('./drive/MyDrive/Projects/MiniSoundFinder_v2/model/library/')

import event_finder, preprocess

In [30]:
# from importlib import reload
# reload(event_finder)

<module 'event_finder' from '/content/./drive/MyDrive/Projects/MiniSoundFinder_v2/model/library/event_finder.py'>

# Samples

In [5]:
!cp ./drive/MyDrive/Projects/MiniSoundFinder_v2/samples/* .

In [6]:
sample_path = 'freesound_442485_dogs_barking_60sec.wav'
print(torchaudio.info(sample_path))

AudioMetaData(sample_rate=48000, num_frames=2847537, num_channels=2, bits_per_sample=24, encoding=PCM_S)


In [7]:
waveform, sampling_rate = torchaudio.load(sample_path)
waveform.shape

torch.Size([2, 2847537])

In [8]:
wf_prep = preprocess.convert_audio(waveform, sampling_rate)
wf_prep.shape

torch.Size([949179])

# Models

## AST

- [AST on HuggingFace](https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593)

- [AST Paper](https://arxiv.org/pdf/2104.01778.pdf)

In [9]:
from transformers import AutoFeatureExtractor, ASTForAudioClassification

In [10]:
extractor_ast = AutoFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
extractor_ast

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

ASTFeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "ASTFeatureExtractor",
  "feature_size": 1,
  "max_length": 1024,
  "mean": -4.2677393,
  "num_mel_bins": 128,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000,
  "std": 4.5689974
}

In [11]:
model_ast = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593").to(DEVICE)
model_ast

config.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

ASTForAudioClassification(
  (audio_spectrogram_transformer): ASTModel(
    (embeddings): ASTEmbeddings(
      (patch_embeddings): ASTPatchEmbeddings(
        (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ASTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ASTLayer(
          (attention): ASTSdpaAttention(
            (attention): ASTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ASTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ASTIntermediate(
       

In [12]:
target_sampling_rate = extractor_ast.sampling_rate
wf_prep = preprocess.convert_audio(waveform, sampling_rate, sampling_rate=target_sampling_rate)
sample_features = extractor_ast(wf_prep, target_sampling_rate, return_tensors="pt").to(DEVICE)
sample_features['input_values'].shape

torch.Size([1, 1024, 128])

In [13]:
with torch.no_grad():
    probs = torch.sigmoid(model_ast(**sample_features).logits)

  return F.conv2d(input, weight, bias, self.stride,


In [14]:
top_classes = torch.argsort(probs, dim=-1, descending=True).flatten()[:10]
top_labels = [(model_ast.config.id2label[id.item()], probs[0, id].item()) for id in top_classes]
top_labels

[('Dog', 0.7794262766838074),
 ('Animal', 0.7416852712631226),
 ('Domestic animals, pets', 0.6783854961395264),
 ('Bark', 0.6013591885566711),
 ('Bow-wow', 0.4377618432044983),
 ('Canidae, dogs, wolves', 0.1910315901041031),
 ('Yip', 0.1032460406422615),
 ('Whimper (dog)', 0.06323492527008057),
 ('Vehicle', 0.025787677615880966),
 ('Growling', 0.01947779208421707)]

In [18]:
def measure_inference_time_ast(model, feature_extractor,
                               sample_length_sec=60,
                               repeats=10,
                               chunk_length_sec=10):

    sampling_rate = feature_extractor.sampling_rate
    sample_length = sampling_rate * sample_length_sec

    extr_times = []
    inf_times = []
    for i in tqdm(range(repeats)):
        extr_start = time.time()
        wf = torch.distributions.uniform.Uniform(-10000, 10000).sample((sample_length,))
        chunks = event_finder.chunk_audio(wf, sampling_rate, chunk_length_sec)
        inp = feature_extractor(chunks, sampling_rate, return_tensors="pt").to(DEVICE)
        extr_times.append(time.time() - extr_start)

        inf_start = time.time()
        with torch.no_grad():
            probs = torch.sigmoid(model(**inp).logits)
        inf_times.append(time.time() - inf_start)

    print("Extraction:", np.mean(extr_times), "±", np.std(extr_times))
    print("Inference:", np.mean(inf_times), "±", np.std(inf_times))

print("1 minute")
measure_inference_time_ast(model_ast, extractor_ast, sample_length_sec=60)
print()

print("2 minutes")
measure_inference_time_ast(model_ast, extractor_ast, sample_length_sec=120)
print()

print("5 minutes")
measure_inference_time_ast(model_ast, extractor_ast, sample_length_sec=300)
print()

1 minute


  0%|          | 0/10 [00:00<?, ?it/s]

  return F.conv2d(input, weight, bias, self.stride,


Extraction: 0.3500690221786499 ± 0.08108287374318109
Inference: 0.06947240829467774 ± 0.1536756070772051

2 minutes


  0%|          | 0/10 [00:00<?, ?it/s]

Extraction: 0.8158706665039063 ± 0.1465797115506997
Inference: 0.008660721778869628 ± 0.004713759858395743

5 minutes


  0%|          | 0/10 [00:00<?, ?it/s]

Extraction: 1.7674006700515748 ± 0.5863253076747661
Inference: 0.21228258609771727 ± 0.6123435826036645



## AST Distilled

https://huggingface.co/bookbot/distil-ast-audioset

In [19]:
extractor_ast_distil = AutoFeatureExtractor.from_pretrained("bookbot/distil-ast-audioset")
extractor_ast_distil

preprocessor_config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

ASTFeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "ASTFeatureExtractor",
  "feature_size": 1,
  "max_length": 1024,
  "mean": -4.2677393,
  "num_mel_bins": 128,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000,
  "std": 4.5689974
}

In [20]:
model_ast_distil = ASTForAudioClassification.from_pretrained("bookbot/distil-ast-audioset").to(DEVICE)
model_ast_distil

config.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/176M [00:00<?, ?B/s]

ASTForAudioClassification(
  (audio_spectrogram_transformer): ASTModel(
    (embeddings): ASTEmbeddings(
      (patch_embeddings): ASTPatchEmbeddings(
        (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ASTEncoder(
      (layer): ModuleList(
        (0-5): 6 x ASTLayer(
          (attention): ASTSdpaAttention(
            (attention): ASTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ASTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ASTIntermediate(
         

In [21]:
target_sampling_rate = extractor_ast_distil.sampling_rate
wf_prep = preprocess.convert_audio(waveform, sampling_rate, sampling_rate=target_sampling_rate)
sample_features = extractor_ast_distil(wf_prep, target_sampling_rate, return_tensors="pt").to(DEVICE)

with torch.no_grad():
    probs = torch.sigmoid(model_ast_distil(**sample_features).logits)

top_classes = torch.argsort(probs, dim=-1, descending=True).flatten()[:10]
top_labels = [(model_ast.config.id2label[id.item()], probs[0, id].item()) for id in top_classes]
top_labels

[('Animal', 0.828461766242981),
 ('Dog', 0.7962841987609863),
 ('Domestic animals, pets', 0.7366089820861816),
 ('Bark', 0.5144554376602173),
 ('Bow-wow', 0.46760764718055725),
 ('Speech', 0.3647525906562805),
 ('Canidae, dogs, wolves', 0.18395493924617767),
 ('Yip', 0.1634262055158615),
 ('Whimper (dog)', 0.1528150737285614),
 ('Growling', 0.056862421333789825)]

In [22]:
print("1 minute")
measure_inference_time_ast(model_ast_distil, extractor_ast_distil, sample_length_sec=60)
print()

print("2 minutes")
measure_inference_time_ast(model_ast_distil, extractor_ast_distil, sample_length_sec=120)
print()

print("5 minutes")
measure_inference_time_ast(model_ast_distil, extractor_ast_distil, sample_length_sec=300)
print()

1 minute


  0%|          | 0/10 [00:00<?, ?it/s]

Extraction: 0.19973981380462646 ± 0.04650025047313424
Inference: 0.005167722702026367 ± 0.0018990997426804638

2 minutes


  0%|          | 0/10 [00:00<?, ?it/s]

Extraction: 0.4113872289657593 ± 0.07677851878670704
Inference: 0.007021546363830566 ± 0.0022180525627791487

5 minutes


  0%|          | 0/10 [00:00<?, ?it/s]

Extraction: 0.9794687032699585 ± 0.17541534225596292
Inference: 0.009921550750732422 ± 0.004932944028655129



# Finding events

In [36]:
model = model_ast
extractor = extractor_ast
chunk_length_sec = 10
sampling_rate = extractor.sampling_rate

finder = event_finder.EventFinder(
    model.config,
    chunk_length_sec=chunk_length_sec,
    probability_threshold=0.2)

def find_events(audio_path):
    waveform_raw, source_sampling_rate = torchaudio.load(audio_path)
    waveform = preprocess.convert_audio(waveform_raw, source_sampling_rate,
                                        channels="mono", sampling_rate=sampling_rate)
    chunks = event_finder.chunk_audio(waveform, sampling_rate, chunk_length_sec=chunk_length_sec)

    features = extractor(chunks, sampling_rate, return_tensors="pt").to(DEVICE)

    with torch.no_grad():
        probs = torch.sigmoid(model(**features).logits)

    events = finder(probs)
    return probs, events

In [37]:
probs, events = find_events('/content/freesound_442485_dogs_barking_60sec.wav')
events

[('Dog', 0, 60)]

In [38]:
probs, events = find_events('/content/freesound_471408_birds_90sec.wav')
events

  return F.conv2d(input, weight, bias, self.stride,


[('Music', 0, 10),
 ('Crow', 0, 40),
 ('Bird vocalization, bird call, bird song', 40, 50),
 ('Crow', 50, 60),
 ('Bird vocalization, bird call, bird song', 60, 80),
 ('Crow', 80, 100)]

In [39]:
probs, events = find_events('/content/recorded_street_150sec.wav')
events

[('Vehicle', 10, 20),
 ('Speech', 10, 80),
 ('Music', 20, 30),
 ('Vehicle', 40, 90),
 ('Music', 80, 100),
 ('Speech', 90, 150)]

In [None]:
# probs, top_labels = find_events('/content/freesound_442485_dogs_barking_60sec.wav')
# top_classes = torch.argsort(probs, dim=-1, descending=True)[:, :5]
# for i in range(top_classes.shape[0]):
#     print([(model.config.id2label[id.item()], id.item(), probs[i, id].item()) for id in top_classes[i]])