Reference

- [AudioSet](https://research.google.com/audioset/)

More model ideas

- Other (smaller) versions of AST

- https://huggingface.co/topel/ConvNeXt-Tiny-AT

- https://huggingface.co/search/full-text?q=audioset&p=1&type=model

- https://paperswithcode.com/paper/efficient-large-scale-audio-tagging-via

- https://paperswithcode.com/paper/dynamic-convolutional-neural-networks-as

- https://paperswithcode.com/paper/panns-large-scale-pretrained-audio-neural-1

# Imports, installs, etc.

In [1]:
!pip install -qq transformers

In [2]:
import time

import numpy as np

from tqdm.notebook import tqdm

import torch
import torchaudio

In [3]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(DEVICE)

cuda


In [4]:
# TODO: move to the library?

def downmix_to_mono(waveform):
    return waveform.mean(dim=0)

def preprocess(waveform, sampling_rate, target_sampling_rate):
    waveform = downmix_to_mono(waveform)
    return torchaudio.functional.resample(waveform, sampling_rate, target_sampling_rate)

In [5]:
!cp ./drive/MyDrive/Projects/MiniSoundFinder_v2/samples/* .

In [6]:
sample_path = 'freesound_442485_dogs_barking_60sec.wav'
print(torchaudio.info(sample_path))

AudioMetaData(sample_rate=48000, num_frames=2847537, num_channels=2, bits_per_sample=24, encoding=PCM_S)


In [7]:
waveform, sampling_rate = torchaudio.load(sample_path)
waveform.shape

torch.Size([2, 2847537])

In [8]:
wf_prep = preprocess(waveform, sampling_rate, 16000)
wf_prep.shape

torch.Size([949179])

# Models

## AST

- [AST on HuggingFace](https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593)

- [AST Paper](https://arxiv.org/pdf/2104.01778.pdf)

In [9]:
from transformers import AutoFeatureExtractor, ASTForAudioClassification

In [10]:
extractor_ast = AutoFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
extractor_ast

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


ASTFeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "ASTFeatureExtractor",
  "feature_size": 1,
  "max_length": 1024,
  "mean": -4.2677393,
  "num_mel_bins": 128,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000,
  "std": 4.5689974
}

In [11]:
model_ast = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593").to(DEVICE)
model_ast

ASTForAudioClassification(
  (audio_spectrogram_transformer): ASTModel(
    (embeddings): ASTEmbeddings(
      (patch_embeddings): ASTPatchEmbeddings(
        (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ASTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ASTLayer(
          (attention): ASTAttention(
            (attention): ASTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ASTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ASTIntermediate(
            (de

In [13]:
target_sampling_rate = extractor_ast.sampling_rate
wf_prep = preprocess(waveform, sampling_rate, target_sampling_rate)
sample_features = extractor_ast(wf_prep, target_sampling_rate, return_tensors="pt").to(DEVICE)
sample_features['input_values'].shape

torch.Size([1, 1024, 128])

In [14]:
with torch.no_grad():
    probs = torch.sigmoid(model_ast(**sample_features).logits)

In [15]:
top_classes = torch.argsort(probs, dim=-1, descending=True).flatten()[:10]
top_labels = [(model_ast.config.id2label[id.item()], probs[0, id].item()) for id in top_classes]
top_labels

[('Dog', 0.7794266939163208),
 ('Animal', 0.7416857481002808),
 ('Domestic animals, pets', 0.6783860325813293),
 ('Bark', 0.6013593673706055),
 ('Bow-wow', 0.4377628266811371),
 ('Canidae, dogs, wolves', 0.19103211164474487),
 ('Yip', 0.10324634611606598),
 ('Whimper (dog)', 0.06323516368865967),
 ('Vehicle', 0.025787750259041786),
 ('Growling', 0.019477838650345802)]

In [16]:
def measure_inference_time_ast(model, feature_extractor,
                               sample_length_sec=60,
                               repeats=20,
                               chunk_length_sec=10):

    sampling_rate = feature_extractor.sampling_rate
    sample_length = sampling_rate * sample_length_sec

    extr_times = []
    inf_times = []
    for i in range(repeats):
        extr_start = time.time()
        wf = torch.distributions.uniform.Uniform(-10000, 10000).sample((sample_length,))
        chunk_length = chunk_length_sec * sampling_rate
        chunks = [c.numpy() for c in torch.split(wf, chunk_length)]
        inp = feature_extractor(chunks, sampling_rate, return_tensors="pt").to(DEVICE)
        extr_times.append(time.time() - extr_start)

        inf_start = time.time()
        with torch.no_grad():
            probs = torch.sigmoid(model(**inp).logits)
        inf_times.append(time.time() - inf_start)

    print("Extraction:", np.mean(extr_times), "±", np.std(extr_times))
    print("Inference:", np.mean(inf_times), "±", np.std(inf_times))

print("1 minute")
measure_inference_time_ast(model_ast, extractor_ast, sample_length_sec=60)
print()

print("2 minutes")
measure_inference_time_ast(model_ast, extractor_ast, sample_length_sec=120)
print()

print("5 minutes")
measure_inference_time_ast(model_ast, extractor_ast, sample_length_sec=300)
print()

1 minute
Extraction: 0.49896767139434817 ± 0.05701984130123135
Inference: 0.06294565200805664 ± 0.12200481841226771

2 minutes
Extraction: 1.0849987626075746 ± 0.13093320682879783
Inference: 0.03713115453720093 ± 0.017910840644628657

5 minutes
Extraction: 2.624024844169617 ± 0.5936241239571343
Inference: 0.15126835107803344 ± 0.6031503484581417



## AST Distilled

https://huggingface.co/bookbot/distil-ast-audioset

In [23]:
extractor_ast_distil = AutoFeatureExtractor.from_pretrained("bookbot/distil-ast-audioset")
extractor_ast_distil

ASTFeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "ASTFeatureExtractor",
  "feature_size": 1,
  "max_length": 1024,
  "mean": -4.2677393,
  "num_mel_bins": 128,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000,
  "std": 4.5689974
}

In [18]:
model_ast_distil = ASTForAudioClassification.from_pretrained("bookbot/distil-ast-audioset").to(DEVICE)
model_ast_distil

ASTForAudioClassification(
  (audio_spectrogram_transformer): ASTModel(
    (embeddings): ASTEmbeddings(
      (patch_embeddings): ASTPatchEmbeddings(
        (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ASTEncoder(
      (layer): ModuleList(
        (0-5): 6 x ASTLayer(
          (attention): ASTAttention(
            (attention): ASTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ASTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ASTIntermediate(
            (dens

In [19]:
target_sampling_rate = extractor_ast_distil.sampling_rate
wf_prep = preprocess(waveform, sampling_rate, target_sampling_rate)
sample_features = extractor_ast_distil(wf_prep, target_sampling_rate, return_tensors="pt").to(DEVICE)

with torch.no_grad():
    probs = torch.sigmoid(model_ast_distil(**sample_features).logits)

top_classes = torch.argsort(probs, dim=-1, descending=True).flatten()[:10]
top_labels = [(model_ast.config.id2label[id.item()], probs[0, id].item()) for id in top_classes]
top_labels

[('Animal', 0.8284617066383362),
 ('Dog', 0.7962838411331177),
 ('Domestic animals, pets', 0.7366085052490234),
 ('Bark', 0.5144543051719666),
 ('Bow-wow', 0.4676071107387543),
 ('Speech', 0.3647525906562805),
 ('Canidae, dogs, wolves', 0.18395470082759857),
 ('Yip', 0.1634257733821869),
 ('Whimper (dog)', 0.15281470119953156),
 ('Growling', 0.05686230957508087)]

In [21]:
print("1 minute")
measure_inference_time_ast(model_ast_distil, extractor_ast_distil, sample_length_sec=60)
print()

print("2 minutes")
measure_inference_time_ast(model_ast_distil, extractor_ast_distil, sample_length_sec=120)
print()

print("5 minutes")
measure_inference_time_ast(model_ast_distil, extractor_ast_distil, sample_length_sec=300)
print()

1 minute
Extraction: 0.27698051929473877 ± 0.03136304936199706
Inference: 0.01278308629989624 ± 0.004615448690231693

2 minutes
Extraction: 0.5859007120132447 ± 0.06715868676346086
Inference: 0.01448047161102295 ± 0.007278011488104042

5 minutes
Extraction: 1.4067103147506714 ± 0.1870556772212367
Inference: 0.008913743495941161 ± 0.004447023831099555



# Event finder

In [24]:
# TODO: move to the library?

def continuos_segments(values):
    start = -1
    for i in range(len(values)):
        if values[i]:
            if start == -1:
                start = i
        else:
            if start != -1:
                yield (start, i)
            start = -1
    if start != -1:
        yield (start, len(values))


class EventFinder:
    def __init__(self, feature_extractor, model, chunk_length_sec, prob_threshold=0.3):
        self.feature_extractor = feature_extractor
        self.target_sampling_rate = self.feature_extractor.sampling_rate
        self.model = model
        self.chunk_length_sec = chunk_length_sec

        # TMP
        self.event_classes = [
            74,   # Dog
            137,  # Music
            300,  # Vehicle
            0,    # Speech
            117,  # Crow
            112,  # Bird vocalization, bird call, bird song
        ]
        self.prob_threshold = prob_threshold

    def compute_probabilities(self, audio_path):
        waveform_raw, sampling_rate = torchaudio.load(audio_path)
        waveform = preprocess(waveform_raw, sampling_rate, self.target_sampling_rate)

        chunk_length = self.chunk_length_sec * self.target_sampling_rate
        chunks = [c.numpy() for c in torch.split(waveform, chunk_length)]

        features = self.feature_extractor(chunks, self.target_sampling_rate, return_tensors="pt").to(DEVICE)
        with torch.no_grad():
            probs = torch.sigmoid(self.model(**features).logits)
        return probs

    def find_events(self, audio_path):
        probs = self.compute_probabilities(audio_path)

        events = []
        for class_ind in self.event_classes:
            class_name = self.model.config.id2label[class_ind]
            class_parts = probs[:, class_ind] >= self.prob_threshold
            for begin, end in continuos_segments(class_parts):
                begin_sec = begin * self.chunk_length_sec
                end_sec = end * self.chunk_length_sec
                events.append((class_name, begin_sec, end_sec))

        events.sort(key=lambda e: e[1])

        return probs, events



finder = EventFinder(extractor_ast, model_ast,
                     chunk_length_sec=10, prob_threshold=0.2)

In [25]:
# probs, top_labels = finder.find_events('/content/freesound_442485_dogs_barking_60sec.wav')
# top_classes = torch.argsort(probs, dim=-1, descending=True)[:, :5]
# for i in range(top_classes.shape[0]):
#     print([(model.config.id2label[id.item()], id.item(), probs[i, id].item()) for id in top_classes[i]])

In [26]:
probs, events = finder.find_events('/content/freesound_442485_dogs_barking_60sec.wav')
events

[('Dog', 0, 60)]

In [27]:
probs, events = finder.find_events('/content/freesound_471408_birds_90sec.wav')
events

[('Music', 0, 10),
 ('Crow', 0, 40),
 ('Bird vocalization, bird call, bird song', 40, 50),
 ('Crow', 50, 60),
 ('Bird vocalization, bird call, bird song', 60, 80),
 ('Crow', 80, 100)]

In [28]:
probs, events = finder.find_events('/content/recorded_street_150sec.wav')
events

[('Vehicle', 10, 20),
 ('Speech', 10, 80),
 ('Music', 20, 30),
 ('Vehicle', 40, 90),
 ('Music', 80, 100),
 ('Speech', 90, 150)]