References

- [AudioSet](https://research.google.com/audioset/)

- [AST on HuggingFace](https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593)

- [AST Paper](https://arxiv.org/pdf/2104.01778.pdf)

# Imports, installs, etc.

In [1]:
!pip install -qq transformers

In [2]:
from tqdm.notebook import tqdm

import torch
import torchaudio

from transformers import AutoFeatureExtractor, ASTForAudioClassification

In [24]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(DEVICE)

cuda


# Samples

In [3]:
# Copy every file from the project's samples folder into the current working directory.
# NOTE(review): the ./drive/MyDrive prefix looks like a mounted Google Drive — confirm it is mounted first.
!cp ./drive/MyDrive/Projects/MiniSoundFinder_v2/samples/* .

In [4]:
# Clip used for the single-file walkthrough in the following cells.
sample_path = 'freesound_442485_dogs_barking_60sec.wav'
# Print the file's metadata (sample rate, frame count, channel count, encoding) before loading it.
print(torchaudio.info(sample_path))

AudioMetaData(sample_rate=48000, num_frames=2847537, num_channels=2, bits_per_sample=24, encoding=PCM_S)


In [31]:
# Load the clip together with its native sampling rate; the displayed shape is (channels, frames).
waveform, sample_rate = torchaudio.load(sample_path)
waveform.shape

torch.Size([2, 2847537])

In [56]:
# Sampling rate (Hz) that `preprocess` resamples audio to by default.
TARGET_SAMPLE_RATE = 16000

def downmix_to_mono(waveform):
    """Average the channels of a waveform into a single mono signal.

    Args:
        waveform: Tensor laid out as ``(channels, frames)``, or an already-mono
            1-D tensor of shape ``(frames,)``.

    Returns:
        A 1-D tensor of shape ``(frames,)``. Multi-channel input is averaged
        over the channel axis; 1-D input is returned unchanged.
    """
    # Averaging a 1-D signal over dim 0 would collapse it to a single scalar,
    # so an input that is already mono is passed through as-is.
    if waveform.dim() == 1:
        return waveform
    return waveform.mean(dim=0)

def preprocess(waveform, sampling_rate, target_sampling_rate=TARGET_SAMPLE_RATE):
    """Downmix a waveform to mono, then resample it to the target rate.

    Args:
        waveform: Audio tensor passed to ``downmix_to_mono``.
        sampling_rate: Sampling rate of ``waveform``.
        target_sampling_rate: Sampling rate of the returned signal; defaults
            to ``TARGET_SAMPLE_RATE``.

    Returns:
        The mono signal resampled by ``torchaudio.functional.resample``.
    """
    mono = downmix_to_mono(waveform)
    resampled = torchaudio.functional.resample(
        mono,
        sampling_rate,
        target_sampling_rate,
    )
    return resampled

In [7]:
# Downmix and resample the loaded clip using the default target sampling rate.
wf_prep = preprocess(waveform, sample_rate)

In [8]:
# Sample count after preprocessing (1-D: the channel axis has been averaged away).
wf_prep.shape

torch.Size([949179])

In [35]:
# Clip duration in seconds: number of samples divided by the sampling rate.
wf_prep.shape[0] / TARGET_SAMPLE_RATE

59.3236875

# Model

In [27]:
# Load the feature extractor published with the AST AudioSet checkpoint and display its configuration.
feature_extractor = AutoFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
feature_extractor

ASTFeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "ASTFeatureExtractor",
  "feature_size": 1,
  "max_length": 1024,
  "mean": -4.2677393,
  "num_mel_bins": 128,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000,
  "std": 4.5689974
}

In [26]:
# Load the AST audio classifier from the pretrained checkpoint, move it to DEVICE, and display its architecture.
model = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593").to(DEVICE)
model

ASTForAudioClassification(
  (audio_spectrogram_transformer): ASTModel(
    (embeddings): ASTEmbeddings(
      (patch_embeddings): ASTPatchEmbeddings(
        (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ASTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ASTLayer(
          (attention): ASTAttention(
            (attention): ASTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ASTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ASTIntermediate(
            (de

In [40]:
# Turn the preprocessed waveform into model inputs and move them to DEVICE.
# NOTE(review): the result has 1024 frames (the extractor's max_length) although the clip is ~59 s long —
# presumably the extractor truncates the input, so only the start of the clip is scored here; confirm.
sample_features = feature_extractor(wf_prep, TARGET_SAMPLE_RATE, return_tensors="pt").to(DEVICE)
sample_features['input_values'].shape

torch.Size([1, 1024, 128])

In [29]:
# Inference only, so gradient tracking is disabled; the element-wise sigmoid maps each class logit to (0, 1).
with torch.no_grad():
    probs = torch.sigmoid(model(**sample_features).logits)

In [30]:
# Ten highest-scoring classes of the (single) clip, best first. topk avoids sorting every class.
top_classes = torch.topk(probs[0], k=10).indices
# One tolist() transfer instead of an .item() call per index; `class_id` avoids shadowing the builtin `id`.
top_labels = [
    (model.config.id2label[class_id], probs[0, class_id].item())
    for class_id in top_classes.tolist()
]
top_labels

[('Dog', 0.7794268727302551),
 ('Animal', 0.7416858673095703),
 ('Domestic animals, pets', 0.6783862113952637),
 ('Bark', 0.6013599038124084),
 ('Bow-wow', 0.4377628266811371),
 ('Canidae, dogs, wolves', 0.19103215634822845),
 ('Yip', 0.10324634611606598),
 ('Whimper (dog)', 0.06323514133691788),
 ('Vehicle', 0.025787750259041786),
 ('Growling', 0.019477874040603638)]

In [63]:
# Index of the best-scoring class; this is a 0-d tensor, not a Python int (use .item() for the plain integer).
top_classes[0]

tensor(74, device='cuda:0')

# Event finder

In [80]:
class EventFinder:
    """Label consecutive fixed-length segments of an audio file with their top sound class.

    The file is loaded, run through ``preprocess`` (downmix + resample to the
    feature extractor's sampling rate), cut into consecutive segments of
    ``segment_length_sec`` seconds, and every segment is scored by the model.

    Args:
        feature_extractor: Feature extractor exposing ``sampling_rate`` and
            callable as ``feature_extractor(segments, sampling_rate, return_tensors="pt")``.
        model: Classifier whose output exposes ``.logits`` and whose
            ``config.id2label`` maps class indices to label strings.
        segment_length_sec: Duration of one segment in seconds; must be positive.

    Raises:
        ValueError: If ``segment_length_sec`` is not positive.
    """

    def __init__(self, feature_extractor, model, segment_length_sec):
        if segment_length_sec <= 0:
            raise ValueError(
                f"segment_length_sec must be positive, got {segment_length_sec!r}"
            )
        self.feature_extractor = feature_extractor
        self.target_sampling_rate = self.feature_extractor.sampling_rate
        self.model = model
        self.segment_length_sec = segment_length_sec

    def compute_probabilities(self, audio_path):
        """Score every segment of the audio file at ``audio_path``.

        Args:
            audio_path: Path of the audio file to load with ``torchaudio.load``.

        Returns:
            Tensor with one row per segment holding the sigmoid of the model's
            logits for each class.

        Raises:
            ValueError: If the file contains no audio samples.
        """
        waveform_raw, sampling_rate = torchaudio.load(audio_path)
        waveform = preprocess(waveform_raw, sampling_rate, self.target_sampling_rate)
        if waveform.numel() == 0:
            raise ValueError(f"No audio samples found in {audio_path!r}")

        # torch.split requires an integer chunk size, but segment_length_sec may be
        # fractional (e.g. 2.5 s), so round to a whole number of samples.
        segment_length = max(1, int(round(self.segment_length_sec * self.target_sampling_rate)))
        # The final segment may be shorter than the others.
        segments = [s.numpy() for s in torch.split(waveform, segment_length)]

        # Place the inputs on the same device as the model this instance was given,
        # rather than relying on notebook-level globals.
        features = self.feature_extractor(
            segments, self.target_sampling_rate, return_tensors="pt"
        ).to(self.model.device)
        with torch.no_grad():
            probs = torch.sigmoid(self.model(**features).logits)
        return probs

    def find_events(self, audio_path):
        """Return per-segment probabilities and the top label of each segment.

        Args:
            audio_path: Path of the audio file to analyse.

        Returns:
            Tuple ``(probs, top_labels)``: the tensor from
            ``compute_probabilities`` and a list with the highest-probability
            label of each segment, in segment order.
        """
        probs = self.compute_probabilities(audio_path)
        # Only the best class per segment is needed, so argmax replaces a full sort.
        top_classes = probs.argmax(dim=-1)
        id2label = self.model.config.id2label
        # A single tolist() transfer instead of one .item() sync per segment.
        top_labels = [id2label[class_id] for class_id in top_classes.tolist()]
        return probs, top_labels

finder = EventFinder(feature_extractor, model,
                     segment_length_sec=10)
# Reuse sample_path (the dog-barking clip in the working directory) instead of a hard-coded /content/ path.
probs, top_labels = finder.find_events(sample_path)
top_labels

['Dog', 'Dog', 'Dog', 'Animal', 'Dog', 'Dog']

In [83]:
# Relative path instead of an absolute /content/ path; assumes this clip was among the
# samples copied into the working directory.
probs, top_labels = finder.find_events('recorded_street_150sec.wav')
top_labels

['Vehicle',
 'Traffic noise, roadway noise',
 'Speech',
 'Speech',
 'Speech',
 'Traffic noise, roadway noise',
 'Speech',
 'Speech',
 'Vehicle',
 'Speech',
 'Speech',
 'Speech',
 'Speech',
 'Speech',
 'Speech']

In [84]:
# Relative path instead of an absolute /content/ path; assumes this clip was among the
# samples copied into the working directory.
probs, top_labels = finder.find_events('freesound_471408_birds_90sec.wav')
top_labels

['Music',
 'Bird',
 'Crow',
 'Crow',
 'Environmental noise',
 'Crow',
 'Bird',
 'Environmental noise',
 'Crow',
 'Caw']