# Imports, installs, etc.

In [1]:
!pip install -qq transformers

In [5]:
from tqdm.notebook import tqdm

import torch
import torchaudio

from transformers import AutoFeatureExtractor, ASTForAudioClassification

# Samples

In [3]:
# Copy every file from the project's samples folder into the current working directory.
# NOTE(review): assumes Google Drive is already mounted under ./drive — no mount
# step is visible in this part of the notebook; confirm before a fresh run.
!cp ./drive/MyDrive/Projects/MiniSoundFinder_v2/samples/* .

In [8]:
# Audio clip that the cells below load and classify.
sample_path = 'freesound_442485_dogs_barking_60sec.wav'

# Read the file's metadata (sample rate, frame count, channels, ...) before decoding it.
sample_info = torchaudio.info(sample_path)
print(sample_info)

AudioMetaData(sample_rate=48000, num_frames=2847537, num_channels=2, bits_per_sample=24, encoding=PCM_S)


In [17]:
# Decode the clip: torchaudio hands back the samples together with the file's sample rate.
(waveform, sample_rate) = torchaudio.load(sample_path)

# Show the tensor dimensions of the decoded audio.
waveform.size()

torch.Size([2, 2847537])

In [20]:
# Sample rate that clips are resampled to, and that is reported to the feature extractor.
TARGET_SAMPLE_RATE = 16_000

def downmix_to_mono(waveform):
    """Collapse a multi-channel waveform to a single channel by averaging.

    Args:
        waveform: Tensor laid out as (channels, frames), or an already-mono
            1-D tensor of shape (frames,).

    Returns:
        A 1-D tensor of shape (frames,). A 1-D input is returned unchanged.
    """
    if waveform.dim() == 1:
        # Already mono: averaging over dim 0 here would reduce the samples
        # themselves to a single scalar instead of merging channels.
        return waveform
    # Average across the channel dimension, keeping one value per frame.
    return waveform.mean(dim=0)

def preprocess(waveform, sample_rate):
    """Prepare a loaded clip for the model: mono downmix, then resampling.

    Args:
        waveform: Audio tensor, passed straight to ``downmix_to_mono``.
        sample_rate: Sample rate of ``waveform``.

    Returns:
        The downmixed waveform resampled to ``TARGET_SAMPLE_RATE``.
    """
    mono = downmix_to_mono(waveform)
    resampled = torchaudio.functional.resample(
        mono,
        orig_freq=sample_rate,
        new_freq=TARGET_SAMPLE_RATE,
    )
    return resampled

In [21]:
# Run the loaded clip through the mono-downmix + resample pipeline.
wf_prep = preprocess(waveform=waveform, sample_rate=sample_rate)

In [24]:
# Check the preprocessed tensor's dimensions (channel axis gone, fewer frames after resampling).
wf_prep.size()

torch.Size([949179])

# Model

In [25]:
# Name of the pretrained checkpoint whose feature-extractor configuration is loaded here.
AST_CHECKPOINT = "MIT/ast-finetuned-audioset-10-10-0.4593"
feature_extractor = AutoFeatureExtractor.from_pretrained(AST_CHECKPOINT)

# Display the extractor's configuration.
feature_extractor

ASTFeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "ASTFeatureExtractor",
  "feature_size": 1,
  "max_length": 1024,
  "mean": -4.2677393,
  "num_mel_bins": 128,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000,
  "std": 4.5689974
}

In [26]:
# Load the audio classifier from the same checkpoint name as the feature extractor.
model = ASTForAudioClassification.from_pretrained(
    pretrained_model_name_or_path="MIT/ast-finetuned-audioset-10-10-0.4593",
)

# Display the module tree.
model

ASTForAudioClassification(
  (audio_spectrogram_transformer): ASTModel(
    (embeddings): ASTEmbeddings(
      (patch_embeddings): ASTPatchEmbeddings(
        (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ASTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ASTLayer(
          (attention): ASTAttention(
            (attention): ASTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ASTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ASTIntermediate(
            (de

In [32]:
# Turn the preprocessed waveform into the model's input tensor.
# NOTE(review): the extractor is configured with max_length=1024 and the result
# has exactly 1024 frames, so this ~949k-sample clip is presumably truncated —
# confirm that classifying only part of the clip is intended.
sample_features = feature_extractor(
    wf_prep,
    sampling_rate=TARGET_SAMPLE_RATE,
    return_tensors="pt",
)

# Dimensions of the extracted features.
sample_features['input_values'].size()

torch.Size([1, 1024, 128])

In [33]:
# Forward pass with gradient tracking switched off.
with torch.no_grad():
    outputs = model(**sample_features)
    logits = outputs.logits

In [34]:
# How many of the highest-scoring labels to report.
TOP_K = 10

# topk returns the k largest logits in descending order, so the whole class
# axis no longer has to be sorted just to keep its head. k is clamped so a
# model with fewer than TOP_K classes does not raise.
top_k = min(TOP_K, logits.shape[-1])
# flatten() drops the batch dimension; the slice keeps only the first row's
# classes should `logits` ever hold more than one item.
top_classes = torch.topk(logits, k=top_k, dim=-1).indices.flatten()[:top_k]

# Map class indices to their human-readable names (`class_id`, not `id`, to
# avoid shadowing the builtin).
top_labels = [model.config.id2label[class_id.item()] for class_id in top_classes]
top_labels

['Dog',
 'Animal',
 'Domestic animals, pets',
 'Bark',
 'Bow-wow',
 'Canidae, dogs, wolves',
 'Yip',
 'Whimper (dog)',
 'Vehicle',
 'Growling']