In [1]:
!pip install torch torchaudio faiss-cpu transformers pillow tqdm



In [3]:
!apt install libomp-dev
!pip install faiss-cpu

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libomp-14-dev libomp5-14
Suggested packages:
  libomp-14-doc
The following NEW packages will be installed:
  libomp-14-dev libomp-dev libomp5-14
0 upgraded, 3 newly installed, 0 to remove and 35 not upgraded.
Need to get 738 kB of archives.
After this operation, 8,991 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 libomp5-14 amd64 1:14.0.0-1ubuntu1.1 [389 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 libomp-14-dev amd64 1:14.0.0-1ubuntu1.1 [347 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libomp-dev amd64 1:14.0-55~exp2 [3,074 B]
Fetched 738 kB in 1s (536 kB/s)
Selecting previously unselected package libomp5-14:amd64.
(Reading database ... 126374 files and directories currently installed.)
Preparing to unpack .../libomp5-14_1%3a14

In [4]:
import os
import requests
from io import BytesIO
from tqdm import tqdm

import torch
from torchaudio import transforms
from transformers import CLIPProcessor, CLIPModel
import faiss
import numpy as np

In [19]:
# Load CLIP
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Sample dataset of audio URLs
audio_urls = [
    "https://freeanimalsounds.org/download/95/?tmstv=1757038065",
    "https://freeanimalsounds.org/download/115/?tmstv=1757038065",
    "https://freeanimalsounds.org/download/124/?tmstv=1757038065",
    "https://freeanimalsounds.org/download/150/?tmstv=1757038065",
    "https://freeanimalsounds.org/download/165/?tmstv=1757038065",
    "https://freeanimalsounds.org/download/218/?tmstv=1757038065",
    "https://freeanimalsounds.org/download/220/?tmstv=1757038065",
    "https://freeanimalsounds.org/download/227/?tmstv=1757038065",
    "https://freeanimalsounds.org/download/223/?tmstv=1757038065",
    "https://freeanimalsounds.org/download/230/?tmstv=1757038065",
    "https://freeanimalsounds.org/download/258/?tmstv=1757038065",
    "https://freeanimalsounds.org/download/270/?tmstv=1757038065",
    "https://freeanimalsounds.org/download/271/?tmstv=1757038065",
    "https://freeanimalsounds.org/download/272/?tmstv=1757038065",
    "https://freeanimalsounds.org/download/250/?tmstv=1757038065"
]


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

In [20]:
os.makedirs("dataset", exist_ok=True)
audio_paths = []

# Download audio files
for i, url in enumerate(tqdm(audio_urls, desc="Downloading animal sounds")):
    response = requests.get(url)
    if response.status_code == 200:
        path = f"dataset/audi_{i}.mp3"
        with open(path, "wb") as f:
            f.write(response.content)
        audio_paths.append(path)

Downloading animal sounds: 100%|██████████| 15/15 [00:22<00:00,  1.52s/it]


In [21]:

from transformers import Wav2Vec2Model, Wav2Vec2Processor

device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "facebook/wav2vec2-base-960h"

processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name).to(device)


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
import librosa
import numpy as np

def get_audio_embedding(path):
    waveform, sr = librosa.load(path, sr=16000)
    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        hidden_states = outputs.last_hidden_state  # shape: (batch, time, features)
        embedding = hidden_states.mean(dim=1).squeeze().cpu().numpy()  # mean pooling
    return embedding


In [24]:
import faiss

embeddings = []
for path in tqdm(audio_paths, desc="Embedding audios"):
    vec = get_audio_embedding(path)
    embeddings.append(vec)

embeddings = np.stack(embeddings).astype("float32")

dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

faiss.write_index(index, "audio_index.faiss")
with open("audio_paths.txt", "w") as f:
      f.writelines([p + "\n" for p in audio_paths])

Embedding audios: 100%|██████████| 11/11 [00:19<00:00,  1.80s/it]


In [25]:
import torchaudio
def search_similar_audio(query_path, top_k=3):
    index = faiss.read_index("audio_index.faiss")
    with open("audio_paths.txt", "r") as f:
        paths = [line.strip() for line in f.readlines()]

    query_vec = get_audio_embedding(query_path).astype("float32").reshape(1, -1)
    distances, indices = index.search(query_vec, top_k)

    print(f"Query audio: {query_path}")
    print("Top matches:")
    for idx, dist in zip(indices[0], distances[0]):
        print(f"{paths[idx]} - Distance: {dist}")
        #torchaudio.io.play_audio(torchaudio.load(audio_urls[idx]))


In [26]:
search_similar_audio("/content/horse-neigh.mp3")

Query audio: /content/horse-neigh.mp3
Top matches:
dataset/audi_14.mp3 - Distance: 1.3530620336532593
dataset/audi_11.mp3 - Distance: 2.0406339168548584
dataset/audi_10.mp3 - Distance: 2.6627511978149414
