In [2]:
# install and import dependencies
#
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): no versions are pinned here, and pyaudio is installed but not
# imported by any cell visible in this notebook — confirm it is still needed.

%pip install transformers opensearch-py librosa torch torchvision torchaudio pyaudio

import ast
import csv
import json
import os

import IPython as ip
import librosa
from opensearchpy import OpenSearch
from opensearchpy.helpers import bulk
from transformers import pipeline, ClapModel, ClapProcessor, AutoTokenizer

Collecting pyaudio
  Using cached PyAudio-0.2.14.tar.gz (47 kB)
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Building wheels for collected packages: pyaudio
  Building wheel for pyaudio (pyproject.toml) ... done
  Created wheel for pyaudio: filename=PyAudio-0.2.14-cp39-cp39-macosx_14_0_arm64.whl size=25699 sha256=0bfe03f4aa3fe534a642710405dfb19810dbac759a39cdce08be478d54823fd6
  Stored in directory: /Users/ajwallace/Library/Caches/pip/wheels/28/d3/62/6ad369dc09fe82e1c9ceb83601a800eb305b901df7789aa550
Successfully built pyaudio
Installing collected packages: pyaudio
Successfully installed pyaudio-0.2.14
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Start the services from the local docker-compose file in detached mode (-d).
# NOTE(review): presumably this brings up the OpenSearch node that the later
# cells reach on localhost:9200 — confirm against the compose file.
!docker-compose up -d

In [None]:
# check opensearch connection
# Plain HTTP GET against localhost:9200, the same host and port the Python
# client cell connects to.

!curl -X GET http://localhost:9200

In [None]:
# create opensearch client

host, port = 'localhost', 9200

# SSL and certificate checks are switched off and no credentials are passed:
# local experimentation only, NOT to be used for production!
client = OpenSearch(
    hosts=[{'host': host, 'port': port}],
    http_compress=True,  # enables gzip compression for request bodies
    use_ssl=False,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)

# Printing the info response doubles as a connectivity check.
print(client.info())

In [None]:
# Create clap index if it doesn't already exist.

index_name = 'clap'

response = client.indices.exists(index=index_name)
print('\nDoes Index already exist?')
print(response)
if response:
  print('Skipping creating index')
else:
  # Load the index mappings and settings from disk; the context manager
  # closes the file handle even if the JSON fails to parse.
  with open('./clap_mapping.json') as f:
    index_mappings_and_settings = json.load(f)

  response = client.indices.create(index=index_name, body=index_mappings_and_settings)
  print('\nCreating index:')
  print(response)

In [None]:
# init ML processors/models/tokenizers
# Every object below is loaded from the same pretrained checkpoint id.
clap_checkpoint = "laion/larger_clap_music_and_speech"

audio_classifier = pipeline(
    task="zero-shot-audio-classification",
    model=clap_checkpoint,
)
model = ClapModel.from_pretrained(clap_checkpoint)
processor = ClapProcessor.from_pretrained(clap_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(clap_checkpoint)

In [None]:
# function to create audio embedding

def embed_audio(filepath, sampling_rate=48000):
    """Compute the CLAP audio embedding for one audio file.

    Args:
        filepath: Path of the audio file to load.
        sampling_rate: Rate (Hz) the audio is resampled to on load and that is
            declared to the processor. Defaults to 48000, the value the
            processor was previously always told.

    Returns:
        A numpy array holding the result of ``model.get_audio_features``;
        callers in this notebook take element ``[0]`` as the embedding vector.
    """
    # librosa.load resamples to 22050 Hz unless `sr` is given. The processor
    # must be told the rate the samples really have, so load at that rate
    # instead of silently mislabelling 22050 Hz audio as 48000 Hz.
    y, _ = librosa.load(filepath, sr=sampling_rate)
    inputs = processor(audios=y, return_tensors="pt", sampling_rate=sampling_rate)
    audio_embed = model.get_audio_features(**inputs)
    # detach() drops the autograd graph so the tensor can be converted to numpy.
    return audio_embed.detach().numpy()

In [None]:
# Accumulator for documents that are waiting to be bulk indexed
bulk_docs = []
# number of documents collected before a bulk request is sent
batch_size = 100


def bulk_index_documents(documents):
    """Index every document in ``documents`` into ``index_name`` with one bulk call.

    Each document becomes the ``_source`` of an action targeting ``index_name``.
    No ``_id`` is set on the actions.
    """
    actions = [{"_index": index_name, "_source": doc} for doc in documents]
    bulk(client, actions)

In [None]:
# FMA Audio Set

# define range for fma sub directories
start_fma_directory = 134
end_fma_directory = 155

# Define the common part of fma directory path
base_directory = '../audio_data/fma/data/fma_small/'

# csv with metadata for fma tracks
fma_metadata = '../audio_data/fma/data/fma_metadata/raw_tracks.csv'


def _parse_fma_genres(raw_genres):
    """Return the list of genre titles encoded in one ``track_genres`` cell.

    NOTE(review): the cell is presumably a Python-literal list of dicts with a
    'genre_title' key (the previous code swapped single for double quotes
    before JSON-decoding it) — confirm against the CSV. An empty cell yields
    an empty list; an unparseable one is reported and also yields an empty list.
    """
    if not raw_genres or not raw_genres.strip():
        return []
    try:
        parsed = ast.literal_eval(raw_genres)
    except (ValueError, SyntaxError):
        # Fall back to the earlier quote-swapping JSON parse so input that
        # used to load (e.g. JSON null/true/false) keeps loading.
        try:
            parsed = json.loads(raw_genres.replace("'", '"'))
        except ValueError:
            print("Could not parse genres:", raw_genres)
            return []
    return [genre['genre_title'] for genre in parsed]


# Read CSV file into a dictionary for easy lookup, keyed by track_id.
# newline='' is what the csv module asks for so that quoted fields containing
# line breaks are parsed correctly.
fma_mapping = {}
with open(fma_metadata, 'r', newline='', encoding='utf-8') as f:
    for row in csv.DictReader(f):
        fma_mapping[row['track_id']] = {
            'artist': row['artist_name'],
            'title': row['track_title'],
            'album': row['album_title'],
            'genres': row['track_genres'],
        }

# Start from an empty batch so documents left over from a previously run cell
# are never indexed a second time.
bulk_docs = []

# Iterate over all audio files in the directory and generate es doc
for directory_number in range(start_fma_directory, end_fma_directory + 1):
    # Construct the directory path (sub directories are zero-padded to 3 digits)
    fma_directory = os.path.join(base_directory, f"{directory_number:03d}")

    for filename in os.listdir(fma_directory):
        # Filter first: anything that is not an .mp3 has no metadata row and
        # would otherwise fail the lookup below.
        if not filename.endswith(".mp3"):
            continue

        # File names are zero-padded; the metadata is looked up by the
        # unpadded stem.
        track_id = os.path.splitext(filename)[0].lstrip('0')
        metadata = fma_mapping.get(track_id)
        if metadata is None:
            print("Skipping, no metadata for track_id:", track_id)
            continue

        filepath = os.path.join(fma_directory, filename)
        print("Processing:", filepath)
        print("track_id: ", track_id)

        arr = embed_audio(filepath)

        doc = {
            "audio_embedding": arr[0],
            "audio_set": "fma",
            "title": metadata['title'],
            "artist": metadata['artist'],
            "album": metadata['album'],
            "track_id": track_id,
            "genres": _parse_fma_genres(metadata['genres']),
            "filepath": filepath,
        }

        # Add document to bulk indexing list
        bulk_docs.append(doc)

        # Perform bulk indexing if batch size is reached
        if len(bulk_docs) >= batch_size:
            bulk_index_documents(bulk_docs)
            bulk_docs = []

# Index any remaining documents, then clear the batch so a later cell does not
# send them again.
if bulk_docs:
    bulk_index_documents(bulk_docs)
    bulk_docs = []

print("Bulk indexing completed.")


In [None]:
# Vocal Imitations Audio Set

# Iterate over all audio files in the directory and generate es doc
vocal_imitations_directory = '../audio_data/vocal_imitations/included/'

# Start from an empty batch so documents left over from a previously run cell
# are never indexed a second time.
bulk_docs = []

for filename in os.listdir(vocal_imitations_directory):
    if not filename.endswith(".wav"):
        continue

    # The track id is the file name without its ".wav" suffix.
    track_id = filename[:-4]
    filepath = os.path.join(vocal_imitations_directory, filename)
    print("Processing:", filepath)
    print("track_id: ", track_id)

    arr = embed_audio(filepath)

    doc = {
        "audio_embedding": arr[0],
        "audio_set": "vocal_imitations",
        "track_id": track_id,
        "filepath": filepath,
    }

    # Add document to bulk indexing list
    bulk_docs.append(doc)

    # Perform bulk indexing if batch size is reached
    if len(bulk_docs) >= batch_size:
        bulk_index_documents(bulk_docs)
        bulk_docs = []

# Index any remaining documents, then clear the batch so a later cell does not
# send them again.
if bulk_docs:
    bulk_index_documents(bulk_docs)
    bulk_docs = []

print("Bulk indexing completed.")

In [None]:
# FUSS Audio Set

# Iterate over all audio files in the directory and generate es doc
fuss_directory = '../audio_data/FUSS/source_pure/fsd_data/train/sound'

# Start from an empty batch so documents left over from a previously run cell
# are never indexed a second time.
bulk_docs = []

for filename in os.listdir(fuss_directory):
    if not filename.endswith(".wav"):
        continue

    # The track id is the file name without its ".wav" suffix.
    track_id = filename[:-4]
    filepath = os.path.join(fuss_directory, filename)
    print("Processing:", filepath)
    print("track_id: ", track_id)

    arr = embed_audio(filepath)

    doc = {
        "audio_embedding": arr[0],
        "audio_set": "fuss",
        "track_id": track_id,
        "filepath": filepath,
    }

    # Add document to bulk indexing list
    bulk_docs.append(doc)

    # Perform bulk indexing if batch size is reached
    if len(bulk_docs) >= batch_size:
        bulk_index_documents(bulk_docs)
        bulk_docs = []

# Index any remaining documents, then clear the batch so a later cell does not
# send them again.
if bulk_docs:
    bulk_index_documents(bulk_docs)
    bulk_docs = []

print("Bulk indexing completed.")

In [None]:
# Check how many docs have audio_embedding
query = {
  # Only the total is printed, so do not fetch any documents (each one
  # carries a full embedding vector).
  'size': 0,
  # Ask for an exact total; without this the reported total is a lower bound
  # once it reaches 10,000.
  'track_total_hits': True,
  'query': {
    'exists': {'field': 'audio_embedding'}
  }
}

response = client.search(
    body = query,
    index = index_name
)
print('\nSearch results:')
print(response['hits']['total'])

In [None]:
# similarity search with text input against audio_embeddings

# Turn the typed text into a vector with the model's text-feature head.
search_text = input('type a search query: ')
text_data = tokenizer([search_text], padding=True, return_tensors="pt")
text_embed = model.get_text_features(**text_data)
text_arr = text_embed.detach().numpy()[0]

# k-NN request against the audio_embedding field; 'size' limits how many hits
# come back in the response.
knn_clause = {'audio_embedding': {'k': 10, 'vector': text_arr}}
query = {
  'size': 5,
  'query': {'knn': knn_clause},
}

response = client.search(body=query, index=index_name)
print('\nSearch results:')
hits = response['hits']['hits']

def displayResults(hits):
  """Render search hits in the notebook.

  For each hit, in order: the title and genres (only when the source has
  both), the score, and an audio player for the source's filepath.
  """
  show = ip.display.display
  for hit in hits:
    source = hit['_source']
    has_track_info = 'title' in source and 'genres' in source
    if has_track_info:
      show(source['title'], source['genres'])
    show(hit['_score'])
    show(ip.display.Audio(source['filepath']))

displayResults(hits)
