In [1]:
pip install laion-clap

Collecting laion-clap
  Downloading laion_clap-1.1.7-py3-none-any.whl.metadata (26 kB)
Collecting torchlibrosa (from laion-clap)
  Downloading torchlibrosa-0.1.0-py3-none-any.whl.metadata (3.5 kB)
Collecting ftfy (from laion-clap)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting braceexpand (from laion-clap)
  Downloading braceexpand-0.1.7-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting webdataset (from laion-clap)
  Downloading webdataset-1.0.2-py3-none-any.whl.metadata (12 kB)
Collecting wget (from laion-clap)
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting progressbar (from laion-clap)
  Downloading progressbar-2.5.tar.gz (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers->laion-clap)
  Downloading huggingface_hub-0.35.3-py3-none-any.whl.metadata (14 kB)
Downloading laion_clap-1.1.7-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━

In [2]:
import json
import logging
import os
import pandas as pd

from laion_clap import CLAP_Module
import torch
import torch.hub

logger = logging.getLogger(__name__)

2025-10-20 02:47:23.754659: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760928443.958401      77 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760928444.023712      77 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

In [3]:
import torch
import torch.nn.functional as F


def average_embeddings(
    embeddings: torch.Tensor, normalize: bool = True
) -> torch.Tensor:
    """
    Compute the averaged embedding of a set of embeddings using best practices.
    """
    if embeddings.ndim == 3 and embeddings.shape[1] == 1:
        embeddings = embeddings.squeeze(1)  # Convert (N, 1, D) → (N, D)
    elif embeddings.ndim != 2:
        raise ValueError(f"Expected embeddings of shape (N, D), got {embeddings.shape}")

    if normalize:
        embeddings = F.normalize(embeddings, dim=-1)

    mean_emb = embeddings.mean(dim=0)

    if normalize:
        mean_emb = F.normalize(mean_emb, dim=-1)

    return mean_emb

In [4]:
_CLAP_MODEL = None


def clap_model():
    global _CLAP_MODEL
    if _CLAP_MODEL is not None:
        return _CLAP_MODEL

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = CLAP_Module(enable_fusion=False)

    weights_url = "https://huggingface.co/lukewys/laion_clap/resolve/main/music_audioset_epoch_15_esc_90.14.pt"

    try:
        state_dict = torch.hub.load_state_dict_from_url(
            weights_url, map_location=device, weights_only=False
        )
        model.load_state_dict(state_dict, strict=False)
    except Exception as e:
        logger.error(f"Error loading model weights from {weights_url}: {e}")
        raise

    model.to(device)
    model.eval()

    logger.info(f"Modelo CLAP cargado correctamente en {device}")
    _CLAP_MODEL = model
    return model


def get_text_embeddings_in_batches(descriptors: list[str], batch_size: int = 16):
    model = clap_model()
    embeddings = []
    with torch.no_grad():
        for pos in range(0, len(descriptors), batch_size):
            batch = descriptors[pos : pos + batch_size]
            emb = F.normalize(
                model.get_text_embedding(batch, use_tensor=True), p=2, dim=-1
            )
            for text, embedding in zip(batch, emb.cpu()):
                embeddings.append({"text": text, "embedding": embedding.tolist()})

    return embeddings


def concat_salient_words(word_pos_map: dict[str, str]) -> str:
    return " ".join(word_pos_map.keys())


def load_spanio_captions() -> list[str]:
    file_path = "/kaggle/input/spanio/spanio_captions.json"
    with open(file_path, "r") as f:
        captions = [val for key, val in json.load(f).items()]

    return [concat_salient_words(caption) for caption in captions]

In [15]:
import csv


def load_only_audio_captions() -> list[str]:
    file_path = "/kaggle/input/audiocaps/only_audio_caps.csv"
    caps: list[str] = []

    with open(file_path, "r") as f:
        csv_reader = csv.reader(f)
        caps = [row[0] for row in csv_reader]

    return caps

In [20]:
captions_ = load_spanio_captions()
caption_embeddings_map = get_text_embeddings_in_batches(captions_)

pd.DataFrame(caption_embeddings_map).to_csv(
    "spanio_captions_embeddings.csv", index=False
)

In [19]:
audio_caps = load_only_audio_captions()

audio_caps_map = get_text_embeddings_in_batches(audio_caps)
pd.DataFrame(audio_caps_map).to_csv("audio_caps_embeddings.csv", index=False)

In [11]:
model = clap_model()

sent_emb = model.get_text_embedding(["sweet"], use_tensor=True)
sent_emb = F.normalize(sent_emb, dim=-1)

print(torch.norm(sent_emb, dim=-1))
emb = sent_emb.cpu().tolist()
print(emb)

pd.DataFrame(
    [
        emb,
    ]
).to_csv("simple_embeddings.csv")

tensor([1.], device='cuda:0', grad_fn=<LinalgVectorNormBackward0>)
[[-0.04226687550544739, 0.043931882828474045, -0.03497587889432907, -0.07991821318864822, -0.02514909766614437, -0.04352453723549843, 0.04772472381591797, 0.008386715315282345, -0.03981844335794449, -0.01233937032520771, 0.015822062268853188, -0.010329811833798885, -0.05462081357836723, -0.009628348983824253, 0.07402922958135605, 0.022557662799954414, -0.06276911497116089, -0.046550288796424866, -0.029799574986100197, 0.012104989029467106, 0.010932580567896366, 0.03845226764678955, -0.0008145154570229352, -0.011602257378399372, 0.06695506721735, 0.0196172334253788, 0.022638583555817604, -0.03012833371758461, 0.04452405497431755, -0.000682803918607533, 0.08728726208209991, 0.01573316939175129, -0.058674298226833344, -0.07345812767744064, 0.03934356942772865, 0.002483195159584284, -0.03970842435956001, -0.04915536567568779, 0.009294159710407257, -0.0019443753408268094, 0.04773446172475815, -0.040968600660562515, 0.0032566

In [None]:
# model = clap_model()

# emb_list = [
#     # model.get_text_embedding(["sweet"], use_tensor=True),
#     # model.get_text_embedding(["melancholic"], use_tensor=True),
#     # model.get_text_embedding(["harp"], use_tensor=True),
#     # model.get_text_embedding(["sweet"], use_tensor=True),
#     # model.get_text_embedding(["piece"], use_tensor=True),
# ]

# # Stack into a single tensor
# embeddings = torch.stack(emb_list)  # shape (4, D)

# # Compute averaged embedding
# avg_emb = average_embeddings(embeddings)

# # Cosine similarity between the averaged and sentence embeddings
# similarity = F.cosine_similarity(avg_emb, sent_emb).item()
# print(f"Cosine similarity: {similarity:.4f}")