In [None]:
! pip install openl3

Collecting openl3
  Using cached openl3-0.4.2.tar.gz (29 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting kapre>=0.3.5 (from openl3)
  Downloading kapre-0.3.7.tar.gz (26 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting resampy<0.3.0,>=0.2.1 (from openl3)
  Downloading resampy-0.2.2.tar.gz (323 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m323.4/323.4 kB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: openl3, kapre, resampy
  Building wheel for openl3 (setup.py) ... [?25l[?25hdone
  Created wheel for openl3: filename=openl3-0.4.2-py2.py3-none-any.whl size=249327030 sha256=08f3f923b520bb7d14a4868e0bff71838a0a0b1aaa5b049bb5ed8f5bfe55b4fe
  Stored in directory: /root/.cache/pip/wheels/35/e9/4c/b1e39385b21f2b4d70c01b8793ec

In [None]:
import os
from google.colab import drive
import librosa
import soundfile as sf
import numpy as np
import openl3
import json
from tqdm import tqdm

In [None]:
# Attach Google Drive to the Colab filesystem; the output folders used later live under it.
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import kagglehub

# Download the latest version of the MusicNet dataset from Kaggle.
# `path` receives the local location of the downloaded files and is the
# root from which the WAV directory is derived.
path = kagglehub.dataset_download("imsparsh/musicnet-dataset")

Downloading from https://www.kaggle.com/api/v1/datasets/download/imsparsh/musicnet-dataset?dataset_version_number=1...


100%|██████████| 21.5G/21.5G [16:23<00:00, 23.5MB/s]

Extracting files...





In [None]:
# Show where the dataset was placed on local disk.
print(f"Path to dataset files: {path}")

Path to dataset files: /root/.cache/kagglehub/datasets/imsparsh/musicnet-dataset/versions/1


In [None]:
# Root folder that is searched (recursively) for WAV recordings.
wav_dir = os.path.join(path, "musicnet")
print(f"WAV目录: {wav_dir}")

WAV目录: /root/.cache/kagglehub/datasets/imsparsh/musicnet-dataset/versions/1/musicnet


In [None]:
# Destinations on Google Drive: one folder for the per-segment embeddings
# (.npy) and one for the per-recording segment metadata (.json).
OUTPUT_EMBEDDING_DIR = "/content/drive/MyDrive/MuseGuard/musicnet_embeddings"
METADATA_DIR = "/content/drive/MyDrive/MuseGuard/musicnet_metadata"

# Create both folders up front; already-existing folders are left alone.
for _output_dir in (OUTPUT_EMBEDDING_DIR, METADATA_DIR):
    os.makedirs(_output_dir, exist_ok=True)

In [None]:
# Load the OpenL3 audio embedding model once so that it can be reused for
# every segment: mel256 input representation, "music" content type,
# 6144-dimensional embeddings.
model = openl3.models.load_audio_embedding_model(input_repr="mel256", content_type="music", embedding_size=6144)

In [None]:
def segment_audio(audio, sr, segment_length=10.0, stride=5.0):
    """Cut a 1-D audio signal into fixed-length, possibly overlapping windows.

    A window starts every ``stride`` seconds and spans ``segment_length``
    seconds (10 s windows with 5 s of overlap by default). A window that
    runs past the end of the signal is zero-padded to the full length.

    Window positions are derived from the window index rather than by
    repeatedly adding ``stride``, so floating-point error does not
    accumulate for strides such as 0.1.

    Args:
        audio: 1-D sequence of samples (e.g. a mono NumPy array).
        sr: Sampling rate of ``audio`` in Hz.
        segment_length: Window length in seconds; must be positive.
        stride: Distance between consecutive window starts in seconds;
            must be positive.

    Returns:
        A tuple ``(segments, metadata)`` of two parallel lists. Each
        metadata entry is a dict with the keys ``segment_id``,
        ``start_time``, ``end_time`` (clipped to the signal duration),
        ``duration`` (the nominal ``segment_length``), ``is_padded``,
        ``overlap_previous`` and ``overlap_next``. Both lists are empty
        for empty input.

    Raises:
        ValueError: If ``segment_length`` or ``stride`` is not positive, or
            if ``segment_length`` is shorter than one sample.
    """
    # A non-positive stride would never advance and loop forever.
    if segment_length <= 0 or stride <= 0:
        raise ValueError(
            f"segment_length and stride must be positive, "
            f"got segment_length={segment_length}, stride={stride}"
        )

    n_samples = len(audio)
    total_duration = n_samples / sr
    segment_samples = int(round(segment_length * sr))
    if segment_samples < 1:
        raise ValueError("segment_length is shorter than one sample")

    # Neighbouring windows only share audio when the stride is shorter
    # than the window itself.
    windows_overlap = bool(stride < segment_length)

    segments = []
    metadata = []
    segment_id = 0
    start_sample = 0
    while start_sample < n_samples:
        start = segment_id * stride
        end = start + segment_length
        end_sample = start_sample + segment_samples

        # The last window(s) may extend past the end of the recording;
        # fill the missing tail with zeros so every window has equal length.
        is_padded = end_sample > n_samples
        if is_padded:
            segment = np.pad(audio[start_sample:], (0, end_sample - n_samples))
        else:
            segment = audio[start_sample:end_sample]
        segments.append(segment)

        # A following window exists exactly when its start still lies
        # inside the recording.
        next_start_sample = int(round((segment_id + 1) * stride * sr))
        has_next = next_start_sample < n_samples

        metadata.append({
            "segment_id": segment_id,
            "start_time": float(start),
            "end_time": float(min(end, total_duration)),
            "duration": segment_length,
            "is_padded": is_padded,
            "overlap_previous": segment_id > 0 and windows_overlap,
            "overlap_next": has_next and windows_overlap,
        })

        segment_id += 1
        start_sample = next_start_sample
    return segments, metadata

In [None]:
# Walk the dataset tree; for every WAV recording write one JSON file with the
# segment metadata and one mean-pooled OpenL3 embedding (.npy) per segment.
for root, dirs, files in os.walk(wav_dir):
    # Sorting the sub-directory list in place makes os.walk descend in a
    # reproducible order regardless of the filesystem.
    dirs.sort()

    # Select the recordings first so the progress bar counts real work only;
    # directories without WAV files (e.g. the label folders) are skipped.
    wav_files = sorted(f for f in files if f.lower().endswith(".wav"))
    if not wav_files:
        continue

    for fname in tqdm(wav_files, desc=f"正在处理目录 {root}"):
        wav_path = os.path.join(root, fname)
        recording_id = os.path.splitext(fname)[0]

        # Load the recording resampled to 48 kHz.
        audio, sr = librosa.load(wav_path, sr=48000)

        # Split into 10 s segments (segment_audio defaults).
        segments, seg_metadata = segment_audio(audio, sr)

        # Persist the segment metadata for this recording.
        with open(os.path.join(METADATA_DIR, f"{recording_id}_metadata.json"), "w") as f:
            json.dump(seg_metadata, f, indent=2)

        # Extract one embedding per segment.
        for seg_id, segment in enumerate(segments):
            emb, _ = openl3.get_audio_embedding(
                segment,
                sr,
                model=model,
                center=False,
                hop_size=1.0,
                verbose=False
            )
            # Average the per-frame embeddings (one per second with
            # hop_size=1.0) into a single vector for the segment.
            aggregated_emb = np.mean(emb, axis=0)

            # Save the pooled embedding next to the others on Drive.
            np.save(
                os.path.join(OUTPUT_EMBEDDING_DIR, f"{recording_id}_seg{seg_id}.npy"),
                aggregated_emb
            )


正在处理目录 /root/.cache/kagglehub/datasets/imsparsh/musicnet-dataset/versions/1/musicnet: 0it [00:00, ?it/s]
正在处理目录 /root/.cache/kagglehub/datasets/imsparsh/musicnet-dataset/versions/1/musicnet/musicnet: 0it [00:00, ?it/s]
正在处理目录 /root/.cache/kagglehub/datasets/imsparsh/musicnet-dataset/versions/1/musicnet/musicnet/test_labels: 100%|██████████| 10/10 [00:00<00:00, 108100.62it/s]
正在处理目录 /root/.cache/kagglehub/datasets/imsparsh/musicnet-dataset/versions/1/musicnet/musicnet/test_data: 100%|██████████| 10/10 [03:50<00:00, 23.02s/it]
正在处理目录 /root/.cache/kagglehub/datasets/imsparsh/musicnet-dataset/versions/1/musicnet/musicnet/train_data: 100%|██████████| 320/320 [51:03<00:00,  9.57s/it]
正在处理目录 /root/.cache/kagglehub/datasets/imsparsh/musicnet-dataset/versions/1/musicnet/musicnet/train_labels: 100%|██████████| 320/320 [00:00<00:00, 1342177.28it/s]
