In [None]:
import os
import openai
import pandas as pd

In [None]:
from datasets import load_dataset

# Load the "train" split of the `lewtun/music_genres` dataset.
# NOTE(review): the recorded output of the "add metadata" cell further down reports
# 100 examples, so the run that produced it presumably worked on a subset selected
# in a cell that is not part of this notebook any more — confirm before re-running.
dataset = load_dataset("lewtun/music_genres", split="train")

In [None]:
from demucs import pretrained
from demucs.apply import apply_model
from demucs.audio import convert_audio
from datasets import Audio
import torch

# Name of the dataset column that holds the audio; used by the stem-filtering code below.
audio_column_name = "audio"

# Load the pretrained "htdemucs" model and move it to the first GPU when one is visible.
demucs = pretrained.get_model("htdemucs")
if torch.cuda.device_count() != 0:
    demucs.to("cuda:0")

def wrap_audio(audio, sr):
    """Package a waveform tensor in the dict layout stored in the audio column.

    Args:
        audio: tensor holding the samples; it is moved to the CPU and converted
            to a NumPy array.
        sr: sampling rate to store next to the samples.

    Returns:
        A dict with the keys ``"array"`` and ``"sampling_rate"``.
    """
    samples = audio.cpu().numpy()
    return dict(array=samples, sampling_rate=sr)

def filter_stems(batch, rank=None):
    """Run Demucs on a batch of clips and keep a mono mix of all but the last stem.

    Written for ``dataset.map(..., batched=True, with_rank=True)``. It relies on the
    module-level ``demucs`` model, ``audio_column_name`` and ``wrap_audio``.

    Args:
        batch: mapping from column name to a list of values. Every entry of the
            audio column must provide ``"array"`` and ``"sampling_rate"``.
        rank: worker rank supplied by ``map`` because of ``with_rank=True``. It is
            not used: all work goes to ``cuda:0`` when a GPU is visible, else the CPU.

    Returns:
        The same ``batch`` object, with the audio column replaced by dicts built
        with ``wrap_audio`` at ``demucs.samplerate``.
    """
    device = "cpu" if torch.cuda.device_count() == 0 else "cuda:0"

    # Read from the same column that is written back below, so the function keeps
    # working if ``audio_column_name`` is ever changed.
    wavs = [
        convert_audio(
            # ``[None]`` adds a leading axis (assumes a 1-D mono array — TODO confirm);
            # the tensor is created directly as float32 on the target device.
            torch.tensor(
                audio["array"][None], dtype=torch.float32, device=device
            ),
            audio["sampling_rate"],
            demucs.samplerate,
            demucs.audio_channels,
        ).T
        for audio in batch[audio_column_name]
    ]
    # Remember the un-padded length of every clip so the padding can be cut off again.
    wavs_length = [audio.shape[0] for audio in wavs]

    # Zero-pad along the first axis to a common length, then swap the last two axes
    # back before handing the batch to the model.
    wavs = torch.nn.utils.rnn.pad_sequence(
        wavs, batch_first=True, padding_value=0.0
    ).transpose(1, 2)
    stems = apply_model(demucs, wavs)

    # Per clip: drop the last separated source (presumably the vocals stem for
    # htdemucs — confirm against ``demucs.sources``), trim the padding, sum the
    # remaining sources and average over the next axis (presumably channels).
    batch[audio_column_name] = [
        wrap_audio(s[:-1, :, :length].sum(0).mean(0), demucs.samplerate)
        for (s, length) in zip(stems, wavs_length)
    ]

    return batch

# Number of worker processes used by ``map``.
num_proc = 1

# Separate stems batch by batch, then declare the rewritten column as an Audio feature.
dataset = dataset.map(
    filter_stems,
    with_rank=True,
    batched=True,
    batch_size=8,
    num_proc=num_proc,
).cast_column(audio_column_name, Audio())

# The separation model is not used after this point; drop the reference to it.
del demucs

In [11]:
# Inspired by https://github.com/lyramakesmusic/finetune-musicgen/blob/main/Finetune_Musicgen.ipynb

# Candidate genre labels, each written as "<genre>, <style>".
# The strings are embedded as text by CLAP and the entry most similar to a clip's
# audio embedding becomes that clip's genre tag in `enrich_text`, so the exact
# spelling of every entry is part of the program's behaviour.
genre_labels = [
    "Blues, Boogie Woogie",
    "Blues, Chicago Blues",
    "Blues, Country Blues",
    "Blues, Delta Blues",
    "Blues, Electric Blues",
    "Blues, Harmonica Blues",
    "Blues, Jump Blues",
    "Blues, Louisiana Blues",
    "Blues, Modern Electric Blues",
    "Blues, Piano Blues",
    "Blues, Rhythm & Blues",
    "Blues, Texas Blues",
    "Brass & Military, Brass Band",
    "Brass & Military, Marches",
    "Brass & Military, Military",
    "Children's, Educational",
    "Children's, Nursery Rhymes",
    "Children's, Story",
    "Classical, Baroque",
    "Classical, Choral",
    "Classical, Classical",
    "Classical, Contemporary",
    "Classical, Impressionist",
    "Classical, Medieval",
    "Classical, Modern",
    "Classical, Neo-Classical",
    "Classical, Neo-Romantic",
    "Classical, Opera",
    "Classical, Post-Modern",
    "Classical, Renaissance",
    "Classical, Romantic",
    "Electronic, Abstract",
    "Electronic, Acid",
    "Electronic, Acid House",
    "Electronic, Acid Jazz",
    "Electronic, Ambient",
    "Electronic, Bassline",
    "Electronic, Beatdown",
    "Electronic, Berlin-School",
    "Electronic, Big Beat",
    "Electronic, Bleep",
    "Electronic, Breakbeat",
    "Electronic, Breakcore",
    "Electronic, Breaks",
    "Electronic, Broken Beat",
    "Electronic, Chillwave",
    "Electronic, Chiptune",
    "Electronic, Dance-pop",
    "Electronic, Dark Ambient",
    "Electronic, Darkwave",
    "Electronic, Deep House",
    "Electronic, Deep Techno",
    "Electronic, Disco",
    "Electronic, Disco Polo",
    "Electronic, Donk",
    "Electronic, Downtempo",
    "Electronic, Drone",
    "Electronic, Drum n Bass",
    "Electronic, Dub",
    "Electronic, Dub Techno",
    "Electronic, Dubstep",
    "Electronic, Dungeon Synth",
    "Electronic, EBM",
    "Electronic, Electro",
    "Electronic, Electro House",
    "Electronic, Electroclash",
    "Electronic, Euro House",
    "Electronic, Euro-Disco",
    "Electronic, Eurobeat",
    "Electronic, Eurodance",
    "Electronic, Experimental",
    "Electronic, Freestyle",
    "Electronic, Future Jazz",
    "Electronic, Gabber",
    "Electronic, Garage House",
    "Electronic, Ghetto",
    "Electronic, Ghetto House",
    "Electronic, Glitch",
    "Electronic, Goa Trance",
    "Electronic, Grime",
    "Electronic, Halftime",
    "Electronic, Hands Up",
    "Electronic, Happy Hardcore",
    "Electronic, Hard House",
    "Electronic, Hard Techno",
    "Electronic, Hard Trance",
    "Electronic, Hardcore",
    "Electronic, Hardstyle",
    "Electronic, Hi NRG",
    "Electronic, Hip Hop",
    "Electronic, Hip-House",
    "Electronic, House",
    "Electronic, IDM",
    "Electronic, Illbient",
    "Electronic, Industrial",
    "Electronic, Italo House",
    "Electronic, Italo-Disco",
    "Electronic, Italodance",
    "Electronic, Jazzdance",
    "Electronic, Juke",
    "Electronic, Jumpstyle",
    "Electronic, Jungle",
    "Electronic, Latin",
    "Electronic, Leftfield",
    "Electronic, Makina",
    "Electronic, Minimal",
    "Electronic, Minimal Techno",
    "Electronic, Modern Classical",
    "Electronic, Musique Concrète",
    "Electronic, Neofolk",
    "Electronic, New Age",
    "Electronic, New Beat",
    "Electronic, New Wave",
    "Electronic, Noise",
    "Electronic, Nu-Disco",
    "Electronic, Power Electronics",
    "Electronic, Progressive Breaks",
    "Electronic, Progressive House",
    "Electronic, Progressive Trance",
    "Electronic, Psy-Trance",
    "Electronic, Rhythmic Noise",
    "Electronic, Schranz",
    "Electronic, Sound Collage",
    "Electronic, Speed Garage",
    "Electronic, Speedcore",
    "Electronic, Synth-pop",
    "Electronic, Synthwave",
    "Electronic, Tech House",
    "Electronic, Tech Trance",
    "Electronic, Techno",
    "Electronic, Trance",
    "Electronic, Tribal",
    "Electronic, Tribal House",
    "Electronic, Trip Hop",
    "Electronic, Tropical House",
    "Electronic, UK Garage",
    "Electronic, Vaporwave",
    "Folk, World, & Country, African",
    "Folk, World, & Country, Bluegrass",
    "Folk, World, & Country, Cajun",
    "Folk, World, & Country, Canzone Napoletana",
    "Folk, World, & Country, Catalan Music",
    "Folk, World, & Country, Celtic",
    "Folk, World, & Country, Country",
    "Folk, World, & Country, Fado",
    "Folk, World, & Country, Flamenco",
    "Folk, World, & Country, Folk",
    "Folk, World, & Country, Gospel",
    "Folk, World, & Country, Highlife",
    "Folk, World, & Country, Hillbilly",
    "Folk, World, & Country, Hindustani",
    "Folk, World, & Country, Honky Tonk",
    "Folk, World, & Country, Indian Classical",
    "Folk, World, & Country, Laïkó",
    "Folk, World, & Country, Nordic",
    "Folk, World, & Country, Pacific",
    "Folk, World, & Country, Polka",
    "Folk, World, & Country, Raï",
    "Folk, World, & Country, Romani",
    "Folk, World, & Country, Soukous",
    "Folk, World, & Country, Séga",
    "Folk, World, & Country, Volksmusik",
    "Folk, World, & Country, Zouk",
    "Folk, World, & Country, Éntekhno",
    "Funk / Soul, Afrobeat",
    "Funk / Soul, Boogie",
    "Funk / Soul, Contemporary R&B",
    "Funk / Soul, Disco",
    "Funk / Soul, Free Funk",
    "Funk / Soul, Funk",
    "Funk / Soul, Gospel",
    "Funk / Soul, Neo Soul",
    "Funk / Soul, New Jack Swing",
    "Funk / Soul, P.Funk",
    "Funk / Soul, Psychedelic",
    "Funk / Soul, Rhythm & Blues",
    "Funk / Soul, Soul",
    "Funk / Soul, Swingbeat",
    "Funk / Soul, UK Street Soul",
    "Hip Hop, Bass Music",
    "Hip Hop, Boom Bap",
    "Hip Hop, Bounce",
    "Hip Hop, Britcore",
    "Hip Hop, Cloud Rap",
    "Hip Hop, Conscious",
    "Hip Hop, Crunk",
    "Hip Hop, Cut-up/DJ",
    "Hip Hop, DJ Battle Tool",
    "Hip Hop, Electro",
    "Hip Hop, G-Funk",
    "Hip Hop, Gangsta",
    "Hip Hop, Grime",
    "Hip Hop, Hardcore Hip-Hop",
    "Hip Hop, Horrorcore",
    "Hip Hop, Instrumental",
    "Hip Hop, Jazzy Hip-Hop",
    "Hip Hop, Miami Bass",
    "Hip Hop, Pop Rap",
    "Hip Hop, Ragga HipHop",
    "Hip Hop, RnB/Swing",
    "Hip Hop, Screw",
    "Hip Hop, Thug Rap",
    "Hip Hop, Trap",
    "Hip Hop, Trip Hop",
    "Hip Hop, Turntablism",
    "Jazz, Afro-Cuban Jazz",
    "Jazz, Afrobeat",
    "Jazz, Avant-garde Jazz",
    "Jazz, Big Band",
    "Jazz, Bop",
    "Jazz, Bossa Nova",
    "Jazz, Contemporary Jazz",
    "Jazz, Cool Jazz",
    "Jazz, Dixieland",
    "Jazz, Easy Listening",
    "Jazz, Free Improvisation",
    "Jazz, Free Jazz",
    "Jazz, Fusion",
    "Jazz, Gypsy Jazz",
    "Jazz, Hard Bop",
    "Jazz, Jazz-Funk",
    "Jazz, Jazz-Rock",
    "Jazz, Latin Jazz",
    "Jazz, Modal",
    "Jazz, Post Bop",
    "Jazz, Ragtime",
    "Jazz, Smooth Jazz",
    "Jazz, Soul-Jazz",
    "Jazz, Space-Age",
    "Jazz, Swing",
    "Latin, Afro-Cuban",
    "Latin, Baião",
    "Latin, Batucada",
    "Latin, Beguine",
    "Latin, Bolero",
    "Latin, Boogaloo",
    "Latin, Bossanova",
    "Latin, Cha-Cha",
    "Latin, Charanga",
    "Latin, Compas",
    "Latin, Cubano",
    "Latin, Cumbia",
    "Latin, Descarga",
    "Latin, Forró",
    "Latin, Guaguancó",
    "Latin, Guajira",
    "Latin, Guaracha",
    "Latin, MPB",
    "Latin, Mambo",
    "Latin, Mariachi",
    "Latin, Merengue",
    "Latin, Norteño",
    "Latin, Nueva Cancion",
    "Latin, Pachanga",
    "Latin, Porro",
    "Latin, Ranchera",
    "Latin, Reggaeton",
    "Latin, Rumba",
    "Latin, Salsa",
    "Latin, Samba",
    "Latin, Son",
    "Latin, Son Montuno",
    "Latin, Tango",
    "Latin, Tejano",
    "Latin, Vallenato",
    "Non-Music, Audiobook",
    "Non-Music, Comedy",
    "Non-Music, Dialogue",
    "Non-Music, Education",
    "Non-Music, Field Recording",
    "Non-Music, Interview",
    "Non-Music, Monolog",
    "Non-Music, Poetry",
    "Non-Music, Political",
    "Non-Music, Promotional",
    "Non-Music, Radioplay",
    "Non-Music, Religious",
    "Non-Music, Spoken Word",
    "Pop, Ballad",
    "Pop, Bollywood",
    "Pop, Bubblegum",
    "Pop, Chanson",
    "Pop, City Pop",
    "Pop, Europop",
    "Pop, Indie Pop",
    "Pop, J-pop",
    "Pop, K-pop",
    "Pop, Kayōkyoku",
    "Pop, Light Music",
    "Pop, Music Hall",
    "Pop, Novelty",
    "Pop, Parody",
    "Pop, Schlager",
    "Pop, Vocal",
    "Reggae, Calypso",
    "Reggae, Dancehall",
    "Reggae, Dub",
    "Reggae, Lovers Rock",
    "Reggae, Ragga",
    "Reggae, Reggae",
    "Reggae, Reggae-Pop",
    "Reggae, Rocksteady",
    "Reggae, Roots Reggae",
    "Reggae, Ska",
    "Reggae, Soca",
    "Rock, AOR",
    "Rock, Acid Rock",
    "Rock, Acoustic",
    "Rock, Alternative Rock",
    "Rock, Arena Rock",
    "Rock, Art Rock",
    "Rock, Atmospheric Black Metal",
    "Rock, Avantgarde",
    "Rock, Beat",
    "Rock, Black Metal",
    "Rock, Blues Rock",
    "Rock, Brit Pop",
    "Rock, Classic Rock",
    "Rock, Coldwave",
    "Rock, Country Rock",
    "Rock, Crust",
    "Rock, Death Metal",
    "Rock, Deathcore",
    "Rock, Deathrock",
    "Rock, Depressive Black Metal",
    "Rock, Doo Wop",
    "Rock, Doom Metal",
    "Rock, Dream Pop",
    "Rock, Emo",
    "Rock, Ethereal",
    "Rock, Experimental",
    "Rock, Folk Metal",
    "Rock, Folk Rock",
    "Rock, Funeral Doom Metal",
    "Rock, Funk Metal",
    "Rock, Garage Rock",
    "Rock, Glam",
    "Rock, Goregrind",
    "Rock, Goth Rock",
    "Rock, Gothic Metal",
    "Rock, Grindcore",
    "Rock, Grunge",
    "Rock, Hard Rock",
    "Rock, Hardcore",
    "Rock, Heavy Metal",
    "Rock, Indie Rock",
    "Rock, Industrial",
    "Rock, Krautrock",
    "Rock, Lo-Fi",
    "Rock, Lounge",
    "Rock, Math Rock",
    "Rock, Melodic Death Metal",
    "Rock, Melodic Hardcore",
    "Rock, Metalcore",
    "Rock, Mod",
    "Rock, Neofolk",
    "Rock, New Wave",
    "Rock, No Wave",
    "Rock, Noise",
    "Rock, Noisecore",
    "Rock, Nu Metal",
    "Rock, Oi",
    "Rock, Parody",
    "Rock, Pop Punk",
    "Rock, Pop Rock",
    "Rock, Pornogrind",
    "Rock, Post Rock",
    "Rock, Post-Hardcore",
    "Rock, Post-Metal",
    "Rock, Post-Punk",
    "Rock, Power Metal",
    "Rock, Power Pop",
    "Rock, Power Violence",
    "Rock, Prog Rock",
    "Rock, Progressive Metal",
    "Rock, Psychedelic Rock",
    "Rock, Psychobilly",
    "Rock, Pub Rock",
    "Rock, Punk",
    "Rock, Rock & Roll",
    "Rock, Rockabilly",
    "Rock, Shoegaze",
    "Rock, Ska",
    "Rock, Sludge Metal",
    "Rock, Soft Rock",
    "Rock, Southern Rock",
    "Rock, Space Rock",
    "Rock, Speed Metal",
    "Rock, Stoner Rock",
    "Rock, Surf",
    "Rock, Symphonic Rock",
    "Rock, Technical Death Metal",
    "Rock, Thrash",
    "Rock, Twist",
    "Rock, Viking Metal",
    "Rock, Yé-Yé",
    "Stage & Screen, Musical",
    "Stage & Screen, Score",
    "Stage & Screen, Soundtrack",
    "Stage & Screen, Theme",
]
# Candidate mood / theme tags. They are embedded as text by CLAP, and the tag most
# similar to a clip's audio embedding becomes its mood in `enrich_text`.
mood_theme_classes = [
    "action",
    "adventure",
    "advertising",
    "background",
    "ballad",
    "calm",
    "children",
    "christmas",
    "commercial",
    "cool",
    "corporate",
    "dark",
    "deep",
    "documentary",
    "drama",
    "dramatic",
    "dream",
    "emotional",
    "energetic",
    "epic",
    "fast",
    "film",
    "fun",
    "funny",
    "game",
    "groovy",
    "happy",
    "heavy",
    "holiday",
    "hopeful",
    "inspiring",
    "love",
    "meditative",
    "melancholic",
    "melodic",
    "motivational",
    "movie",
    "nature",
    "party",
    "positive",
    "powerful",
    "relaxing",
    "retro",
    "romantic",
    "sad",
    "sexy",
    "slow",
    "soft",
    "soundscape",
    "space",
    "sport",
    "summer",
    "trailer",
    "travel",
    "upbeat",
    "uplifting",
]
# Candidate instrument tags. They are embedded as text by CLAP, and the tag most
# similar to a clip's audio embedding becomes its instrument in `enrich_text`.
# NOTE(review): multi-word names are written without spaces ("acousticbassguitar");
# these exact strings are what the text encoder sees — confirm this is intended.
instrument_classes = [
    "acapella",
    "accordion",
    "acousticbassguitar",
    "acousticguitar",
    "bass",
    "beat",
    "bell",
    "bongo",
    "brass",
    "cello",
    "clarinet",
    "classicalguitar",
    "computer",
    "doublebass",
    "drummachine",
    "drums",
    "electricguitar",
    "electricpiano",
    "flute",
    "guitar",
    "harmonica",
    "harp",
    "horn",
    "keyboard",
    "oboe",
    "orchestra",
    "organ",
    "pad",
    "percussion",
    "piano",
    "pipeorgan",
    "rhodes",
    "sampler",
    "saxophone",
    "strings",
    "synthesizer",
    "trombone",
    "trumpet",
    "viola",
    "violin",
    "voice",
]

In [13]:
from msclap import CLAP
import librosa
import tempfile
import torchaudio
import random
import numpy as np
import os

# Load CLAP once and pre-compute the text embedding of every candidate label, so
# that `enrich_text` only has to embed the audio of each clip.
# CUDA is requested only when a GPU is visible, using the same check as the Demucs
# code earlier in the notebook, instead of unconditionally.
clap_model = CLAP(version="2023", use_cuda=torch.cuda.device_count() > 0)
instrument_embeddings = clap_model.get_text_embeddings(instrument_classes)
genre_embeddings = clap_model.get_text_embeddings(genre_labels)
mood_embeddings = clap_model.get_text_embeddings(mood_theme_classes)

def _best_label(audio_embeddings, text_embeddings, labels):
    """Return the entry of ``labels`` whose text embedding best matches the first clip."""
    scores = clap_model.compute_similarity(audio_embeddings, text_embeddings)
    # Convert the 0-d index tensor to a plain int before indexing the Python list.
    return labels[int(scores.argmax(dim=1)[0])]


def enrich_text(batch):
    """Add a comma-separated ``"metadata"`` description to one audio example.

    Despite the parameter name, this handles a single example: it reads
    ``batch["audio"]["array"]`` directly and is applied with a non-batched ``map``.

    The description joins five tags in random order: the strongest chroma pitch
    class (used as the key), the tempo estimated by librosa, and the instrument,
    genre and mood labels whose CLAP text embeddings best match the audio.

    Args:
        batch: mapping with an ``"audio"`` entry that provides ``"array"`` and
            ``"sampling_rate"``.

    Returns:
        The same mapping with a ``"metadata"`` string added.
    """
    audio = batch["audio"]["array"]
    sampling_rate = batch["audio"]["sampling_rate"]

    tempo, _ = librosa.beat.beat_track(y=audio, sr=sampling_rate)
    # The tempo may come back as an array instead of a scalar; unwrap it if so.
    tempo = tempo.item() if isinstance(tempo, np.ndarray) else tempo
    tempo = f"{str(round(tempo))} bpm"

    # Key estimate: the pitch class with the largest chroma energy summed over time.
    chroma = librosa.feature.chroma_stft(y=audio, sr=sampling_rate)
    key = np.argmax(np.sum(chroma, axis=1))
    key = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"][key]

    # `get_audio_embeddings` is given file paths, so the clip is written to a
    # temporary WAV file first.
    with tempfile.TemporaryDirectory() as tempdir:
        path = os.path.join(tempdir, "tmp.wav")
        # Cast to float32 before saving: float64 input is not accepted for WAV
        # output by every torchaudio backend.
        waveform = torch.tensor(audio, dtype=torch.float32).unsqueeze(0)
        torchaudio.save(path, waveform, sampling_rate)
        audio_embeddings = clap_model.get_audio_embeddings([path])

    instrument = _best_label(
        audio_embeddings, instrument_embeddings, instrument_classes
    )
    genre = _best_label(audio_embeddings, genre_embeddings, genre_labels)
    mood = _best_label(audio_embeddings, mood_embeddings, mood_theme_classes)

    metadata = [key, tempo, instrument, genre, mood]

    # Shuffle so the tags do not always appear in the same order.
    random.shuffle(metadata)
    batch["metadata"] = ", ".join(metadata)
    return batch


In [15]:
# Show the first example to check the layout of the audio column and the other
# fields before the metadata is added.
print(dataset[0])

{'audio': {'path': None, 'array': array([-6.10351562e-05, -6.10351562e-05, -3.05175781e-05, ...,
       -3.96728516e-04, -1.52587891e-04, -4.88281250e-04]), 'sampling_rate': 44100}, 'song_id': 11483, 'genre_id': 9, 'genre': 'International'}


In [16]:
# Tag every example; `enrich_text` is applied one example at a time (non-batched map).
dataset = dataset.map(enrich_text, desc="add metadata")

# Inspect the first example, which now carries the added "metadata" field.
print(dataset[0])

# Read the "metadata" column directly. Iterating over whole rows would decode the
# audio of every example just to pick out one string field.
metadata_list = list(dataset["metadata"])
print(metadata_list)  # Displays all metadata entries in the dataset


add metadata:   0%|          | 0/100 [00:00<?, ? examples/s]

{'audio': {'path': None, 'array': array([-6.10351562e-05, -6.10351562e-05, -3.05175781e-05, ...,
       -3.96728516e-04, -1.52587891e-04, -4.88281250e-04]), 'sampling_rate': 44100}, 'song_id': 11483, 'genre_id': 9, 'genre': 'International', 'metadata': 'trombone, 117 bpm, Brass & Military, Marches, D, documentary'}
['trombone, 117 bpm, Brass & Military, Marches, D, documentary', 'sampler, documentary, Funk / Soul, Neo Soul, 161 bpm, G', '120 bpm, sampler, B, Funk / Soul, Swingbeat, documentary', 'groovy, F#, Funk / Soul, Swingbeat, acousticbassguitar, 178 bpm', '172 bpm, soundscape, Funk / Soul, Swingbeat, D, sampler', 'Funk / Soul, Neo Soul, D#, 71 bpm, documentary, sampler', 'documentary, 120 bpm, C#, acousticbassguitar, Stage & Screen, Soundtrack', 'Rock, Funeral Doom Metal, documentary, acousticguitar, E, 91 bpm', 'F, 129 bpm, sampler, Electronic, Acid House, groovy', 'Funk / Soul, Gospel, 167 bpm, A, adventure, acousticbassguitar', '115 bpm, sampler, C, Electronic, Breakbeat, docu

In [None]:
import pandas as pd

# Export the tabular fields to CSV.
# The audio column is dropped first: building a DataFrame from the full dataset
# decodes every clip into memory, and CSV can only hold a truncated text repr of
# each sample array, so that column would be large and unusable.
df = dataset.remove_columns(audio_column_name).to_pandas()

# Save as CSV
df.to_csv("audio_dataset_with_metadata.csv", index=False)

In [None]:
# Take the OpenAI key from the environment so it is never written into the notebook.
# The value is None when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [None]:
# Read back the table of examples and their "metadata" strings from the CSV export.
data = pd.read_csv("audio_dataset_with_metadata.csv")

# Define a function to generate captions using OpenAI API
def generate_caption(metadata):
    """Turn a keyword string into a short natural-language audio caption via OpenAI.

    Works with both the legacy ``openai.ChatCompletion`` interface (openai < 1.0)
    and the ``openai.chat.completions`` interface (openai >= 1.0), whichever the
    installed package provides.

    Args:
        metadata: comma-separated keywords, e.g. one value of the "metadata" column.

    Returns:
        The caption with surrounding whitespace removed. On any failure, including
        missing or empty ``metadata``, a string starting with
        ``"Error generating caption:"`` is returned instead of raising, so one bad
        row does not abort a whole ``apply`` run.
    """
    # Missing CSV cells are read by pandas as NaN (a float). Do not spend an API
    # call on "Keywords: nan." or on an empty keyword list.
    if not isinstance(metadata, str) or not metadata.strip():
        return "Error generating caption: metadata is missing or empty"

    prompt = (
        f"Keywords: {metadata}. "
        "Create an audio caption that merges these keywords into a meaningful description."
    )
    request = dict(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=50,
    )
    try:
        # Check for the >= 1.0 interface first: that release still exposes the name
        # `ChatCompletion`, but only as a stub that raises when it is used.
        if hasattr(openai, "chat"):
            completion = openai.chat.completions.create(**request)
        else:
            completion = openai.ChatCompletion.create(**request)

        # Legacy responses are dict-like; >= 1.0 responses expose attributes.
        message = completion.choices[0].message
        content = message["content"] if isinstance(message, dict) else message.content
        return content.strip()
    except Exception as e:
        return f"Error generating caption: {e}"

# Caption every row; `generate_caption` is called once per metadata value.
data['caption'] = data['metadata'].apply(generate_caption)

# Write the table, now including the "caption" column, to a new CSV file.
output_path = 'dataset_with_captions.csv'
data.to_csv(output_path, index=False)

# The closing quote of this f-string was missing, which made the whole cell a SyntaxError.
print(f"Captions generated and saved to {output_path}")
