In [1]:
# Create .jsonl from the extracted features, make a train/test split, and save in the right place.

# set the following variable to True if you want to see a progress bar instead of the printed results:
use_tqdm = False

import os
import json
import random
import librosa
from pydub import AudioSegment
import wave
import re

from functools import partial
from tqdm import tqdm
import numpy as np
tqdm = partial(tqdm, position=0, leave=True)

In [2]:
import pandas as pd
train_labels = pd.read_csv("train_labels.csv")
test_labels = pd.read_csv("test_labels.csv")

In [6]:
# make sure the .jsonl has a place to go
os.makedirs("content/audiocraft/egs/train_small", exist_ok=True)


train_len = 0

dataset_path = "data/wav_files/wav-48/"

with open("content/audiocraft/egs/train_small/data.jsonl", "w") as train_file:
    for filename, caption in tqdm(zip(train_labels["ytid"], train_labels["caption"])):

        # get key and BPM
        y, sr = librosa.load(os.path.join(dataset_path, f"{filename}.wav"))
        tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
        chroma = librosa.feature.chroma_stft(y=y, sr=sr)
        key = np.argmax(np.sum(chroma, axis=1))
        key = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'][key]
        length = librosa.get_duration(y=y, sr=sr)

        # populate json
        entry = {
            "key": f"{key}",
            "artist": "",
            "sample_rate": 44100,
            "file_extension": "wav",
            "description": caption,
            "keywords": "",
            "duration": length,
            "bpm": "",
            "genre": "",
            "title": "",
            "name": "",
            "instrument": "",
            "moods": "",
            "path": os.path.join(dataset_path, f"{filename}.wav"),
        }
#         print(entry)

        train_len += 1
        train_file.write(json.dumps(entry) + '\n')
        
        if train_len == 5:
            break

print(train_len)

4it [00:20,  5.09s/it]

5





In [8]:
os.makedirs("content/audiocraft/egs/eval_small", exist_ok=True)
eval_len = 0
with open("content/audiocraft/egs/eval_small/data.jsonl", "w") as eval_file:
    for filename, caption in tqdm(zip(test_labels["ytid"], test_labels["caption"])):

        # get key and BPM
        y, sr = librosa.load(os.path.join(dataset_path, f"{filename}.wav"))
        tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
        chroma = librosa.feature.chroma_stft(y=y, sr=sr)
        key = np.argmax(np.sum(chroma, axis=1))
        key = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'][key]
        length = librosa.get_duration(y=y, sr=sr)

        # populate json
        entry = {
            "key": f"{key}",
            "artist": "",
            "sample_rate": 44100,
            "file_extension": "wav",
            "description": caption,
            "keywords": "",
            "duration": length,
            "bpm": "",
            "genre": "",
            "title": "",
            "name": "",
            "instrument": "",
            "moods": "",
            "path": os.path.join(dataset_path, f"{filename}.wav"),
        }
#         print(entry)

        eval_len += 1
        eval_file.write(json.dumps(entry) + '\n')
        if eval_len == 5:
            break

print(eval_len)

4it [00:00,  5.86it/s]

5





In [26]:
# clear cuda mem for finetuning
from numba import cuda
device = cuda.get_current_device()
device.reset()

In [None]:
%env USER=yimt
# CHANGE THIS

!

command = (
    "HYDRA_FULL_ERROR=1 AUDIOCRAFT_DORA_DIR=other_dir dora -P audiocraft run"
    " solver=musicgen/musicgen_base_32khz"
    " model/lm/model_scale=small"
    " continue_from=//pretrained/facebook/musicgen-small"
    " conditioner=text2music"
    " dset=audio/finetune"
    " dataset.num_workers=2"
    " dataset.valid.num_samples=1"
    " dataset.batch_size=1" # CHANGE THIS
    " schedule.cosine.warmup=8"
    " optim.optimizer=adamw" # uses dadaw by default, which is worse for single-gpu runs
    " optim.lr=1e-4"
    " optim.epochs=5" # stops training after 5 epochs- change this
    " optim.updates_per_epoch=1000" # 2000 by default, change this if you want checkpoints quicker ig
    " optim.adam.weight_decay=0.01"
    " generate.lm.prompted_samples=False" # skip super long generate step
    " generate.lm.gen_gt_samples=True"
    " +output_dir=dora_output"
)

!{command}

env: USER=yimt
Dora directory: other_dir
See https://hydra.cc/docs/1.2/upgrades/1.1_to_1.2/changes_to_job_working_dir/ for more information.
  ret = run_job(
[[36m12-03 23:56:03[0m][[34mdora.distrib[0m][[32mINFO[0m] - world_size is 1, skipping init.[0m
[[36m12-03 23:56:03[0m][[34mflashy.solver[0m][[32mINFO[0m] - Instantiating solver MusicGenSolver for XP 1894a354[0m
[[36m12-03 23:56:03[0m][[34mflashy.solver[0m][[32mINFO[0m] - All XP logs are stored in /home/thomasyim/cs229-music/other_dir/xps/1894a354[0m
[[36m12-03 23:56:03[0m][[34maudiocraft.solvers.builders[0m][[32mINFO[0m] - Loading audio data split evaluate: /home/thomasyim/cs229-music/content/audiocraft/egs/eval[0m
[[36m12-03 23:56:03[0m][[34maudiocraft.solvers.builders[0m][[32mINFO[0m] - Loading audio data split generate: /home/thomasyim/cs229-music/content/audiocraft/egs/train[0m
[[36m12-03 23:56:04[0m][[34maudiocraft.solvers.builders[0m][[32mINFO[0m] - Loading audio data split train: /ho

In [None]:
!ls

In [4]:
from audiocraft.utils import export
from audiocraft import train
export.export_pretrained_compression_model('facebook/encodec_32khz', '/checkpoints/my_audio_lm/compression_state_dict.bin')


Dora directory: /tmp/audiocraft_yimt


PermissionError: [Errno 13] Permission denied: '/checkpoints'

In [14]:
!export AUDIOCRAFT_DORA_DIR="other_dir"

In [1]:
!echo $AUDIOCRAFT_DORA_DIR




In [None]:
from audiocraft.utils import export
from audiocraft import train
xp = train.main.get_xp_from_sig('0bd6914e')
print(xp.folder)
# export.export_lm(xp.folder / 'checkpoint.th', '/checkpoints/my_audio_lm/state_dict.bin')
# # You also need to bundle the EnCodec model you used !!
# ## Case 1) you trained your own
# xp_encodec = train.main.get_xp_from_sig('SIG_OF_ENCODEC')
# export.export_encodec(xp_encodec.folder / 'checkpoint.th', '/checkpoints/my_audio_lm/compression_state_dict.bin')
# ## Case 2) you used a pretrained model. Give the name you used without the //pretrained/ prefix.
# ## This will actually not dump the actual model, simply a pointer to the right model to download.
# export.export_pretrained_compression_model('facebook/encodec_32khz', '/checkpoints/my_audio_lm/compression_state_dict.bin')
