In [None]:
!pip install pydub

In [None]:
# Install ffmpeg (pinned below version 5) from the conda-forge channel without prompting.
# NOTE(review): presumably required by pydub to decode the .mp3/.m4a inputs — confirm.
!conda install --yes "ffmpeg<5" -c conda-forge

In [None]:
!python3 -m pip install -U git+https://github.com/facebookresearch/audiocraft#egg=audiocraft

In [None]:
import os
import librosa

from pydub import AudioSegment

In [None]:
# Folder names
input_folder_name = "raw"
output_folder_name = "output"
musicgen_trainer_dir = "musicgen_trainer_dir"

# Create every working folder that is still missing and report each one created.
for folder_name in (input_folder_name, output_folder_name, musicgen_trainer_dir):
    if os.path.exists(folder_name):
        continue
    os.makedirs(folder_name)
    print(f"'{folder_name}' folder created.")

In [None]:
def delete_files_in_directory(directory_path):
    """Remove every regular file directly inside ``directory_path``.

    Sub-directories (and anything else that is not a file) are left alone;
    the function does not recurse. Each removed path is printed.

    Args:
        directory_path: Path of the directory whose files should be deleted.
    """
    for entry_name in os.listdir(directory_path):
        candidate = os.path.join(directory_path, entry_name)
        if not os.path.isfile(candidate):
            continue
        os.remove(candidate)
        print(f"Deleted: {candidate}")

In [None]:
# Uncomment when you need to delete all files in the output directory
# delete_files_in_directory(output_folder_name)

In [None]:
def process_audio(file_path, output_dir, counter, segment_length=30):
    """Split one audio file into fixed-length WAV segments with caption files.

    The file is decoded with pydub, its frame rate is set to 32000 Hz, and it
    is cut into consecutive windows of ``segment_length`` seconds. The final
    window is shifted back so that it ends exactly at the end of the audio
    (it may therefore overlap the previous window). Every window is exported
    as ``segment_NNN.wav`` next to a ``segment_NNN.txt`` whose content is the
    source file's base name without extension (used as the caption).

    Args:
        file_path: Path of the audio file to split.
        output_dir: Existing directory that receives the .wav/.txt pairs.
        counter: Index used for the first segment written by this call.
        segment_length: Window length in seconds; must be positive.

    Returns:
        The next unused segment index (``counter`` plus the number of
        segments written), to be passed to the following call.

    Raises:
        ValueError: If ``segment_length`` is not positive.
    """
    segment_length_ms = int(segment_length * 1000)
    if segment_length_ms <= 0:
        raise ValueError(f"segment_length must be positive, got {segment_length!r}")

    audio = AudioSegment.from_file(file_path).set_frame_rate(32000)
    file_name = os.path.splitext(os.path.basename(file_path))[0]
    total_ms = len(audio)
    # Ceiling division: a trailing partial window still counts as a segment.
    num_segments = (total_ms + segment_length_ms - 1) // segment_length_ms

    current_count = counter
    for i in range(num_segments):
        start_time = i * segment_length_ms

        if i == num_segments - 1:
            # Shift the last window back so it ends at the end of the audio.
            # Clamp at 0: for a clip shorter than one segment the unclamped
            # value is negative, which is not a valid start position.
            start_time = max(0, total_ms - segment_length_ms)

        end_time = min(start_time + segment_length_ms, total_ms)

        stem = os.path.join(output_dir, f'segment_{current_count:03d}')
        audio[start_time:end_time].export(f'{stem}.wav', format='wav')

        # Save the caption using file name
        with open(f'{stem}.txt', 'w') as f:
            f.write(file_name)

        current_count += 1

    print(f"processed {file_path}.")

    return current_count

In [None]:
# Process the input files
ctr = 0

# os.listdir returns entries in arbitrary order; sorting keeps the
# segment numbering identical from one run to the next.
for file_name in sorted(os.listdir(input_folder_name)):
    if file_name.endswith((".mp3", ".m4a")):
        file_path = os.path.join(input_folder_name, file_name)
        # process_audio returns the next free index, so numbering continues across files.
        ctr = process_audio(file_path, output_folder_name, ctr, segment_length=30)

In [None]:
# Test uniqueness of tag list
# The tag of a file is whatever follows the last comma in its name (extension removed).
tag_list = [
    os.path.splitext(os.path.basename(audio_name))[0].split(",")[-1]
    for audio_name in os.listdir(input_folder_name)
    if audio_name.endswith((".mp3", ".m4a"))
]

# Number of distinct tags vs. total number of tags; equal values mean no duplicates.
print(len(set(tag_list)), len(tag_list))

In [None]:
# Test the processed .wav files
# Expected length of every segment: 32000 samples per second for 30 seconds.
expected_num_samples = 32000 * 30

# The segments live in `output_folder_name`, the folder defined in the
# configuration cell; no `output_directory` variable exists in this notebook.
for file_name in os.listdir(output_folder_name):
    if not file_name.endswith('.wav'):
        continue
    file_path = os.path.join(output_folder_name, file_name)
    # sr=None keeps the file's own sample rate instead of resampling on load.
    audio, sample_rate = librosa.load(file_path, sr=None)
    if audio.shape[0] != expected_num_samples:
        print(f"{file_name} does not have the correct shape. Actual shape: {audio.shape[0]}")

## Training

In [None]:
# Launch the trainer script on the segmented dataset; `output` is the same
# folder name as `output_folder_name` in the configuration cell.
# NOTE(review): the folder created earlier is "musicgen_trainer_dir", while this
# command runs "musicgen_trainer/run.py" — confirm where the trainer is checked out.
!python musicgen_trainer/run.py --dataset_path output

In [None]:
from audiocraft.models import musicgen
from audiocraft.utils.notebook import display_audio
import torch

In [None]:
# Prefer the GPU but fall back to the CPU so the cell does not fail on
# machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

model = musicgen.MusicGen.get_pretrained("small", device=device)

model.set_generation_params(duration=30)

# Replace the pretrained language-model weights with the fine-tuned checkpoint.
# map_location remaps tensors saved from a GPU onto the device selected above.
model.lm.load_state_dict(torch.load("models/lm_final.pt", map_location=device))

In [None]:
# Generate 4 samples by random (unconditional generation, no prompt is passed)
res = model.generate_unconditional(4)
# Play back at the model's own sample rate instead of a hard-coded 32000,
# matching how the samples are written to disk.
display_audio(res, model.sample_rate)

In [None]:
from audiocraft.data.audio import audio_write

# Write each generated sample to disk, using its index as the file stem.
for idx, one_wav in enumerate(res):
    # Will save under {idx}.wav, with loudness normalization at -14 db LUFS.
    audio_write(
        str(idx),
        one_wav.cpu(),  # move the tensor to host memory before writing
        model.sample_rate,
        strategy="loudness",
        loudness_compressor=True,
    )