In [2]:
from transformers import MusicgenForConditionalGeneration
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")

  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)
Config of the text_encoder: <class 'transformers.models.t5.modeling_t5.T5EncoderModel'> is overwritten by shared text_encoder config: T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "leng

We can then place the model on our accelerator device (if available), or leave it on the CPU otherwise:

In [3]:
import torch

device = "cuda:0" if torch.cuda.is_available() else "cpu"
model.to(device);

In [None]:
print(device)

## Generation

MusicGen is compatible with two generation modes: greedy and sampling. In practice, sampling leads to significantly
better results than greedy, thus we encourage sampling mode to be used where possible. Sampling is enabled by default,
and can be explicitly specified by setting `do_sample=True` in the call to `MusicgenForConditionalGeneration.generate` (see below).

### Unconditional Generation

The inputs for unconditional (or 'null') generation can be obtained through the method `MusicgenForConditionalGeneration.get_unconditional_inputs`. We can then run auto-regressive generation using the `.generate` method, specifying `do_sample=True` to enable sampling mode:

In [None]:
from datasets import Dataset
import pandas as pd

In [None]:
df = pd.read_csv("data/musiccaps-train-data.csv")

In [None]:
import os
df = df[df["ytid"].apply(lambda x: os.path.exists(f"data/wav_files/wav-48/{x}.wav"))]

In [8]:
df.head

<bound method NDFrame.head of              ytid  start_s  end_s  \
0     -0Gj8-vB1q4       30     40   
1     -0SdAVK79lg       30     40   
2     -0vPFx-wRRI       30     40   
3     -0xzrMun0Rs       30     40   
4     -1LrH01Ei1w       30     40   
...           ...      ...    ...   
5516  zw5dkiklbhE       15     25   
5517  zwfo7wnXdjs       30     40   
5518  zx_vcwOsDO4       50     60   
5519  zyXa2tdBTGc       30     40   
5520  zzNdwF40ID8       70     80   

                               audioset_positive_labels  \
0                          /m/0140xf,/m/02cjck,/m/04rlf   
1     /m/0155w,/m/01lyv,/m/0342h,/m/042v_gx,/m/04rlf...   
2                                   /m/025_jnm,/m/04rlf   
3                                    /m/01g90h,/m/04rlf   
4                                   /m/02p0sh1,/m/04rlf   
...                                                 ...   
5516                                /m/01sm1g,/m/0l14md   
5517                      /m/02p0sh1,/m/04rlf,/m/06j6

In [9]:
from tqdm import tqdm
import os
from pydub import AudioSegment
dataset_path = f"data/wav_files/wav-48"
for filename in tqdm(os.listdir(dataset_path)):
    if filename.endswith(('.mp3', '.wav', '.flac')):

        # move original file out of the way
        audio = AudioSegment.from_file(f"{dataset_path}/{filename}")

        # resample
        audio = audio.set_frame_rate(44100)
        audio.export(filename, format="wav")

100%|██████████████████████████████████████████████████████████████████████████████████████| 5079/5079 [01:20<00:00, 63.49it/s]


In [9]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2)

In [10]:
train.to_csv("train_labels.csv")
test.to_csv("test_labels.csv")

In [11]:
train_labels = pd.read_csv("train_labels.csv")
test_labels = pd.read_csv("test_labels.csv")

In [12]:
data = {
    "audio": train_labels["ytid"],
    "text": train_labels["caption"]
}

In [13]:
dataset = Dataset.from_dict(data)

In [14]:
dataset

Dataset({
    features: ['audio', 'text'],
    num_rows: 4063
})

In [None]:
from transformers import AutoTokenizer, AutoFeatureExtractor
import torchaudio

tokenizer = AutoTokenizer.from_pretrained("facebook/musicgen-small")
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/musicgen-small")

def preprocess_function(examples):
    # Tokenize text
    tokens = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128, return_tensors="pt")
    # Load and preprocess audio
    print(examples["text"])
    print(examples["audio"])
    audio = [torchaudio.load(f"data/wav_files/wav-48/{path}.wav")[0] for path in [examples["audio"]]]
    return {
        "input_ids": tokens["input_ids"],
        "attention_mask": tokens["attention_mask"],
        "audio": audio  # Depending on the model's requirements, preprocess this further.
    }

processed_dataset = dataset.map(preprocess_function, batched=False)


Map:   0%|          | 0/4063 [00:00<?, ? examples/s]

This is a drum & bass electronic music piece. There is no singer. There is a strong and fuzzy feeling bass sound. The rhythmic background consists of a loud and high tempo electronic drum beat. The atmosphere is modern and energetic. This piece could be playing at a nightclub or a dance club.
FFQVVwFjy7s
The music excerpt features a male and a female voice singing the same melody in unison. Accompanying the singers is a sitar that plays the same note throughout - what is called a harmonic pedal. The atmosphere points towards a kind of meditation through repetitive singing, a kind of channeling state.
_h2rFVPCSPE
The low quality recording features a passionate female vocal, alongside harmonizing vocals, singing over wooden percussion, playing as the background music, while there are birds chirping, cows mooing and metallic impacts sounds. The recording is mono and noisy.
55qucu8Zs_I
This is a techno music piece played in the background of a car video. The sounds coming from the car such

This music is instrumental. The tempo is fast with a groovy didgeridoo harmony with percussion accompaniment. The music is upbeat, catchy, funky, unconventional, engaging, punchy and fun. This music is a Didgeridoo instrumental.
xCLzxuZE3yg
The music features a female voice singing with a reverb effect and out of tune. An acoustic guitar accompanies the singer with chords and arpeggios. In the background a synth sound that resembles an instrument with strings plays arpeggios.
v0tYHz5mk4I
This is a rap music piece played behind a rollerskating video. The sound of the skaters can be heard faintly throughout the recording. There is a male voice rapping at the forefront while other voices can be heard singing melodically in the background and ad-libbing occasionally. There is a mild keyboard playing the tune while a loud electronic drum beat is playing the rhythm. The atmosphere of this piece is groovy and urban.
_43OOP6UEw0
This song is recorded in poor quality. Two acoustic guitars are p

The low quality recording features a choir singing over punchy kick and snare hits, shimmering hi hats, groovy piano and groovy bass. In the first half, there are harmonizing female vocals singing, after which the male harmonizing vocals start singing. It sounds soulful, passionate and emotional. The recording is a bit noisy.
TokHdpvX7Es
This audio contains a catchy melody played by a brass instrument, maybe a saxophone and a keyboard. A horn is playing a bass in a lower register along with a drum. In the background you can hear people laughing. This is an amateur recording and of poor audio-quality.
JSqyTVjYY6k
This song contains an acoustic drum playing a simple rock groove with an e-bass playing a bassline along with the e-guitar strumming chords on the same notes as the bass. A deep male voice is singing, almost streaming sounding wild and loud. Then a synth pad comes in playing a melody panned to the left side of the speaker. This song may be playing at a rock festival.
AHmcuClSTL

The low quality recording features an acoustic guitar tuning tutorial. The recording is in mono and very noisy.
M27mIdPCZEY
The low quality recording features a groovy piano melody, mellow cello melody and brass melody. It sounds emotional and passionate, while the recording is noisy.
hBT0bbJl1dU
The Metal song features a manic solo electric guitar melody over shimmering hi hats, shimmering crash cymbals, double pedal kick hits, punchy snare, groovy distorted bass guitar and aggressive electric guitar riffs. It sounds aggressive, manic and energetic - like something you would jump to at the concerts.
OEuBITrf-kE
An ambient instrumental piece with no drums, big reverb and complex sound design elements. Tape hiss can be heard in the background and the vibe is hazy and warm with childlike wonder.
sgTZHSTnU40
This is a fast tempo song. The audio quality is inferior so the vocals are muddled making it difficult to comprehend if the vocalist is a male or female. The music is incoherent too b

A male vocalist sings this famous Beatle’s song in Country style. The tempo is medium with an auto harp accompaniment, groovy bass line, keyboard harmony, female vocal backup and sound of clapping and cheering indicating that this is a live performance . The song is cheerful, melodious, engaging, energetic,youthful and upbeat.
cTzJlLzN-xU
The low quality recording features a live performance of steel drums. It sounds exotic, passionate and emotional, even though the recording is noisy and has some audio popping sounds.
mwLrABPxvgA
This bollywood song features a harmonica playing the main melody. The harmonica is played with a tremolo effect at the end of the lines. This is accompanied by a percussion playing a simple beat, A violin repeats the last part of the melody as played by the harmonica. The bass plays the root note of the chords. This is an amateur recording and the rest of the instruments are not clearly audible. This song can be played in a happy movie.
EZb1wQsg6CU
A male sin

This is a hip-hop music piece. There is a duo of one male and one female vocal singing in a way that resembles rapping. The beat has an electric guitar and a bass guitar playing a groovy tune. The rhythmic background consists of an electronic drum beat. The atmosphere is urban. This piece could be playing in the soundtrack of a crime movie where the characters are doubting their actions.
GRdzFvQezUE
A male guitarist plays the guitar and speaks about technique in this online video tutorial. The male voice is strong and commanding, along with guitar string twang sounds, clearly demonstrating the technique. The audio quality is mediocre.
-5xOcMJpTUk
The song is an instrumental. The tempo is medium fast with a guitar playing lead with a guitar effects pedal producing a distorted grunge tone. The song is energetic and passionate with no other instrumentation. The song audio quality is very poor.
merGvga39Yo
The music features a mix between jazz and soul music. The instrumental is easygoing 

There is a low sound that reminds of a heartbeat rising in tempo before a splash-like metallic sound with reverb comes in and breaks the atmosphere in half. This audio may be used for a movie scene. This audio is in poor quality.
-SD43H5B5hE
Someone is playing a composition using a minor chord with a lot of reverb. In the background you can hear soft strings underlining the song then a high violin comes in. A female voice is singing softly and sensitively. You can hear her breathing and mouth noises in the recording. Her voice is also full of reverb. This song may be playing in an emotional and sensitive/sad movie-scene.
xSDkn9PtQm0
This audio contains someone playing a very low sounding bass flute like instrument. Playing a rhythmic melody with a percussive sound created with the mouth. This song may be playing during a live performance from a busker.
BiqPn3d_dKM
The low quality recording features a groovy buzzy synth bass with a filter effect on it, followed by punchy "4 on the floor

The low quality recording features an ambient song that consists of a mellow piano melody, short plucked melody and wide synth pad. It sounds calming and relaxing.
WQsuFvw43RA
This is an instructive flamenco guitar piece. The player is depicting how the piece must be played by starting it slow and eventually getting faster. The theme is repeated by the player.
r43WrKA6ppI
The low quality recording features a chill electric guitar melody. It sounds easygoing and relaxing. The recording is noisy and in mono.
D-PxXM2I5gY
A male singer sings this vocals with a vocoder. The song is medium tempo with a groovy techno drumming rhythm and keyboard accompaniment. The song is passionate but an amateurish production. The audio quality is very poor.
xL7Krm6FSWE
This song is an amateur recording of a person beatboxing. The sound is produced using the mouth. This features sounds of the kick, snare and hi-hat. There are no other instruments in this song. There is no vocal melody apart from the beatbox

This clip is an instrumental of two separate tracks. The tempo is medium with the first clip of ambient nature sounds during a rainfall like, thunder, lightning and water flowing followed by  a rhythmic clanging of metal. The second clip is sinister music on the synthesiser followed by the buzzing of an electric circuit. The audio is dark, sinister, anticipatory, scary, suspenseful and spooky.
0RgGrVklaao
The low quality recording features a soul song sung by passionate female vocalists over mellow piano chords, smooth bass, shimmering cymbals and punchy kick and snare hits. There are some crowd chattering sounds. It sounds emotional, passionate and heartfelt.
KzvdKLdBw3s
A male vocalist sings this energetic song. The tempo is fast emphatic vocals, bright acoustic guitar accompaniment and clap beat. The song is youthful, energetic, popular, cheerful, engaging and happy. The music is minimal with ambient sounds of people clapping, cheering, singing along and keeping beat with claps. The

Here we have a guitarist playing an intricate rock motif on the electric guitar. We hear some rapid plucking of a melody that sounds dark and mysterious. This is a live recording.
DV5mynb77JM
This amateur recording features a shaker being played continuously. A cajon is played. After about two bars, the sound of the cajon changes to a more open sound. This video is an instrumental. There are no voices in this song. This song can be played as an instruction for cajon players.
15CZ2h5VL-A
The low quality recording features an orchestral music that consists of a wide string melody, followed by mellow brass, wide brass melody and shimmering cymbals. It sounds suspenseful and intense - like something that you would hear in the background of a movie scene.
mhru3GXbkHY
A digital drum is playing a hardstyle beat with a punching kick with a sub sound. Synthesizers/samples are playing industrial digital sounds as a melody. A female voice sample is singing a soulful sequence for a moment. This so

This is a gear showcase video demonstrating the sound difference when an electric guitar is connected to an amp. The electric guitar is playing simple tunes. A male voice is describing the applied effect. Sounds from this piece can be sampled for use in beat-making.
n1PTn_NH_K0
This is a meditation music piece. There is an ambient synth sound playing very faint notes. There are natural sounds that can be heard in the piece such as the water flowing and the crickets chirping. The atmosphere of the piece is relaxing and soothing. The piece would suit perfectly as background music for a meditation video. It could also be playing in the background at spas and wellness centers.
ANaaOqwO0Uo
This song contains digital drums playing a groove along with an aggressive synth saw wave pad repeating the same simple melody. A male kid's voice is chopped into a sample creating a rhythmic phrase. The same goes for a female voice. So it sounds like they are singing. This is an amateur recording. This s

This song is an instrumental. The tempo is slow with an amplified organ playing an intense, sinister chord, followed by a cymbal ride and howling wind produced by wind machine and a chime.. The vibe of the music is spooky, dangerous, ghostly, suspenseful, sinister, grim and scary.
RUUZzoeVrK4
The low quality recording features an electro song that consists of a "4 on the floor" kick pattern and widely spread repetitive synth melody. The recording is crackly and it sounds hypnotic and energetic as the synth lead is moving left and right throughout the stereo image.
tpnvHb9ZhlU
This is a reggae piece. There is a male vocalist singing with a Jamaican accent. An electric guitar is playing the main melody with a groovy bass guitar in the background. The rhythmic background is a simple 2/4 reggae beat being played by acoustic drums. The atmosphere is very chill. This piece could be played at beaches and summertime barbecue parties.
ZNGvyFsCx4g
The song is an instrumental. The tempo is fast w

This is the recording of a drum solo performance. The drummer is performing various complex beats and chops with a rapid procession on an acoustic drum kit. There is a tambourine placed on the snare drum that rattles with every stroke. The atmosphere of the solo is energetic. Parts of this solo could be sampled to be used as intro jingles for music channels. These samples could also be used in beat-making.
WIuLaxWIAAI
This song contains a piano adding single notes to digital strings being played in the high-mid and as a bassline. The recording is full of reverb and the instruments are slightly panned across the speakers. The atmosphere sounds sad,melancholic. This song may be playing in a sad movie-scene.
Y7SoAXBFUew
The low quality recording features a sustained didgeridoo melody played over quiet, shimmering shakers and wooden percussion. It sounds intense and suspenseful.
5at69yM1PoU
This jazz song features the main melody played on a reed instrument and a guitar in unison. The melo

The Classical music features a happy sounding piano chord progression and happy woodwind instrument, followed by rhythmic shoe tapping foley effect. you could tell that the music is played on speakers, while the foley artist is trying to sync the shoe tapping to the scene. Judging by the audio cutouts and mono sound, you could tell that it is an old, low quality, recording.
Q75y0TIp7Ds
A female vocalist sings this perky pop. The tempo is medium with rhythmic acoustic guitar , groovy bass lines, keyboard accompaniment and enthusiastic drumming. The song is simple, dulcet, fresh, peppy, youthful, vibrant and punchy with sounds of crowd hooting and cheering and talking. This song is a Pop song.
I-Z3gB6pfIA
The low quality recording features a live performance of a country song and it consists of banjo solo melody and electric guitar melody going back and forth, energetic drums, groovy bass and sustained strings melody playing in the background. The recording is noisy and in mono, and it s

This is a jingle piece in the background of a tech comparison video. There are various sound effects such as clock ticking and a camera shutter. The electric guitar is playing the same chord repeatedly with electronic drum hits to create an aura of tension. This piece can be used in advertisement jingles. It could also be used in the soundtrack of reality competition programs.
LK6zk03lPlM
This is a glam metal/heavy metal ballad. There is a male vocal and male back vocals singing melodically. The melody is being played on an electric guitar with a chorus effect applied alongside a bass guitar. The rhythmic background consists of a rock acoustic drum beat.The atmosphere is both hard-hitting and sentimental at the same time. This piece could be playing in the background of a rock/metal bar. It could also be included in the soundtrack of an 80s themed romance movie.
uiHyWdYkBvY
The low quality recording features a muffled crowd talking in the background while there is a horn melody, alongs

This pop song features a male voice singing the main melody using vocables. This is accompanied by an acoustic guitar strumming chords. The piano fills at a higher register and also plays chords. The bass guitar plays the root notes of the chords without any flourishes. A shaker is used in the background which acts as the percussion. Hand claps are played at every alternate count. Bells are played on the first count of the first two bars of each phrase. The melody is happy and upbeat. This song can be played in a movie scene featuring a family on a road trip.
0QYNC7J05XI
This is a jazz piece played in the background of a video game. A trumpet plays the main melody while a xylophone and a bass guitar is supporting the tune in the background. A playful jazz drum beat carries the rhythmic background. Occasional electric guitar fills in the form a strum can be heard. There are a lot of sound effects related to the game such as squeaking, chewing and explosions.
BkOfrw3c3EE
solo overdriven 

This is an amateur vocal cover of a pop music piece. There is a teen female vocalist singing melodically. On the backing track, an acoustic guitar is heard playing the main theme while an acoustic sounding drum track is playing the rhythmic background. The atmosphere is melancholic.
1RhYdQnZ_hw
The romantic music features a male voice singing a melody. An acoustic guitar accompanies the singer by playing chords. The drums play a light rhythm. Other percussion instruments such as triangles can be heard.
i6k1yiyO5jQ
The recording is of poor quality. A male voice is singing soulfully along to a backing instrumental sounding like coming out of some speakers. There is a piano playing and you can hear electric drums. This song may be playing at home practicing singing.
2GepmcbNlJY
Singers sing this prayer song in a congregational unison. The song is medium tempo with a heavily distorted harmony section. The song is of very bad quality with humming and whirring noises.
aPafZ1Mx-BE
The Alterna

The song is an instrumental. The song is medium tempo with a string section accompaniment, timpani playing and other percussion instruments. There are other funny sounds like ball bouncing and other percussion tones. The song is an ad jingle soundtrack. The audio track is of poor quality.
40vmsGsFBsw
The low quality recording features a wide punchy toms, wide aggressive electric guitars, sustained synth lead, distorted filtered bass and tinny hi hats. It sounds energetic and exciting.
3ZhyXbwFQAM
The clip features the tone of an alarm bell beeping. It beeps with a high and fuzzy tone. A button is then pressed. There is then the sound of a grungy electric guitar being played in a rock style.
9ZUzftiN2uw
This is a traditional middle eastern type of song with traditional singing that involves vocal modulation. The percussion is simple and encourages dancing. The three men sing in unison and there's an intermittent chant. They repeat the same phrase like a refrain.
ICtri0ElFZc
This recordi

The low quality recording features a DJ scratching a vinyl of a jungle song that consists of punchy kick and snare hits layered with energetic crash cymbal and groovy bass. The song sounds thin as it is probably playing on a speaker. The recording itself is noisy and in mono. It sounds energetic and aggressive overall.
cWOohqFud6g
The low quality recording features a folk song that consists of passionate male vocal singing over shimmering shakers, smooth bass, sustained strings melody and groovy piano melody. It sounds emotional, soulful and passionate - like something you would hear in movies.
exD5okdopWc
This composition contains a string section playing a rising and dissonant chord along with strings playing short notes. In the background you can hear atmospheric noises. This song may be playing in a scary movie scene.
B3q2wHpzhoM
This is an Irish folk music piece. The melody is being played by an acoustic guitar and a mandolin. The atmosphere of the piece is cozy and easygoing. It 

This audio contains a synth bass playing a bassline in a steady 8th notes rhythm along with other digital pluck sounds playing a fast pattern melody in the mid to higher register. Digital flute-like sounds are playing a melody along and one on top in a higher pitch. This song may be playing in an adventure video-game.
Ubj0jlheyvk
This techno song features a male voice singing the main melody. A female voice is heard singing vocables in the background. In the beginning of this clip, the voice is accompanied by a programmed percussion playing the kick at every count and the snare at alternate counts. A synth is playing the same melody as the main voice. A siren is playing in the background. After two bars of eight counts, the percussion plays a roll and the bass plays a groovy bassline. The synth plays a ring modulated sound. This song can be played in a club.
wMHBhCVv--g
The Disco song features a flat male vocal singing over a wide funky electric guitar melody, groovy bass, synth keys c

Techno or house music with a four on the floor kick pattern, off beat hi-hat pattern, synth chords with delay and a chopped vocal sample. It is mid-tempo and minimal for dancing.
x6FbyqrK0g0
This song features the sound of a music box. It sounds like a melody of bells playing in compound time. There are no voices in this song. There are no other instruments. The song has a dreamy feel. This song can be used in a dream sequence in a movie.
c9JyKnsegog
This is the live performance of a reggae piece. There is a female vocalist in the lead singing melodically. The keyboard can be heard playing the main melody. The bass guitar is playing a simple but groovy bass line. In the rhythmic background, there is a slow tempo acoustic drum beat. The atmosphere is emotional. This is an amateur recording and a bit dated, so the audio quality is quite poor.
Cn3xoxvbkF0
This song features a pop song being sung by male voices in acapella . The voices sing the different parts of a choir. They are singing 

This Spanish pop song features a male voice singing the main melody. This is accompanied by percussion playing a Latin beat. A shaker or maracas plays a continuous rhythm. An accordion plays chords in the background. The bass plays the root note of the chords. This song can be played at a party. The mood of this song is romantic.
BdKiPR3kdjo
This is a rock music piece playing in the background of a tutorial video. There is an electric guitar playing a simple tune as the lead while another provides a rhythmic backing by playing the same note repeatedly in rapid procession. There is an acoustic drum beat playing. There are water sound effects related to the video. The piece has a generic feeling. It does not evoke much excitement. It could be used as a jingle for an advertisement or in the background of an instructive video.
-taO6N-rxv4
The low quality recording features a dubstep song that consists of buzzy, manic synth bass, punchy kick, slapping claps and high pitched laser riser. At 

This hip-hop song features a male voice rapping the main lyrics. This is accompanied by programmed percussion playing a simple beat. Hand claps are played on every alternate count. The synth plays a repetitive melody in loop. The bass plays the root notes of the chords in harmony with the synth. This is a moderate tempo song. This song can be played in a club.
vEMNk-lbGTE
The country music features a male voice singing while being backed by female vocals. The drums play a simple and repeating rhythm and together with the bass guitar drive the pulse of the music. An electric guitar plays short repeating musical phrases that incorporate the slide technique. In the background a group of strings fills the harmony and also provides a melody in the higher register.
VJ-dpTx_3Cg
A female singer sings this beautiful melody. The tempo is medium with a piano or keyboard accompaniment and no other instrumentation. The song is emotional and passionate like a love song. The audio quality is very poo

This piano song features a piano playing the backing chords. A violin plays the main melody. The melody is romantic but turns eerie at the end. There are no other instruments in this song. A male voice narrates two words toward the end of the song. This song can be played in a promotional video.
g0aOPWwNMFQ
This is a live performance classical music piece played by a chamber orchestra. It is in the style of a tarantella, which is a genre of Italian folk music. The strings section is playing the melody with a cello as the lead. The atmosphere is lively and elegant. It could be used in the soundtrack of a movie especially during a scene involving a pursuit, a face-off or a duel.
w0A-4EbkVz8
This audio contains an e-bass playing a melody along with a simple drum groove and other stringlike instruments. An e-guitar is playing a lead melody on top and in the background you can hear the crowd screaming. This is an amateur recording and of poor audio-quality. This song may be playing during a

This is an instrumental military march music piece. The uilleann pipes are playing the piece at a medium-to-high range. The atmosphere is epic. This piece could be used in the soundtrack of a historical drama during the scenes of a victory parade.
ZF8uHVu4Res
This is a mantra/meditation music piece. There is a male and a female vocal chanting in a devotional manner. There is a tambura melody going on in the background while certain percussive elements at higher and lower ranges can be heard in the rhythmic structure. The atmosphere is hypnotic. This piece could be used in Hindu religious events.
VuWr1HXHoZg
This is a remix of an Indian folk music piece. There is a male vocalist singing melodically and passionately. The rhythmic background is full of percussive elements that were emphasized by the use of electronic drums. The atmosphere is lively and jovial. This piece could be played in dance parties and also in contemporary dancing courses as an accompaniment piece.
_mQ6KuA2p6k
This i

The low quality recording features a classical song that consists of an aggressive brass melody, punchy percussion hits, mellow low percussions and hollow flute melody playing over it. It sounds vintage, suspenseful, reverberant, intense and the recording is noisy.
vBw99ghST1g
This is an emotional ballad played on the theremin. The audio quality is poor, but still we feel the sad and sombre effects of the melody played by the performer.
sL01xTmV_Fc
Here we get to hear a hardcore metal piece. This is characterised, in part, by the rapid action on the kick drum - with notes being played at a rate of sixteenth notes on the kick. The drummer also frequents the ride cymbal. The guitar is grungy, gritty, fuzzy, heavy, deep, hardcore.
5KvjUzQbMT4
This is the recording of a jazz improvisation performance. It is in an instrumental piece performed with a keyboard that utilizes a classical piano sound. The chords are being played gently which creates a calming atmosphere. Due to various chords an

The Country Blues song features a flat male vocal singing over acoustic guitar melody, accordion and a sustained brass section melody. It sounds very noisy, vintage and therefore nostalgic and sentimental - like something you would hear in an old movie.
V5xL8hLFY58
This pop song features a female voice singing the main melody. This is accompanied by male and female backing voices. The voices sing a repetitive melody. This is accompanied by percussion playing a simple beat. Trumpets play a backing melody. The bass plays a groovy descending bassline. The mood of this song is happy. This song can be played at a club.
EtmKuWPpjG0
The low quality recording features a loud didgeridoo modulating low tone. It is in mono and it sounds buzzy and kind of weird.
WoXgPQQjcJU
The low quality recording features a pop folk song that consists of passionate female vocal singing over repetitive flute licks, stuttering hi hats, groovy piano chords, claps and a riser sweep. It sounds energetic and kind of 

In [None]:
processed_dataset

In [47]:
processed_dataset["input_ids"]

KeyError: "Column input_ids not in the dataset. Current columns in the dataset: ['audio', 'text']"

In [41]:
!ls data/wav-48

-0Gj8-vB1q4.wav  EptdhC17avY.wav  UD0pisqyLKA.wav  id5ibIqjRto.wav
-0SdAVK79lg.wav  EremEyLrdUY.wav  UDN11Q90Fa4.wav  idUZsNLnyDg.wav
-0vPFx-wRRI.wav  Es9FNjZ-SHI.wav  UDS6PrY9ZIM.wav  ieEPKa3HiGo.wav
-0xzrMun0Rs.wav  EsHXnkZ_W2c.wav  UDTsCTLkZzc.wav  iePMcLYozYY.wav
-1LrH01Ei1w.wav  EsssGCL-Axw.wav  UDg9vuBDtVU.wav  ihCl2ImrOYE.wav
-1OlgJWehn8.wav  EtmKuWPpjG0.wav  UEOUXjX5R2I.wav  ihJT65fZaHY.wav
-1UWSisR2zo.wav  Euu6zlJQSD0.wav  UEel3wTf0Sk.wav  ik-fXNjxw58.wav
-3Kv4fdm7Uk.wav  EwDiNj_5PEg.wav  UFyOGqmITjM.wav  ikEuQPSBY-0.wav
-4NLarMj4xU.wav  EwoCbcSXlSM.wav  UGEqY_NTMpI.wav  ikJKSqnTylI.wav
-4SYC2YgzL8.wav  Ex0ukzO5z9M.wav  UHvYrO1IGCc.wav  imIFtW4O5S0.wav
-5FoeegAgvU.wav  Ex18Xwznj60.wav  UIOnnpaqBy8.wav  iqEQBCrOLWc.wav
-5f6hjZf9Yw.wav  ExghbCGRBx0.wav  UJA5AWbt6HM.wav  iqOPJWWKo90.wav
-5xOcMJpTUk.wav  EyO5vB4eqo0.wav  UJAk1nNdo1I.wav  itT0_RhSipQ.wav
-6HBGg1cAI0.wav  EzP7PB2x670.wav  ULHPhjaJ6p0.wav  ivymRS3iEZk.wav
-6QGvxvaTkI.wav  Ezodz2aZnzQ.wav  ULXbXpLcoVA.wa

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [5]:
model.config.audio_encoder.sampling_rate

48000

In [6]:
audio_values.size()

torch.Size([1, 1, 489600])

Or save them as a `.wav` file using a third-party library, e.g. `scipy` (note here that we also need to remove the channel dimension from our audio tensor):

In [7]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./musicgen-musiccaps-finetuned-checkpoints",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    warmup_steps=500,
    learning_rate=5e-5,
    logging_dir='./logs',
    logging_steps=100,
)