In [7]:
!pip install librosa pretty_midi
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install miditok

Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting miditok
  Downloading miditok-3.0.5.post1-py3-none-any.whl.metadata (10 kB)
Collecting symusic>=0.5.0 (from miditok)
  Downloading symusic-0.5.8-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (9.0 kB)
Collecting pySmartDL (from symusic>=0.5.0->miditok)
  Downloading pySmartDL-1.3.4-py3-none-any.whl.metadata (2.8 kB)
Downloading miditok-3.0.5.post1-py3-none-any.whl (158 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.3/158.3 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading symusic-0.5.8-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m40.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pySmartDL-1.3.4-py3-none-any.whl (20 kB)
Installing collected packages: pySmartDL, symusic, miditok
Successfully installed miditok-3.0.5.post1 pySmartDL-1.3.4 symusic-0.5.8


In [8]:
import os
from pathlib import Path

import librosa
import miditok
import pandas as pd
import pretty_midi
import torch
from google.colab import drive
from torch.utils.data import Dataset

In [11]:
# Settings forwarded verbatim to miditok.TokenizerConfig. The last four keys
# (microtiming / ticks_per_quarter) are the PerTok-specific options.
TOKENIZER_PARAMS = dict(
    pitch_range=(21, 109),
    # Same resolution (12) for both beat spans.
    beat_res=dict.fromkeys(((0, 4), (4, 12)), 12),
    special_tokens=["PAD", "BOS", "EOS", "MASK"],
    use_chords=True,
    use_rests=False,
    use_tempos=True,
    use_time_signatures=True,
    use_programs=False,
    use_microtiming=True,
    ticks_per_quarter=320,
    max_microtiming_shift=0.125,
    num_microtiming_bins=30,
)

config = miditok.TokenizerConfig(**TOKENIZER_PARAMS)
tokenizer = miditok.PerTok(config)

In [4]:
# Mount Google Drive so the dataset stored there is visible to the runtime.
drive.mount('/content/drive')
# Must be absolute: without the leading slash the path is resolved relative
# to the current working directory and does not point into the mount above.
DATA_ROOT = '/content/drive/MyDrive/song2cover'

Mounted at /content/drive


In [12]:
class AudioMidiDataset(Dataset):
  """Paired audio / MIDI examples listed in ``<root_dir>/manifest.csv``.

  Every manifest row is read for three columns: ``audio_filepath``,
  ``midi_filepath`` and ``difficulty``. The file paths are used exactly as
  written in the manifest (NOTE(review): they are not joined with
  ``root_dir`` -- confirm the manifest stores absolute paths).

  Args:
    root_dir: Directory that contains ``manifest.csv``.
    tokenizer: A miditok tokenizer exposing ``encode(path)``.
    transform: Optional callable that receives and returns the sample dict
      ``{'x': ..., 'y': ..., 'diff': ...}``.
    sample_rate: Rate the audio is resampled to when loaded.

  Each item is the tuple ``(x, y_tokens, diff)``:
    x: float32 STFT magnitude with a leading channel axis,
      ``(1, n_freq_bins, n_frames)`` for mono audio.
    y_tokens: int64 1-D tensor of token ids for the first track.
    diff: float32 scalar tensor holding the difficulty.
  Items vary in length, so batching presumably needs a padding collate_fn.
  """

  def __init__(self, root_dir, tokenizer, transform=None, sample_rate=44100):
    self.root_dir = root_dir
    self.tokenizer = tokenizer
    self.transform = transform
    self.sample_rate = sample_rate
    manifest = pd.read_csv(os.path.join(root_dir, 'manifest.csv'))
    # `to_dict` is a DataFrame method; there is no module-level pd.to_dict.
    self.items = manifest.to_dict('records')

  def __len__(self):
    return len(self.items)

  @staticmethod
  def _token_ids(encoded):
    """Return the id list of the first (or only) encoded sequence."""
    # Tokenizers return one sequence per track unless configured for a
    # single stream, so accept both a list of sequences and a bare sequence.
    if isinstance(encoded, (list, tuple)):
      if not encoded:
        raise ValueError('tokenizer returned no token sequences')
      encoded = encoded[0]
    # TokSequence objects carry their integer ids in `.ids`; plain id lists
    # are passed through unchanged.
    return list(getattr(encoded, 'ids', encoded))

  def __getitem__(self, idx):
    item = self.items[idx]
    diff = torch.tensor(float(item['difficulty']), dtype=torch.float32)

    waveform, _ = librosa.load(item['audio_filepath'], sr=self.sample_rate)
    # The STFT is complex-valued. Casting it straight to float would silently
    # keep only the real part, so take the magnitude explicitly.
    spectrum = torch.from_numpy(librosa.stft(waveform))
    x = spectrum.abs().unsqueeze(0).float()

    # miditok 3.x loads the file itself (via symusic); it does not accept
    # pretty_midi objects.
    encoded = self.tokenizer.encode(Path(item['midi_filepath']))
    y_tokens = torch.tensor(self._token_ids(encoded), dtype=torch.long)

    sample = {
        'x': x,
        'y': y_tokens,
        'diff': diff
    }

    if self.transform:
      sample = self.transform(sample)

    # Read the values back out of `sample` so the transform takes effect,
    # while keeping the tuple return shape callers already unpack.
    return sample['x'], sample['y'], sample['diff']