In [1]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [2]:
from miditok import REMI, TokenizerConfig

# Use REMIPlus tokenizer with appropriate config
config = TokenizerConfig()
config.use_programs = True
config.one_token_stream_for_programs = True
config.use_time_signatures = True

remi_tokenizer = REMI(config)

  super().__init__(tokenizer_config, params)


In [3]:
from pathlib import Path

# Path to your MIDI files
midi_dir = Path("./midi_chunks")
token_dir = Path("./gpt2_remi_tokens")
token_dir.mkdir(exist_ok=True)

# Tokenize each MIDI file
for midi_path in midi_dir.glob("*.mid"):
    tokens = remi_tokenizer(midi_path)
    id_strings = [str(token) for token in tokens.ids]
    text_sequence = ' '.join(id_strings)

    with open(token_dir / (midi_path.stem + ".txt"), "w") as f:
        f.write(text_sequence + '\n')

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

#gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")
gpt2_model = AutoModelForCausalLM.from_pretrained("distilgpt2")

In [6]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from transformers import PreTrainedTokenizerFast
import os

# Step 1: Create vocab.json from all REMI tokens in your dataset
vocab = set()
for file_name in token_dir.glob("*.txt"):
    with open(f"{file_name}", "r") as f:
        tokens = f.read().strip().split()
        vocab.update(tokens)

vocab = sorted(vocab)
vocab_dict = {token: i for i, token in enumerate(vocab)}
vocab_dict["<pad>"] = len(vocab_dict)
vocab_dict["<unk>"] = len(vocab_dict)

# Step 2: Build tokenizer
tokenizer = Tokenizer(WordLevel(vocab_dict, unk_token="<unk>"))
tokenizer.pre_tokenizer = Whitespace()
wrapped_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer, unk_token="<unk>", pad_token="<pad>")
wrapped_tokenizer.save_pretrained("remi-gpt2-tokenizer")

('remi-gpt2-tokenizer/tokenizer_config.json',
 'remi-gpt2-tokenizer/special_tokens_map.json',
 'remi-gpt2-tokenizer/tokenizer.json')

In [7]:
from datasets import load_dataset, Dataset
from pathlib import Path

# Load all text data
all_texts = []
for path in Path("gpt2_remi_tokens").glob("*.txt"):
    with open(path) as f:
        all_texts.append({"text": f.read().strip()})

dataset = Dataset.from_list(all_texts)

# Tokenize
def tokenize(example):
    return wrapped_tokenizer(example["text"], truncation=False)

tokenized_dataset = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/1021 [00:00<?, ? examples/s]

In [8]:
from transformers import DataCollatorForLanguageModeling

collator = DataCollatorForLanguageModeling(
    tokenizer=wrapped_tokenizer,
    mlm=False  # not masked language modeling
)

In [9]:
from transformers import GPT2Config, GPT2LMHeadModel, Trainer, TrainingArguments

config = GPT2Config(
    vocab_size=len(wrapped_tokenizer),
    n_positions=1024,
    n_ctx=1024,
    n_embd=512,
    n_layer=6,
    n_head=8
)

training_args = TrainingArguments(
    output_dir="./gpt2-remi-model",
    overwrite_output_dir=True,
    per_device_train_batch_size=2,
    num_train_epochs=5,
    save_steps=1000,
    logging_dir='./logs',
    logging_steps=1000
)

trainer = Trainer(
    model=gpt2_model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=wrapped_tokenizer,
    data_collator=collator
)

trainer.train()

  trainer = Trainer(
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [128,0,0], thread: [96,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [128,0,0], thread: [97,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [128,0,0], thread: [98,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [128,0,0], thread: [99,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [128,0,0], thread: [100,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectLargeIndex: block: [128,0,0], thread: [101,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1290: indexSelectL

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
