In [33]:
import os

import torch
from torch.utils.data import DataLoader

import importlib
import data_preprocessing, midi_conversion, model_helpers, models, text_processing

from transformers import (
    GPT2LMHeadModel,
    GPT2Config
)

# Re-import the project-local modules so that edits made to their .py files
# since the first import are picked up without restarting the kernel.
# Note: importlib.reload re-executes the module in place; names that other code
# already pulled in with `from module import name` keep pointing at the old
# objects until that import statement is run again.
importlib.reload(data_preprocessing)
importlib.reload(midi_conversion)
importlib.reload(model_helpers)
importlib.reload(models)
# Final expression of the cell: the module object returned by this call is what
# the notebook echoes as the cell's output.
importlib.reload(text_processing)


<module 'text_processing' from 'd:\\classical-music-generation-model\\text_processing.py'>

In [None]:
# ---------------------------------------------------------------------------
# Notebook configuration: every tunable value lives in this cell.
# ---------------------------------------------------------------------------

# Compute device: use the GPU when CUDA is available, otherwise the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", DEVICE)

# Filesystem locations.
DATA_DIR = "data/midi_text_exports"                     # root of the exported text splits
VOCAB_FILE = "data/midi_text_exports/midi_vocab.txt"    # token-per-line vocabulary file
MODEL_SAVE_DIR = "models/midi_gpt2_model"               # passed to training and generation

# Batching: BLOCK_SIZE is handed to MidiTextDataset, BATCH_SIZE to the DataLoaders.
BLOCK_SIZE = 512
BATCH_SIZE = 24

# Optimisation settings forwarded to train_gpt_2 (num_epochs, lr, weight_decay).
NUM_EPOCHS = 20
LEARNING_RATE = 3e-4
WEIGHT_DECAY = 0.01

Using device: cuda


In [8]:
from data_preprocessing import get_midis_by_composer, midi_split_to_text_split

# Composers whose MIDI files are requested from the loader.
composers = ["mozart", "haydn", "beethoven"]
midis = get_midis_by_composer(composers)

# Export the text renderings under DATA_DIR (defined in the configuration cell)
# instead of a second hard-coded copy of the same path, so this cell cannot
# drift from the vocab-building and dataset cells, which all read DATA_DIR.
# Per the original author's note, the result is laid out as
# [[train texts], [val texts], [test texts]].
midi_texts = midi_split_to_text_split(midis, save_to_directory=DATA_DIR)


Now loading MIDIs from data\train.
Could not load data\train\beethoven-anhang_14_3.mid: Could not decode key with 3 flats and mode 255
Could not load data\train\mozart-piano_sonatas-nueva_carpeta-k281_piano_sonata_n03_3mov.mid: Could not decode key with 2 flats and mode 2
Could not load data\train\unknown_artist-i_o-mozart_k550.mid: MThd not found. Probably not a MIDI file
Loaded 500 MIDI files from data\train
Now loading MIDIs from data\val.
Loaded 47 MIDI files from data\val
Now loading MIDIs from data\test.
Could not load data\test\unknown_artist-i_o-mozart_q1_2.mid: MThd not found. Probably not a MIDI file
Loaded 43 MIDI files from data\test
590 MIDI files retrieved.
Successfully processed 500 MIDIs into text.
Successfully processed 47 MIDIs into text.
Successfully processed 43 MIDIs into text.
Saved 500 files to                       data/midi_text_exports\train
Saved 47 files to                       data/midi_text_exports\val
Saved 43 files to                       data/midi_text_exports\test

In [9]:
from text_processing import build_vocab_from_dir
from text_processing import MidiTokenizer


# Build the vocabulary file only when it is missing; otherwise the file already
# on disk is reused as-is.
# NOTE(review): an existing VOCAB_FILE is never refreshed, even if the text
# exports under DATA_DIR were regenerated -- delete the file to force a rebuild.
if not os.path.exists(VOCAB_FILE):
    print(f"{VOCAB_FILE} not found, building from {DATA_DIR}...")
    counter = build_vocab_from_dir(DATA_DIR)
    specials = ["<pad>", "<bos>", "<eos>", "<unk>"]
    # Special tokens go first. Filter them out of the corpus tokens so that a
    # special token which also occurs in the exported text is not written to
    # the vocab file twice.
    base_tokens = sorted(tok for tok in counter.keys() if tok not in specials)
    vocab = specials + base_tokens
    # One token per line, UTF-8 encoded.
    with open(VOCAB_FILE, "w", encoding="utf-8") as f:
        f.writelines(tok + "\n" for tok in vocab)
    print(f"Saved vocab with {len(vocab)} tokens to {VOCAB_FILE}")
else:
    print(f"Found existing vocab file: {VOCAB_FILE}")

# The tokenizer is always constructed from the on-disk vocab file; vocab_size is
# later used to size the model's embedding table.
tokenizer = MidiTokenizer(VOCAB_FILE)
vocab_size = len(tokenizer.get_vocab())
print("MIDI vocab size:", vocab_size)

Found existing vocab file: data/midi_text_exports/midi_vocab.txt
MIDI vocab size: 701


In [12]:
from text_processing import MidiTextDataset
from model_helpers import collate_fn


def _collate_with_padding(batch):
    """Collate one batch via `collate_fn`, supplying the tokenizer's pad token id.

    `tokenizer` is looked up when the batch is collated, not when this function
    is defined.
    """
    return collate_fn(batch, tokenizer.pad_token_id)


# Settings shared by the training and validation loaders.
_loader_kwargs = dict(batch_size=BATCH_SIZE, collate_fn=_collate_with_padding)

# Datasets are built train-first, then val, each from its own sub-directory of
# DATA_DIR and with the same block size.
train_dataset = MidiTextDataset(
    os.path.join(DATA_DIR, "train"), tokenizer, block_size=BLOCK_SIZE
)
val_dataset = MidiTextDataset(
    os.path.join(DATA_DIR, "val"), tokenizer, block_size=BLOCK_SIZE
)

# Only the training loader shuffles; validation keeps dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_kwargs)

Loaded 21696 sequences from data/midi_text_exports\train
Loaded 1962 sequences from data/midi_text_exports\val


In [None]:
print("Loading pretrained GPT-2...")
base_model_name = "gpt2"  # you can try "gpt2-medium" if you have VRAM

# Load the pretrained checkpoint; its config supplies the architecture sizes and
# its weights are the source for everything copied below.
pretrained_model = GPT2LMHeadModel.from_pretrained(base_model_name)
base_config = pretrained_model.config

hidden_size = base_config.n_embd
print("GPT-2 hidden size:", hidden_size)

# Architecture fields taken unchanged from the pretrained config.
_architecture = {
    field: getattr(base_config, field)
    for field in ("n_positions", "n_ctx", "n_embd", "n_layer", "n_head")
}
# Special-token ids come from the MIDI tokenizer, not from GPT-2's text tokenizer.
_special_ids = dict(
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)
# Dropout of 0.2 on residual, embedding and attention paths (the original
# author's choice, for mitigating overfitting).
_dropout = dict(resid_pdrop=0.2, embd_pdrop=0.2, attn_pdrop=0.2)

# Same architecture as the checkpoint, but sized for the MIDI vocabulary.
new_config = GPT2Config(
    vocab_size=vocab_size,
    **_architecture,
    **_special_ids,
    **_dropout,
)

model = GPT2LMHeadModel(new_config)

# Transfer everything that does not depend on the vocabulary.
_source = pretrained_model.transformer
_target = model.transformer
with torch.no_grad():
    # Positional embedding table.
    _target.wpe.weight.copy_(_source.wpe.weight)

    # Transformer blocks, paired up in order.
    for target_block, source_block in zip(_target.h, _source.h):
        target_block.load_state_dict(source_block.state_dict())

    # Final layer norm.
    _target.ln_f.load_state_dict(_source.ln_f.state_dict())

    # Token embeddings (wte) and lm_head are deliberately NOT copied: they stay
    # randomly initialised so they match the new vocabulary.

model = model.to(DEVICE)
print("Model ready. New vocab size:", model.config.vocab_size)

Loading pretrained GPT-2...
GPT-2 hidden size: 768
Model ready. New vocab size: 701


In [21]:
from models import train_gpt_2

# Train `model` on the prepared loaders; every hyperparameter comes from the
# configuration cell.
# NOTE(review): in the run recorded below, "-> saving best model" is last printed
# at epoch 7 (val loss 0.8278); after that the validation loss rises every epoch
# (0.9856 at epoch 20) while the training loss keeps falling. Epochs beyond ~7
# look like overfitting -- consider a smaller NUM_EPOCHS.
train_gpt_2(
    model,
    train_loader,
    val_loader,
    num_epochs=NUM_EPOCHS,
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    device=DEVICE,
    model_save_dir=MODEL_SAVE_DIR,
)


Epoch 1/20: 100%|██████████| 904/904 [09:51<00:00,  1.53it/s, batch_loss=1.0457]


Epoch 1/20 | train loss:               2.6200 | val loss: 1.1237
  -> saving best model


Epoch 2/20: 100%|██████████| 904/904 [09:43<00:00,  1.55it/s, batch_loss=0.9353]


Epoch 2/20 | train loss:               0.9596 | val loss: 0.9537
  -> saving best model


Epoch 3/20: 100%|██████████| 904/904 [09:44<00:00,  1.55it/s, batch_loss=0.7254]


Epoch 3/20 | train loss:               0.8253 | val loss: 0.8794
  -> saving best model


Epoch 4/20: 100%|██████████| 904/904 [09:44<00:00,  1.55it/s, batch_loss=0.8048]


Epoch 4/20 | train loss:               0.7546 | val loss: 0.8527
  -> saving best model


Epoch 5/20: 100%|██████████| 904/904 [09:43<00:00,  1.55it/s, batch_loss=0.8338]


Epoch 5/20 | train loss:               0.7061 | val loss: 0.8355
  -> saving best model


Epoch 6/20: 100%|██████████| 904/904 [09:44<00:00,  1.55it/s, batch_loss=0.6464]


Epoch 6/20 | train loss:               0.6669 | val loss: 0.8307
  -> saving best model


Epoch 7/20: 100%|██████████| 904/904 [09:44<00:00,  1.55it/s, batch_loss=0.6638]


Epoch 7/20 | train loss:               0.6322 | val loss: 0.8278
  -> saving best model


Epoch 8/20: 100%|██████████| 904/904 [09:45<00:00,  1.54it/s, batch_loss=0.6333]


Epoch 8/20 | train loss:               0.5998 | val loss: 0.8327


Epoch 9/20: 100%|██████████| 904/904 [09:44<00:00,  1.55it/s, batch_loss=0.6043]


Epoch 9/20 | train loss:               0.5697 | val loss: 0.8399


Epoch 10/20: 100%|██████████| 904/904 [09:44<00:00,  1.55it/s, batch_loss=0.6018]


Epoch 10/20 | train loss:               0.5413 | val loss: 0.8548


Epoch 11/20: 100%|██████████| 904/904 [09:44<00:00,  1.55it/s, batch_loss=0.4291]


Epoch 11/20 | train loss:               0.5149 | val loss: 0.8657


Epoch 12/20: 100%|██████████| 904/904 [09:45<00:00,  1.54it/s, batch_loss=0.4671]


Epoch 12/20 | train loss:               0.4902 | val loss: 0.8791


Epoch 13/20: 100%|██████████| 904/904 [09:43<00:00,  1.55it/s, batch_loss=0.4583]


Epoch 13/20 | train loss:               0.4675 | val loss: 0.8967


Epoch 14/20: 100%|██████████| 904/904 [09:43<00:00,  1.55it/s, batch_loss=0.4454]


Epoch 14/20 | train loss:               0.4466 | val loss: 0.9109


Epoch 15/20: 100%|██████████| 904/904 [10:00<00:00,  1.51it/s, batch_loss=0.6344]


Epoch 15/20 | train loss:               0.4278 | val loss: 0.9225


Epoch 16/20: 100%|██████████| 904/904 [09:59<00:00,  1.51it/s, batch_loss=0.4723]


Epoch 16/20 | train loss:               0.4108 | val loss: 0.9404


Epoch 17/20: 100%|██████████| 904/904 [09:45<00:00,  1.54it/s, batch_loss=0.5193]


Epoch 17/20 | train loss:               0.3961 | val loss: 0.9587


Epoch 18/20: 100%|██████████| 904/904 [09:45<00:00,  1.54it/s, batch_loss=0.4008]


Epoch 18/20 | train loss:               0.3831 | val loss: 0.9694


Epoch 19/20: 100%|██████████| 904/904 [09:44<00:00,  1.55it/s, batch_loss=0.3547]


Epoch 19/20 | train loss:               0.3729 | val loss: 0.9779


Epoch 20/20: 100%|██████████| 904/904 [09:50<00:00,  1.53it/s, batch_loss=0.3447]


Epoch 20/20 | train loss:               0.3648 | val loss: 0.9856


In [41]:
from models import generate_midi_tokens_with_gpt_model
from midi_conversion import text_to_midi
import util

# How much of the generated sequence to echo in the notebook output.
PREVIEW_LEN = 100

# Example generation
# NOTE(review): the recorded run of this cell ended in
# "RuntimeError: CUDA error: device-side assert triggered". The cause is not
# visible from this notebook. Things to verify, ideally after re-running on CPU
# or with CUDA_LAUNCH_BLOCKING=1 so the traceback points at the real call:
#   * that "<SOS>" is actually present in the vocab file (the special tokens
#     added when the vocab is built are "<pad>", "<bos>", "<eos>", "<unk>");
#   * that prompt length + max_new_tokens (plus any special tokens the helper
#     adds) stays within the model's n_positions, which is copied from the
#     pretrained GPT-2 config -- presumably 1024 for "gpt2", TODO confirm.
example_prompt = "<SOS> COMPOSER_beethoven"
generated_tokens = generate_midi_tokens_with_gpt_model(
    example_prompt, VOCAB_FILE, MODEL_SAVE_DIR, max_new_tokens=1021, temp=1.0)
print("Generated token sequence:")
# Print a short preview, with an ellipsis exactly when something was cut off.
# (Previously the slice used 100 but the ellipsis check used 500, so outputs of
# 101-500 items were truncated without any indication.)
print(generated_tokens[:PREVIEW_LEN], "..." if len(generated_tokens) > PREVIEW_LEN else "")

# Convert generated text to MIDI and save
generated_mid = text_to_midi(generated_tokens)
util.mkdir("generated")
midi_path = util.path_join("generated", "gpt2_generated_sample.mid")
generated_mid.save(midi_path)
print("Saved generated MIDI to:", midi_path)


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
