In [None]:
!pip install mido

Collecting mido
  Downloading mido-1.3.3-py3-none-any.whl.metadata (6.4 kB)
Downloading mido-1.3.3-py3-none-any.whl (54 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/54.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mido
Successfully installed mido-1.3.3


In [None]:
import glob
import os
import pickle
import warnings
from collections import Counter

import mido
import numpy as np
import pandas as pd
from mido import Message, MidiFile, MidiTrack, MetaMessage

warnings.filterwarnings('ignore')

In [None]:
# Base folders used throughout the notebook: `data_path` holds the csvs/, midis/ and
# preprocessed_csvs/ sub-folders plus the saved .npy splits; `model_path` receives the pickled tokenizer.
data_path = "./data"
model_path = "./model"

In [None]:
# Preview one reference annotation CSV to see the columns the MIDI parse is later merged against.
df = pd.read_csv(f"{data_path}/csvs/2345.csv")
df.head(15)

Unnamed: 0,start_time,end_time,instrument,note,start_beat,end_beat,note_value
0,22494,30173,1,37,0.0,0.489583,Eighth
1,27102,34270,1,44,0.25,0.489583,Eighth
2,30686,37854,1,44,0.5,0.489583,Eighth
3,30686,37854,1,49,0.5,0.489583,Eighth
4,34270,42462,1,52,0.75,0.489583,Eighth
5,37854,46046,1,56,1.0,0.489583,Eighth
6,37854,46046,1,37,1.0,0.489583,Eighth
7,42462,49118,1,49,1.25,0.489583,Eighth
8,46046,52190,1,52,1.5,0.489583,Eighth
9,46046,52190,1,44,1.5,0.489583,Eighth


In [None]:
def midi_to_note_dataframe(midi_path):
    """Parse a MIDI file into a DataFrame with one row per completed note.

    Parameters
    ----------
    midi_path : str
        Path of the MIDI file to read with ``mido.MidiFile``.

    Returns
    -------
    pandas.DataFrame
        Columns ``hand``, ``channel``, ``note``, ``velocity``, ``start_beat``
        and ``duration``. ``start_beat`` and ``duration`` are the note's start
        and length in seconds (``mido.tick2second``) multiplied by
        ``denominator / 2``. The frame is empty, but still has these columns,
        when the file contains no completed note.

    Notes
    -----
    * The time signature is read from the first track only. If several
      ``time_signature`` messages are present, the last one is used.
    * Track index 1 is labelled ``"Right"``; every other track is ``"Left"``.
    * A short summary (signature and number of signature messages) is printed.
    """
    mid = mido.MidiFile(midi_path)
    ticks_per_beat = mid.ticks_per_beat

    # Default to 4/4 when the file carries no time signature. The previous 0/0
    # default silently turned every start_beat/duration into 0.
    numerator, denominator = 4, 4
    count = 0
    first_track = mid.tracks[0] if mid.tracks else []
    for msg in first_track:
        if msg.type == "time_signature":
            # Last signature wins. This matches the original effective behaviour:
            # its "first only" guard flag was never switched on.
            numerator, denominator = msg.numerator, msg.denominator
            count += 1

    print(f"Time signature: {numerator}/{denominator}\n# of Signature Changes: {count}")

    beat_scale = denominator / 2
    columns = ['hand', 'channel', 'note', 'velocity', 'start_beat', 'duration']

    notes = []
    note_stack = {}  # (track, channel, note) -> (start_tick, velocity, tempo)

    for i, track in enumerate(mid.tracks):
        abs_tick = 0
        # Tempo is tracked per track and restarts at the MIDI default (120 BPM);
        # set_tempo messages found in a different track are not applied here.
        tempo = 500000

        for msg in track:
            abs_tick += msg.time  # msg.time is a delta; accumulate to absolute ticks

            if msg.type == 'set_tempo':
                tempo = msg.tempo

            if msg.type == 'note_on' and msg.velocity > 0:
                # A re-trigger of a still-sounding key overwrites the pending start.
                note_stack[(i, msg.channel, msg.note)] = (abs_tick, msg.velocity, tempo)

            elif (msg.type == 'note_off') or (msg.type == 'note_on' and msg.velocity == 0):
                # note_on with velocity 0 is treated as note_off.
                key = (i, msg.channel, msg.note)
                if key not in note_stack:
                    continue  # release without a matching press: ignore
                start_tick, velocity, start_tempo = note_stack.pop(key)
                duration_ticks = abs_tick - start_tick
                start_time_sec = mido.tick2second(start_tick, ticks_per_beat, start_tempo)
                duration_sec = mido.tick2second(duration_ticks, ticks_per_beat, start_tempo)
                notes.append({
                    'hand': "Right" if i == 1 else "Left",
                    'channel': msg.channel,
                    'note': msg.note,
                    'velocity': velocity,
                    'start_beat': start_time_sec * beat_scale,
                    'duration': duration_sec * beat_scale
                })

    # Passing `columns` keeps the schema stable even when no note was found.
    return pd.DataFrame(notes, columns=columns)

In [None]:
def create_df(midi_file, max_missing=35):
    """Parse one MIDI file and attach the ``note_value`` labels from its reference CSV.

    The reference CSV is ``{data_path}/csvs/<id>.csv``, where ``<id>`` is the
    part of ``midi_file`` before the first underscore. Parsed notes and
    reference rows are matched on ``note``, ``start_beat`` and ``duration``
    (both sides rounded to 6 decimals).

    Parameters
    ----------
    midi_file : str
        File name (not a path) of a MIDI file inside ``{data_path}/midis``.
    max_missing : int, default 35
        Maximum number of parsed notes allowed to stay without a ``note_value``
        after the merge. Above this the file is rejected.

    Returns
    -------
    pandas.DataFrame or None
        The labelled notes sorted by ``start_beat`` with unlabelled rows
        removed, or ``None`` when more than ``max_missing`` notes are unlabelled.
    """
    merge_keys = ["note", "start_beat", "duration"]
    _id = midi_file.split("_")[0]

    df_original = pd.read_csv(f"{data_path}/csvs/{_id}.csv")
    df_original = df_original.drop(columns=["start_time", "end_time", "instrument"])
    # In the reference CSVs the "end_beat" column is used as the note length.
    df_original = df_original.rename(columns={"end_beat": "duration"})
    # Strip spaces so labels become single whitespace-free tokens.
    df_original["note_value"] = df_original["note_value"].str.replace(" ", "", regex=False)
    df_original["duration"] = df_original["duration"].round(6)
    df_original["start_beat"] = df_original["start_beat"].round(6)

    # The file is parsed once, by the helper (the old extra MidiFile load was unused).
    df = midi_to_note_dataframe(f"{data_path}/midis/{midi_file}")
    df["duration"] = df["duration"].round(6)
    df["start_beat"] = df["start_beat"].round(6)

    print(f"Length of parsed df: {len(df)}\nLength of original df: {len(df_original)}")
    print(f"# of left and right notes: {len(df[df['hand'] == 'Left'])}, {len(df[df['hand'] == 'Right'])}")

    # One label per key: duplicate keys on the right side of a left merge would
    # otherwise multiply the parsed notes.
    labels = df_original[merge_keys + ["note_value"]].drop_duplicates(subset=merge_keys)
    df = df.merge(labels, on=merge_keys, how="left")
    df = df.sort_values(by=["start_beat"]).reset_index(drop=True)

    num_nans = int(df["note_value"].isna().sum())
    print(f"# of nans as note values: {num_nans}")

    if num_nans > max_missing:
        return None
    return df.dropna(subset=["note_value"])

In [None]:
# Try the parser/merge on a single file and inspect the first rows.
# NOTE(review): create_df returns None when too many notes stay unlabelled; .head() would then fail.
df = create_df("2345_ps14_03.mid")
df.head(15)

Time signature: 4/4
# of Signature Changes: 1
Length of parsed df: 6549
Length of original df: 6548
# of left and right notes: 3077, 3472
# of nans as note values: 1


Unnamed: 0,hand,channel,note,velocity,start_beat,duration,note_value
0,Left,1,37,80,0.0,0.489583,Eighth
1,Right,0,44,80,0.25,0.489583,Eighth
2,Left,1,44,80,0.5,0.489583,Eighth
3,Right,0,49,80,0.5,0.489583,Eighth
4,Right,0,52,80,0.75,0.489583,Eighth
5,Left,1,37,80,1.0,0.489583,Eighth
6,Right,0,56,80,1.0,0.489583,Eighth
7,Right,0,49,80,1.25,0.489583,Eighth
8,Left,1,44,80,1.5,0.489583,Eighth
9,Right,0,52,80,1.5,0.489583,Eighth


In [None]:
# Convert MIDI note numbers to note names (e.g., 60 -> C4)
def midi_to_note_name(midi_number):
    """Return the sharp-based note name with octave for a MIDI note number (60 -> "C4")."""
    pitch_classes = ('C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B')
    # Twelve semitones per octave; the octave numbering is shifted by one so that 60 is C4.
    octave_index, pitch_class = divmod(midi_number, 12)
    return f"{pitch_classes[pitch_class]}{octave_index - 1}"

# Function to group and process chords
def form_chords(df):
    """Collapse notes that start together in the same hand into one row per chord.

    Rows are grouped by (``hand``, ``start_beat``). A group of one keeps its row
    and gets the note name as ``chord``. A larger group is represented by its
    first row with:

    * ``chord``      - the note names joined with "_", sorted in reverse string
                       order (lexicographic on the names, not by pitch);
    * ``note_value`` - the most frequent value in the group;
    * ``duration``   - the longest duration in the group.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``hand``, ``start_beat``, ``note``, ``note_value``
        and ``duration``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per single note or chord, sorted by ``start_beat``, with a new
        ``chord`` column. Original index labels of the representative rows are
        kept.
    """
    # Work on a copy so the helper columns never leak into the caller's frame.
    df = df.copy()
    df["note_name"] = df["note"].apply(midi_to_note_name)
    df["chord"] = ""

    if df.empty:
        # Nothing to group; return the expected schema instead of failing below.
        return df.drop(columns=["note_name"])

    records = []
    for _, group in df.groupby(["hand", "start_beat"]):
        # The first row of the group is the representative for the whole chord.
        row = group.iloc[0].copy()
        if len(group) == 1:
            row["chord"] = row["note_name"]
        else:
            chord_notes = sorted(group["note_name"].tolist(), reverse=True)
            row["chord"] = "_".join(chord_notes)
            row["note_value"] = Counter(group["note_value"]).most_common(1)[0][0]
            row["duration"] = group["duration"].max()
        records.append(row)

    result = pd.DataFrame(records)
    result = result.drop(columns=["note_name"])
    result = result.sort_values(by="start_beat")

    return result

In [None]:
# Collapse simultaneous notes of the demo file into chord rows and inspect the result.
df = form_chords(df)
df.head(15)

Unnamed: 0,hand,channel,note,velocity,start_beat,duration,note_value,chord
0,Left,1,37,80,0.0,0.489583,Eighth,C#2
1,Right,0,44,80,0.25,0.489583,Eighth,G#2
3,Right,0,49,80,0.5,0.489583,Eighth,C#3
2,Left,1,44,80,0.5,0.489583,Eighth,G#2
4,Right,0,52,80,0.75,0.489583,Eighth,E3
6,Right,0,56,80,1.0,0.489583,Eighth,G#3
5,Left,1,37,80,1.0,0.489583,Eighth,C#2
7,Right,0,49,80,1.25,0.489583,Eighth,C#3
9,Right,0,52,80,1.5,0.489583,Eighth,E3
8,Left,1,44,80,1.5,0.489583,Eighth,G#2


In [None]:
# Preprocess every MIDI file: parse + label, form chords, write one CSV per piece id.
preprocessed_dir = f"{data_path}/preprocessed_csvs"
os.makedirs(preprocessed_dir, exist_ok=True)  # to_csv does not create missing folders

# glob returns paths in a filesystem-dependent order; sort for a reproducible run/log.
for file in sorted(glob.glob(f"{data_path}/midis/*.mid")):
    print(file)
    # basename handles both "/" and the platform's own separator
    file_name = os.path.basename(file)
    file_id = file_name.split("_")[0]

    df = create_df(file_name)

    # create_df returns None for files with too many unlabelled notes: skip them
    if df is None:
        continue

    df = form_chords(df)
    # The index is written on purpose: the loading cell drops the resulting "Unnamed: 0" column.
    df.to_csv(f"{preprocessed_dir}/{file_id}.csv")

/content/drive/MyDrive/Colab Notebooks/grad_proj/data/midis/2345_ps14_03.mid
Time signature: 4/4
# of Signature Changes: 1
Length of parsed df: 6549
Length of original df: 6548
# of left and right notes: 3077, 3472
# of nans as note values: 1
/content/drive/MyDrive/Colab Notebooks/grad_proj/data/midis/2374_ps19_02.mid
Time signature: 6/8
# of Signature Changes: 1
Length of parsed df: 2027
Length of original df: 2026
# of left and right notes: 1040, 987
# of nans as note values: 1
/content/drive/MyDrive/Colab Notebooks/grad_proj/data/midis/2359_ps15_04.mid
Time signature: 6/8
# of Signature Changes: 2
Length of parsed df: 2638
Length of original df: 2638
# of left and right notes: 1030, 1608
# of nans as note values: 0
/content/drive/MyDrive/Colab Notebooks/grad_proj/data/midis/2371_ps09_02.mid
Time signature: 3/4
# of Signature Changes: 1
Length of parsed df: 1275
Length of original df: 1274
# of left and right notes: 521, 754
# of nans as note values: 1
/content/drive/MyDrive/Colab No

In [None]:
def create_token_sequence(df, quantize=False, quantize_step=0.125):
    """Turn a chord-level note table into a flat list of string tokens.

    Rows are ordered by (``start_beat``, ``hand``). Each row emits, in order:
    a ``TIME_SHIFT_<delta>`` token (beats since the previous row, 3 decimals),
    a ``[HAND_<HAND>]`` token, then either ``NOTE_<name>`` + ``VALUE_<value>``
    for a single note or ``[CHORD_START]``, one ``NOTE_<name>`` per chord
    member, ``VALUE_<value>``, ``[CHORD_END]`` when the ``chord`` string
    contains an underscore.

    With ``quantize=True`` the start beats, durations and time shifts are
    snapped to the nearest multiple of ``quantize_step``. The input frame is
    never modified.
    """
    def snap(value):
        if not quantize:
            return value
        return round(value / quantize_step) * quantize_step

    ordered = df.copy()
    if quantize:
        ordered["start_beat"] = ordered["start_beat"].apply(snap)
        ordered["duration"] = ordered["duration"].apply(snap)
    ordered = ordered.sort_values(by=["start_beat", "hand"])

    tokens = []
    last_beat = 0.0

    for _, event in ordered.iterrows():
        beat = event["start_beat"]
        time_shift = snap(beat - last_beat)
        tokens.append(f"TIME_SHIFT_{time_shift:.3f}")
        last_beat = beat

        hand = event["hand"]
        tokens.append(f"[HAND_{hand.upper()}]")

        chord_str = event["chord"]
        note_value = event["note_value"]

        # No underscore means a single note: emit it and move on.
        if "_" not in chord_str:
            tokens.extend([f"NOTE_{chord_str}", f"VALUE_{note_value}"])
            continue

        tokens.append("[CHORD_START]")
        tokens.extend(f"NOTE_{note}" for note in chord_str.split("_"))
        tokens.extend([f"VALUE_{note_value}", "[CHORD_END]"])

    return tokens

In [None]:
class MusicTokenizer:
    """Two-way mapping between string tokens and integer ids.

    Ids are assigned in order of first appearance, starting at 0. Input
    "sequences" are iterables of strings; each string is stripped and split on
    whitespace into tokens.
    """

    def __init__(self):
        self.token_to_id = {}
        self.id_to_token = {}

    def build_vocab(self, sequences):
        """Add every token not seen before to the vocabulary."""
        for text in sequences:
            for token in text.strip().split():
                if token in self.token_to_id:
                    continue
                new_id = len(self.token_to_id)
                self.token_to_id[token] = new_id
                self.id_to_token[new_id] = token

    def encode(self, sequences):
        """Map each string to its list of token ids (KeyError on unknown tokens)."""
        encoded = []
        for text in sequences:
            encoded.append([self.token_to_id[token] for token in text.strip().split()])
        return encoded

    def decode(self, id_sequences):
        """Map each list of ids back to a space-joined token string."""
        decoded = []
        for ids in id_sequences:
            decoded.append(" ".join(self.id_to_token[token_id] for token_id in ids))
        return decoded

    def decode2(self, id_sequences):
        """Decode one flat list of ids; returns a single-element list holding the joined string."""
        tokens = [self.id_to_token[token_id] for token_id in id_sequences]
        return [" ".join(tokens)]

In [None]:
def prepare_training_data(token_sequence, tokenizer, seq_len=256, inputs=None, targets=None):
    """Append next-token training windows built from one token sequence.

    Every run of ``seq_len`` consecutive token ids becomes one input sample and
    the id that follows it becomes its target (stride 1). Tokens missing from
    the tokenizer's vocabulary are skipped before windowing. A sequence with
    ``seq_len`` or fewer known tokens adds nothing.

    Parameters
    ----------
    token_sequence : iterable of str
        Tokens of one piece, in order.
    tokenizer : object
        Must expose a ``token_to_id`` mapping.
    seq_len : int, default 256
        Window length.
    inputs, targets : list, optional
        Lists to append to. When omitted, the notebook-level ``x`` and ``y``
        lists are used, which is what the existing call relies on.

    Returns
    -------
    tuple(list, list)
        The ``inputs`` and ``targets`` lists that were appended to.
    """
    # Backward compatibility: the original implementation always wrote to the
    # module-level accumulators.
    if inputs is None:
        inputs = x
    if targets is None:
        targets = y

    vocab = tokenizer.token_to_id
    token_ids = [vocab[token] for token in token_sequence if token in vocab]

    for start in range(len(token_ids) - seq_len):
        stop = start + seq_len
        inputs.append(token_ids[start:stop])
        targets.append(token_ids[stop])

    return inputs, targets

In [None]:
# Accumulators for all training windows; prepare_training_data appends to these module-level lists.
x, y = [], []
tokenizer = MusicTokenizer()
max_seq_length = 512

# glob's order depends on the filesystem. Sorting makes the token ids (assigned in
# first-seen order) and the sample order reproducible, so the seeded shuffle later is too.
for file in sorted(glob.glob(f"{data_path}/preprocessed_csvs/*.csv")):
    df = pd.read_csv(file)
    # "Unnamed: 0" is the index column written by to_csv in the preprocessing cell.
    df = df.drop(columns=["Unnamed: 0"])

    sequence = create_token_sequence(df)
    # The vocabulary is extended before windowing so every token of this file has an id.
    tokenizer.build_vocab(sequence)
    prepare_training_data(sequence, tokenizer, seq_len=max_seq_length)

In [None]:
import random
from sklearn.model_selection import train_test_split

# Dedicated generator instance, seeded for repeatable shuffling
rng = random.Random(42)

def train_test_val_split(x, y, tokenizer, seq_len):
    """Shuffle the samples and split them 80% / 10% / 10% into train, validation and test.

    ``x`` and ``y`` are shuffled together with the notebook-level ``rng``,
    converted to int32 arrays and split twice with ``train_test_split``
    (``random_state=42``). ``tokenizer`` and ``seq_len`` are accepted but not
    used.

    Returns ``(x_train, x_test, x_val, y_train, y_test, y_val)`` - note the
    test arrays come before the validation arrays.
    """
    # Keep each sample paired with its target while shuffling.
    pairs = list(zip(x, y))
    rng.shuffle(pairs)
    shuffled_x, shuffled_y = zip(*pairs)

    features = np.array(shuffled_x, dtype=np.int32)
    labels = np.array(shuffled_y, dtype=np.int32)

    # First cut: 80% train, 20% held out.
    x_train, x_holdout, y_train, y_holdout = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    # Second cut: the held-out part is halved into validation and test.
    x_val, x_test, y_val, y_test = train_test_split(
        x_holdout, y_holdout, test_size=0.5, random_state=42
    )

    return x_train, x_test, x_val, y_train, y_test, y_val

In [None]:
# Unpacking order follows the function's return order: test comes before validation.
x_train, x_test, x_val, y_train, y_test, y_val = train_test_val_split(x, y, tokenizer, seq_len=max_seq_length)

In [None]:
# Persist the six split arrays as .npy files directly under data_path.
np.save(f'{data_path}/x_train.npy', x_train)
np.save(f'{data_path}/x_test.npy', x_test)
np.save(f'{data_path}/x_val.npy', x_val)
np.save(f'{data_path}/y_train.npy', y_train)
np.save(f'{data_path}/y_test.npy', y_test)
np.save(f'{data_path}/y_val.npy', y_val)

In [None]:
# Save the fitted tokenizer (vocabulary) next to the model artifacts.
# NOTE(review): pickle stores the class by reference, so loading this file needs a
# MusicTokenizer definition available in the loading session; only unpickle trusted files.
with open(f"{model_path}/tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)