In [13]:
import os
import music21 as m21
import json
import keras
import numpy as np

import warnings
warnings.filterwarnings("ignore", category=UserWarning) # cuz my version of python complains about deprecated stuff

In [16]:
# --- Pipeline configuration ---

# Number of time steps in each training window.
sequence_length = 64
# Note/rest lengths (in quarter notes) a song may contain to be kept.
acceptable_durations = [0.25, 0.5, 0.75, 1.0, 1.5, 2, 3, 4]

# Folder holding the raw song files.
songs_path = "songs"
# Folder that receives one encoded text file per kept song.
preprocessing_path = "preprocessing"
# Single text file holding every encoded song back to back.
single_file_path = "input"
# JSON file storing the symbol -> integer vocabulary.
mapping_path = "mapping.json"
# File the trained model is loaded from for generation.
model_path = "model.h5"

### Preprocessing

This is the hardest part by far. I have some experience with music theory, which kinda helped.
I made my own training data inspired by snippets of the lofi songs. These followed a strict format, using only single notes or triad chords (major, minor, or diminished).
Once these were made, I transposed them to C major or A minor (to avoid sharps/flats) and encoded the songs using a custom mapping that only contains the single notes or triads. The mappings follow the MIDI numbers for each note, and the notes of a chord are joined by periods (e.g. `48.52.55`). Rests (`r`), held time steps (`_`), and song boundaries (`/`) also have their own symbols to establish rhythm during encoding.

    "48.52.55": 1, # C major
    "50.53.57": 2, # D minor
    "52.55.59": 3, # E minor
    "53.57.60": 4, # F major
    "55.59.62": 5, # G major
    "57.60.64": 6, # A minor
    "59.62.65": 7, # B diminished
    "60.64.67": 8, # C major (higher octave)
    "62.65.69": 9, # D minor (higher octave)
    "64.67.71": 10, # E minor (higher octave)
    "65.69.72": 11, # F major (higher octave)
    "67.71.74": 12, # G major (higher octave)
    "69.72.76": 13, # A minor (higher octave)

In [3]:
def load_songs(folder_path):
    """Parse every song file in ``folder_path`` with music21.

    Entries are visited in sorted order so the returned list is the same on
    every run; ``os.listdir`` alone yields names in arbitrary order.
    Sub-directories and hidden entries (for example ``.DS_Store`` or
    ``.ipynb_checkpoints``) are skipped because they are not song files.

    Args:
        folder_path: Directory containing the song files.

    Returns:
        A list with one parsed music21 object per song file.
    """
    songs = []

    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        # Only regular, non-hidden files are handed to the parser
        if file_name.startswith(".") or not os.path.isfile(file_path):
            continue
        songs.append(m21.converter.parse(file_path))

    return songs

def has_acceptable_notes(song, acceptable_durations):
    """Tell whether every note and rest in ``song`` has an allowed length.

    Args:
        song: Object exposing ``flatten().notesAndRests`` (a music21 stream).
        acceptable_durations: Collection of allowed quarter-note lengths.

    Returns:
        True when all events have a ``duration.quarterLength`` contained in
        ``acceptable_durations`` (also for a song without events), else False.
    """
    # Flattening leaves a single sequence of events; notesAndRests drops the rest
    events = song.flatten().notesAndRests
    return all(
        event.duration.quarterLength in acceptable_durations
        for event in events
    )
        
def transpose(song):
    """Transpose ``song`` to C major or A minor based on its analysed key.

    Args:
        song: Object exposing ``analyze('key')`` and ``transpose(interval)``
            (a music21 stream).

    Returns:
        The result of ``song.transpose(...)`` with the interval that moves
        the analysed tonic to C (major mode) or A (minor mode).

    Raises:
        ValueError: If the analysed mode is neither ``'major'`` nor
            ``'minor'``; previously this surfaced as an unbound local.
    """
    # Target tonic per mode
    target_tonics = {"major": "C", "minor": "A"}

    key = song.analyze('key')
    if key.mode not in target_tonics:
        raise ValueError(f"Cannot transpose a song in unsupported mode {key.mode!r}")

    target = m21.pitch.Pitch(target_tonics[key.mode])
    interval = m21.interval.Interval(key.tonic, target)
    return song.transpose(interval)

def encode_song(song, time_step = 0.25):
    """Encode ``song`` as a space-separated time series of symbols.

    Each note, rest or chord contributes one symbol at the step where it
    starts, followed by ``"_"`` for every further step it is held:

    * note  -> its MIDI number, e.g. ``"60"``
    * rest  -> ``"r"``
    * chord -> MIDI numbers of its pitches joined by ``"."``

    Args:
        song: Object exposing ``flatten().notesAndRests`` (a music21 stream).
        time_step: Length of one step in quarter notes.

    Returns:
        The encoded song as a single string.

    Raises:
        TypeError: If an element is not a Note, Rest or Chord. Previously
            such an element silently reused the preceding symbol (or failed
            with an unbound local when it came first).
    """
    encoded_song = []

    for event in song.flatten().notesAndRests:
        if isinstance(event, m21.note.Note):
            symbol = str(event.pitch.midi)
        elif isinstance(event, m21.note.Rest):
            symbol = "r"
        elif isinstance(event, m21.chord.Chord):
            symbol = ".".join(str(pitch.midi) for pitch in event.pitches)
        else:
            raise TypeError(f"Cannot encode element of type {type(event).__name__}")

        # round() instead of int(): a float ratio such as 0.3 / 0.1 evaluates
        # to 2.9999999999999996 and would otherwise lose a step
        steps = round(event.duration.quarterLength / time_step)
        if steps > 0:
            encoded_song.append(symbol)
            encoded_song.extend(["_"] * (steps - 1))

    return " ".join(encoded_song)

def load(path):
    """Return the full text content of the file at ``path``."""
    with open(path, "r") as handle:
        return handle.read()

def condense(dataset_path, file_dataset_path, sequence_length=64):
    """Concatenate all encoded songs into one string and write it to a file.

    Every song is followed by a space and a delimiter of ``sequence_length``
    ``"/ "`` symbols. Files are read in sorted order so the result is
    reproducible, and sub-directories or hidden entries (for example
    ``.ipynb_checkpoints``) inside ``dataset_path`` are skipped.

    Args:
        dataset_path: Folder containing one encoded text file per song.
        file_dataset_path: Path of the combined output file.
        sequence_length: Number of ``"/"`` symbols written after each song.

    Returns:
        The combined string that was written to ``file_dataset_path``.
    """
    delimiter = "/ " * sequence_length

    pieces = []
    for file_name in sorted(os.listdir(dataset_path)):
        path = os.path.join(dataset_path, file_name)
        if file_name.startswith(".") or not os.path.isfile(path):
            continue
        pieces.append(load(path) + " " + delimiter)

    # Drops only the trailing space; the delimiter after the last song stays
    songs = "".join(pieces)[:-1]

    with open(file_dataset_path, "w") as f:
        f.write(songs)

    return songs


def preprocess(folder_path):
    """Load, filter, transpose and encode every song in ``folder_path``.

    Songs containing a duration outside the module-level
    ``acceptable_durations`` are skipped. Each kept song is written to
    ``<preprocessing_path>/<index>.txt``, where ``index`` is the song's
    position in the loaded list (skipped songs leave gaps in the numbering).

    Args:
        folder_path: Directory containing the raw song files.
    """
    # load everything
    print("Loading songs...")
    songs = load_songs(folder_path)
    print(f"Loaded {len(songs)} songs.\n")

    # open(..., "w") does not create missing folders, so make sure it exists
    os.makedirs(preprocessing_path, exist_ok=True)

    for i, song in enumerate(songs):
        # filter out songs that have weird notes and duration
        if not has_acceptable_notes(song, acceptable_durations):
            continue

        # transpose to either C major or A minor, since both have no flats/sharps and you need both major/minor for mood
        transposed_song = transpose(song)

        # encode to time series
        encoded_song = encode_song(transposed_song)

        save_path = os.path.join(preprocessing_path, f"{i}.txt")
        with open(save_path, "w") as f:
            f.write(encoded_song)

        

In [4]:
# Encode every song, then merge the encoded files into one training text.
preprocess(songs_path)
# Use the configured window length so the delimiter always matches training
songs = condense(preprocessing_path, single_file_path, sequence_length=sequence_length)

Loading songs...
Loaded 93 songs.



### Mapping to Readable Notation

In [5]:
def create_mapping(songs, mapping_path):
    """Build the symbol -> integer vocabulary and save it as JSON.

    Args:
        songs: Whitespace-separated string of encoded symbols.
        mapping_path: Destination of the JSON file (indented by 4 spaces).
    """
    # Sorting the distinct symbols gives every symbol a stable index
    unique_symbols = sorted(set(songs.split()))
    mappings = {symbol: index for index, symbol in enumerate(unique_symbols)}

    with open(mapping_path, "w") as handle:
        json.dump(mappings, handle, indent=4)

def convert_to_int(songs):
    """Translate a string of symbols into their integer codes.

    The vocabulary is read from the JSON file at the module-level
    ``mapping_path``.

    Args:
        songs: Whitespace-separated string of encoded symbols.

    Returns:
        A list with the integer code of each symbol, in order. A symbol
        missing from the vocabulary raises ``KeyError``.
    """
    with open(mapping_path, "r") as handle:
        vocabulary = json.load(handle)

    return [vocabulary[symbol] for symbol in songs.split()]

def generate_batches(sequence_length):
    """Build one-hot training windows and their next-symbol targets.

    The encoded songs are read from the module-level ``single_file_path`` and
    converted to integers. A window of ``sequence_length`` symbols slides over
    them one step at a time; the symbol right after each window is its target.

    Args:
        sequence_length: Number of symbols in every input window.

    Returns:
        A tuple ``(inputs, targets)``: ``inputs`` is the one-hot encoding of
        the windows produced by ``keras.utils.to_categorical`` and ``targets``
        is a NumPy array with the integer code that follows each window.
    """
    int_songs = convert_to_int(load(single_file_path))

    # One window per start position that still has a following symbol
    window_starts = range(len(int_songs) - sequence_length)
    inputs = [int_songs[start:start + sequence_length] for start in window_starts]
    targets = [int_songs[start + sequence_length] for start in window_starts]

    # Number of distinct codes that occur in the data
    vocab_size = len(set(int_songs))
    inputs = keras.utils.to_categorical(inputs, num_classes=vocab_size)

    return inputs, np.array(targets)
    


In [6]:
# Write the vocabulary, then build the one-hot windows used for training.
create_mapping(songs, mapping_path)
# Use the configured window length rather than repeating the literal
inputs, targets = generate_batches(sequence_length=sequence_length)

### The model

In [11]:
# Take the vocabulary size from the one-hot training data instead of
# hard-coding it, so the layer sizes keep matching when the dataset changes.
vocab_size = inputs.shape[-1]
model = keras.Sequential()
# None in the time dimension lets the network accept windows of any length
model.add(keras.Input(shape=(None, vocab_size)))
model.add(keras.layers.LSTM(128))
model.add(keras.layers.Dropout(0.2))
# One softmax probability per symbol in the vocabulary
model.add(keras.layers.Dense(vocab_size, activation='softmax'))

# Sparse loss because the targets are integer codes, not one-hot vectors
model.compile(loss=keras.losses.SparseCategoricalCrossentropy(), optimizer=keras.optimizers.Nadam(), metrics=["accuracy"])

model.summary()

In [12]:
model.fit(inputs, targets, epochs=40, batch_size=64)
# Save to the configured path so the loading cell reads the same file
model.save(model_path)

Epoch 1/40
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step - accuracy: 0.6365 - loss: 2.1939
Epoch 2/40
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.7876 - loss: 1.1620
Epoch 3/40
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.8056 - loss: 1.0837
Epoch 4/40
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.8122 - loss: 1.0197
Epoch 5/40
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.8240 - loss: 0.9579
Epoch 6/40
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.8259 - loss: 0.9510
Epoch 7/40
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.8253 - loss: 0.9382
Epoch 8/40
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.8292 - loss: 0.9018
Epoch 9/40
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━



In [20]:
# Reload the trained model and the symbol -> integer vocabulary from disk.
generator = keras.models.load_model(model_path)
# The context manager closes the file; a bare open() left the handle dangling
with open(mapping_path, "r") as f:
    mappings = json.load(f)



In [35]:
def generate(model, mappings, sequence_length, seed, num_steps=500, temperature=1.0):
    """Sample a continuation of ``seed`` from the trained model.

    At every step the last ``sequence_length`` symbols are one-hot encoded,
    passed to ``model.predict`` and the next symbol is drawn from the
    temperature-scaled prediction.

    Args:
        model: Object with a Keras-style ``predict(x, verbose=0)`` method that
            returns one probability vector per input sequence.
        mappings: Dict mapping each symbol to its integer code.
        sequence_length: Maximum number of trailing symbols fed to the model.
        seed: Whitespace-separated symbols that start the melody.
        num_steps: Number of symbols to generate.
        temperature: Sampling temperature; values below 1 make the choice
            more deterministic, values above 1 more random.

    Returns:
        The seed followed by the generated symbols, joined by spaces.

    Raises:
        ValueError: If ``temperature`` is not greater than 0.
        KeyError: If ``seed`` contains a symbol missing from ``mappings``.
    """
    if temperature <= 0:
        raise ValueError("temperature must be greater than 0")

    reverse_mapping = {index: symbol for symbol, index in mappings.items()}
    vocab_size = len(mappings)

    output = [mappings[symbol] for symbol in seed.split()]

    for _ in range(num_steps):
        seed_sequence = output[-sequence_length:]
        onehot = keras.utils.to_categorical(seed_sequence, num_classes=vocab_size)
        onehot = np.expand_dims(onehot, axis=0)  # add the batch dimension

        preds = model.predict(onehot, verbose=0)[0]

        # Temperature-scaled softmax in float64. Subtracting the maximum
        # keeps at least one exponent at exp(0) == 1, so a low temperature
        # can no longer underflow every entry to 0 and produce NaNs.
        logits = np.log(np.asarray(preds, dtype=np.float64) + 1e-8) / temperature
        logits -= np.max(logits)
        probabilities = np.exp(logits)
        probabilities /= np.sum(probabilities)

        next_idx = int(np.random.choice(vocab_size, p=probabilities))
        output.append(next_idx)

    return " ".join(reverse_mapping[index] for index in output)

def _element_from_symbol(symbol):
    """Create the music21 element that a non-hold symbol stands for."""
    if symbol == "r" or symbol == "/":
        # Song delimiters are rendered as rests as well
        return m21.note.Rest()
    if "." in symbol:
        return m21.chord.Chord([int(part) for part in symbol.split(".")])
    return m21.note.Note(int(symbol))


def decode_song(encoded_song, time_step=0.25, output_path="generated_song.mid"):
    """Convert an encoded song back to music and write it as a MIDI file.

    Every symbol other than ``"_"`` starts a new event (note, chord or rest);
    each following ``"_"`` extends that event by one ``time_step``. The final
    event also receives its accumulated length; previously it kept the
    element's default duration because nothing followed it.

    Args:
        encoded_song: Whitespace-separated symbols as produced by the encoder.
        time_step: Length of one step in quarter notes.
        output_path: File the MIDI data is written to.
    """
    song_stream = m21.stream.Stream()
    pending = None   # event whose length is still being counted
    steps_held = 0

    for symbol in encoded_song.split():
        if symbol == "_":
            steps_held += 1
            continue

        # A new event starts: the previous one is complete, so store it
        if pending is not None:
            pending.duration = m21.duration.Duration(steps_held * time_step)
            song_stream.append(pending)

        pending = _element_from_symbol(symbol)
        steps_held = 1

    # Flush the last event, which has no successor to trigger the store
    if pending is not None:
        pending.duration = m21.duration.Duration(steps_held * time_step)
        song_stream.append(pending)

    song_stream.write("midi", fp=output_path)

In [39]:
seed = "67 _ _ _ _ _ 69 _ _ 71 _ _ _ _ _ 72 _ _ _ _ 71 _ _ 69 _ _ _ _"  # or take from your dataset
# Sample from the model reloaded from disk (generator) with the configured
# window length, so this cell also works without re-running the training cell
generated = generate(generator, mappings, sequence_length=sequence_length, seed=seed, num_steps=200, temperature=2)
decode_song(generated)