## Goal
preprocess wav,midi into ingestible format, line up onsets

mp3 -> wav -> stft/cqt -> matrix[num_onsets][frame_size/sequence length][freq_bins]

midi -> piano_roll matrix[num_onsets][note_range] -> [Pitches vs Timestep] 2d-matrix

num_onsets = batch size?
sequence length depends on the sampling rate (most sr 16000)
bin size is frequency resolution

timestep/len_feats = 200, notes = 88 (piano)

timestep = 1/fs

input of the model: Input(shape=(len_feats, nb_notes))

In [87]:
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import streamlit as st
import h5py

import collections
import datetime
# import fluidsynth
import glob
import numpy as np
import pathlib
import pandas as pd
import pretty_midi
import seaborn as sns
#import tensorflow as tf

from IPython import display
from matplotlib import pyplot as plt
from typing import Optional

ModuleNotFoundError: No module named 'globals.py'; 'globals' is not a package

In [94]:
seed = 42
#tf.random.set_seed(seed)
np.random.seed(seed)

# Sampling rate for audio playback
CONTEXT_WINDOW_ROWS = 88
HOP_LENGTH = 512
SAMPLING_RATE = 22050
# the num of samples per sec of 1 frame in the spectrogram
SECONDS = 5 #duration of clips 
BINS_PER_OCTAVE = 36

In [3]:
# data_dir = pathlib.Path('data/SMD_raw')
mid_data_dir = 'data/midi'
wav_data_dir = 'data/wav'
mid_filenames = glob.glob(str(mid_data_dir + '**/*.mid*'))
wav_filenames = glob.glob(str(wav_data_dir + '**/*.wav*')) #tk

print('Number of mid files:', len(mid_filenames))
print('Number of wav files:', len(wav_filenames))

# example use
wav_sample_file = wav_filenames[0]
print(wav_sample_file)

mid_sample_file = mid_filenames[0]
mid_sample_file2 = mid_filenames[1]

print(mid_sample_file)

Number of mid files: 50
Number of wav files: 50
data/wav/Chopin_Op028-17_005_20100611-SMD.wav
data/midi/Beethoven_Op031No2-03_002_20090916-SMD.mid


# wav

In [4]:
def lame_process(input_file, output_file):
    '''
    Converts a single MP# file to WAV format using LAME decoder
    param input_file: path to an input MP3 file
    param output_file: path to save the output WAV file
    '''
    os.system(f'lame --decode --quiet "{input_file}" "{output_file}"')


In [5]:
def convert_mp3_to_wav(input_path, output_path):
    """
    Convert all MP3 files in a directory or a single MP3 file to WAV format
    param input_path: Path to the directory containing MP3 files
    param output_path: Path to save the converted WAV files to
    """
    if os.path.isdir(input_path):
        input_files = [file for file in os.listdir(input_path) if file.lower().endswith('.mp3')]
        for file_name in input_files:
            print(f'Processing {input_path}/{file_name}')
            output_file = os.path.splitext(file_name)[0] + '.wav'
            lame_process(os.path.join(input_path, file_name), os.path.join(output_path, output_file))
    else:
        lame_process(input_path, output_path)

In [6]:
convert_mp3_to_wav('../data/SMD_raw', "../data/wav")

Can't init infile '../data/SMD_raw'


In [106]:

def wav_to_spectrogram(wav_file):
    """
    Converts a wav file into a tensor input for transformer model, breaks uptensor input into 5 second segments
    param wav: path to a wav file 
    """
    y, sr = librosa.load(wav_file)
    spectrogram = librosa.cqt(y, sr = SAMPLING_RATE, bins_per_octave = BINS_PER_OCTAVE) 
    
    #print(spectrogram.shape) #(num_freq_bins, num_time_frames)
    spectrogram = spectrogram.T #(num_time_frames,num_freq_bins)
    spectrogram = librosa.amplitude_to_db(np.abs(spectrogram), ref = np.max) #convert to dB scale 
    minDB = np.min(spectrogram)
    
    #print(f'Minimum: {np.min(spectrogram)}, Maximum: {np.max(spectrogram)}, Mean: {np.mean(spectrogram)}') 
    window_size = librosa.time_to_frames(SECONDS, sr = sr)
    windows = [] #5 seconds worth of frames

    pad_width = ((0, window_size - (spectrogram.shape[0] % window_size)), (0,0))
    spectrogram = np.pad(spectrogram, pad_width, 'constant', constant_values=minDB) #pad spectrogram to split into 5 second frames
    #print(f'Padded spectrogram shape: {spectrogram.shape}')

    for i in range(0, spectrogram.shape[0], window_size):
        w = spectrogram[i:i + window_size, :]
        windows.append(w)
    
    windows = np.array(windows) 
    print(f'spectrogram final shape: {windows.shape}')

    return windows


In [12]:
def plot_spectrogram(spectrogram, sr):
    """
    Creates a visualization of a spectrogram, for testing purpose
    """
    plt.figure(figsize=(10, 4))
    librosa.display.specshow(spectrogram, sr=sr, x_axis='time', y_axis='mel')
    plt.colorbar(format='%+2.0f dB')
    plt.title('CQT Spectrogram')
    plt.tight_layout()
    plt.show()

# mid

In [101]:
def midi_to_piano_roll(midi_file_path, start_pitch=19, end_pitch=107, sr= SAMPLING_RATE):
    '''
    Returns an np array of the piano roll representation of a midi file, 
    with 88 notes representing those of a piano keyboard rather than 
    the default 128 notes, that is, from MIDI note 21 (A0) to MIDI note 108 (C8).
    '''
    midi_data = pretty_midi.PrettyMIDI(midi_file_path)

    raw_piano_roll = midi_data.get_piano_roll(fs=sr)[start_pitch:end_pitch]
    # 0 meaning not played -> converting it into binary representation
    piano_roll = raw_piano_roll > 0
    piano_roll = np.asarray(piano_roll).astype(int).T
    print(f'shape of pianoroll: {piano_roll.shape}')

    remainder = SECONDS*SAMPLING_RATE - (piano_roll.shape[0] % (SECONDS*SAMPLING_RATE))
    pad_width = ((0, remainder), (0,0))
    piano_roll = np.pad(piano_roll, pad_width, 'constant', constant_values=0)
    piano_roll = np.reshape(piano_roll, (-1, SECONDS * SAMPLING_RATE, piano_roll.shape[1]))
    print(f'shape of pianoroll after splitting: {piano_roll.shape}')

    return piano_roll
                    
    

In [107]:
def plot_piano_roll(midi_file_path, name_fig, start_pitch=19, end_pitch=107, sr=SAMPLING_RATE):
    """
    Use librosa's specshow function for displaying the piano roll (in streamlit framework)
    """

    fig = plt.figure(figsize=(10,8))
    midi_data = midi_data = pretty_midi.PrettyMIDI(midi_file_path)

    raw_piano_roll = midi_data.get_piano_roll(fs=sr)[start_pitch:end_pitch]

    librosa.display.specshow(raw_piano_roll,
                             hop_length=1, x_axis='time', y_axis='cqt_note',
                             fmin=pretty_midi.note_number_to_hz(start_pitch))
    plt.title(f"{name_fig}", fontsize="x-large")
    plt.xlabel("Time (s)", fontsize="x-large")
    plt.ylabel("Pitch", fontsize="x-large")
    st.pyplot(fig)


In [102]:
# Example usage
midi_file_path1 = "data/midi/bach1.mid"
midi_file_path2 = "data/midi/bach2.mid"


#midi_file_path1 = mid_sample_file
#midi_file_path2 = mid_sample_file2

midi_to_piano_roll(midi_file_path1)
midi_to_piano_roll(midi_file_path2)


#plot_piano_roll(midi_file_path1, "Piano Roll 1")
#plot_piano_roll(midi_file_path2, "Piano Roll 2")

shape of pianoroll: (3451605, 88)
shape of pianoroll after splitting: (32, 110250, 88)
shape of pianoroll: (5198563, 88)
shape of pianoroll after splitting: (48, 110250, 88)


array([[[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       ...,

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 

In [108]:
def get_cqt_and_pianoroll(wav_path, midi_dir, output_dir):
    '''
    Converts a .wav file into CQT representation and grab the
    corresponding .midi file'''

    file_name = os.path.basename(wav_path).replace('.wav', '')
    # print(file_name)
    mid_path = midi_dir + "/" + file_name + '.mid'

    if not os.path.exists(mid_path):
        print("No MIDI file found:" + mid_path)
        return
    
    # print("Found CQT and pianorolls for " + file_name)
    
    cqt = wav_to_spectrogram(wav_path) 
    piano_roll = midi_to_piano_roll(mid_path)

    print("converted CQT and pianorolls for " + file_name)

    h5_name = output_dir + file_name + ".h5"

    with h5py.File(h5_name, 'w') as hf: 

        if not os.path.exists(h5_name):
            hf.create_dataset("pianoroll", data=piano_roll)
            hf.create_dataset("cqt", data=cqt)
        else: 
            print("h5 already existed")



In [110]:
def preprocess_wav(wav_dir, midi_dir, output_dir):
    wav_paths = glob.glob(str(wav_dir + '**/*.wav*'))  
    print('Number of wav files:', len(wav_paths))

    for wav_path in wav_paths:
        get_cqt_and_pianoroll(wav_path, midi_dir, output_dir)

preprocess_wav("data/wav", "data/midi", "data/pre_out")

Number of wav files: 2
spectrogram final shape: (32, 215, 84)
shape of pianoroll: (3451605, 88)
shape of pianoroll after splitting: (32, 110250, 88)
converted CQT and pianorolls for bach1
h5 already existed
spectrogram final shape: (48, 215, 84)
shape of pianoroll: (5198563, 88)
shape of pianoroll after splitting: (48, 110250, 88)
converted CQT and pianorolls for bach2
h5 already existed


# Load Data