In [1]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from tqdm import tqdm
import torch
# Configure Bokeh so that plots passed to show() render inline in the notebook.
output_notebook()

In [2]:
# Number of bars in each model-input window (passed to extract_data as input_bars).
input_bars = 32
# Hop, in bars, between the starts of consecutive windows (passed to extract_data).
hop_n_bars = 1
# NOTE(review): not read anywhere in the visible cells; presumably toggles moving
# the extracted tensors to the GPU - TODO confirm against the later cells.
move_all_to_cuda = False

In [3]:
import bz2, pickle
import numpy as np

# Locations of the two bz2-compressed pickles. Each one unpickles to a mapping
# that the following code indexes by song id.
path1 = 'data/lmd/data_bass_groove_train.bz2'
path2 = 'data/lmd/data_drums_full_unsplit.bz2'

# NOTE: pickle.load can execute arbitrary code embedded in the file, so these
# archives must come from a trusted source.
with bz2.open(path1, 'rb') as f:
    instrument1s = pickle.load(f)

with bz2.open(path2, 'rb') as f:
    instrument2s = pickle.load(f)

# Songs that exist in both datasets. The ids are sorted because the iteration
# order of a set is not stable between interpreter sessions (string hashes are
# randomised per process); without a fixed order, the seeded train/test split
# that consumes song_ids_ would differ from run to run.
song_ids_all = sorted(set(instrument1s.keys()) & set(instrument2s.keys()))
song_ids_ = []

for song_id in song_ids_all:
    i1 = instrument1s[song_id]
    i2 = instrument2s[song_id]

    # Both instruments must carry exactly one time signature...
    if len(i1.time_signatures) != 1 or len(i2.time_signatures) != 1:
        continue

    # ...and that single time signature must be 4/4 for both of them.
    ts1 = i1.time_signatures[0]
    ts2 = i2.time_signatures[0]
    if (ts1.numerator, ts1.denominator) == (4, 4) and (ts2.numerator, ts2.denominator) == (4, 4):
        song_ids_.append(song_id)

# Hold out 20% of the filtered songs for testing. random_state pins the shuffle,
# so the split repeats across runs as long as song_ids_ is in the same order.
from sklearn.model_selection import train_test_split

song_ids_train, song_ids_test = train_test_split(song_ids_, test_size=0.2, random_state=42)


In [4]:

def extract_data(song_ids, instrument1s, instrument2s, input_bars, hop_n_bars, input_has_velocity=True, input_has_offset=True):
    '''
    Splits the paired Lakh data into input and output segments for the LTA model.

    Each song is cut into windows of ``input_bars + 2`` bars (16 steps per bar),
    starting every ``hop_n_bars`` bars. The first ``input_bars`` bars of a window
    are the "previous" context and the last 2 bars are the "upcoming" target.
    Windows that run past the end of a song are zero-padded to full length.

    Note: ``adjust_length`` is called on every instrument object, so the objects
    stored in ``instrument1s`` / ``instrument2s`` are expected to be resized in
    place to a common number of steps (at least ``input_bars * 16``).

    :param song_ids:            List of song ids to extract data from
    :param instrument1s:        Dictionary of instrument 1 data (model input), keyed by song id
    :param instrument2s:        Dictionary of instrument 2 data (model output), keyed by song id
    :param input_bars:          Number of bars to use as input
    :param hop_n_bars:          Number of bars to move forward for the next input
    :param input_has_velocity:  Keep the velocity columns of instrument 1 in the input features
                                (default True)
    :param input_has_offset:    Keep the offset columns of instrument 1 in the input features
                                (default True)
    :return:               Tuple of float32 torch tensors containing the input and output data.
                           n_in is the number of selected instrument 1 columns, n_out the number
                           of instrument 2 columns, n_stacked the number of stacked columns
                           (hits of both instruments, then velocities of both if selected,
                           then offsets of both if selected).
        - previous_input_bars:                 Tensor of shape (n_segments, input_bars * 16, n_in)
        - upcoming_input_2_bars:               Tensor of shape (n_segments, 2 * 16, n_in)
        - previous_stacked_input_output_bars:  Tensor of shape (n_segments, input_bars * 16, n_stacked)
        - upcoming_stacked_input_output_2_bars:Tensor of shape (n_segments, 2 * 16, n_stacked)
        - previous_output_bars:                Tensor of shape (n_segments, input_bars * 16, n_out)
        - upcoming_output_2_bars:              Tensor of shape (n_segments, 2 * 16, n_out)
    :raises ValueError:    If the two instruments of a song end up with different lengths,
                           or if no segment could be extracted (empty ``song_ids``)
    '''
    steps_per_bar = 16
    upcoming_steps = 2 * steps_per_bar
    seg_len = input_bars * steps_per_bar + upcoming_steps

    i1_hvos = []
    i2_hvos = []

    # Pass 1: cut every song into fixed-length windows
    for song_id in tqdm(song_ids):
        i1 = instrument1s[song_id]
        i2 = instrument2s[song_id]

        # Bring both instruments to a common length that holds at least one input window
        n_steps = max(i1.hvo.shape[0], i2.hvo.shape[0], input_bars * steps_per_bar)
        i1.adjust_length(n_steps)
        i2.adjust_length(n_steps)

        if i1.hvo.shape[0] != i2.hvo.shape[0]:
            print(i1.hvo.shape, i2.hvo.shape)
            raise ValueError('Shapes do not match')

        n_bars = n_steps // steps_per_bar

        for first_bar in range(0, n_bars - input_bars + 1, hop_n_bars):
            start = first_bar * steps_per_bar
            seg1 = i1.hvo[start:start + seg_len]
            seg2 = i2.hvo[start:start + seg_len]

            if seg1.shape[0] != seg_len:
                # The window runs past the end of the song: zero pad
                seg1 = np.vstack([seg1, np.zeros((seg_len - seg1.shape[0], seg1.shape[1]))])
                seg2 = np.vstack([seg2, np.zeros((seg_len - seg2.shape[0], seg2.shape[1]))])

            i1_hvos.append(seg1)
            i2_hvos.append(seg2)

    if not i1_hvos:
        raise ValueError('No segments could be extracted (is song_ids empty?)')

    previous_input_bars = []
    upcoming_input_2_bars = []
    previous_stacked_input_output_bars = []
    upcoming_stacked_input_output_2_bars = []
    previous_output_bars = []
    upcoming_output_2_bars = []

    # Pass 2: select feature columns and split each window into previous / upcoming parts
    for i1_hvo, i2_hvo in tqdm(zip(i1_hvos, i2_hvos)):
        assert i1_hvo.shape[0] == i2_hvo.shape[0]

        # Columns are laid out as [hits | velocities | offsets], one third each
        n_voices_1 = i1_hvo.shape[-1] // 3
        n_voices_2 = i2_hvo.shape[-1] // 3
        h1 = i1_hvo[:, :n_voices_1]
        v1 = i1_hvo[:, n_voices_1:2*n_voices_1]
        o1 = i1_hvo[:, 2*n_voices_1:]
        h2 = i2_hvo[:, :n_voices_2]
        v2 = i2_hvo[:, n_voices_2:2*n_voices_2]
        o2 = i2_hvo[:, 2*n_voices_2:]

        # Hits are always kept; velocities and offsets are appended on request
        input_columns = [h1]
        stacked_columns = [h1, h2]
        if input_has_velocity:
            input_columns.append(v1)
            stacked_columns.extend([v1, v2])
        if input_has_offset:
            input_columns.append(o1)
            stacked_columns.extend([o1, o2])

        hvo1 = torch.tensor(np.hstack(input_columns), dtype=torch.float32)
        # The output instrument always keeps hits, velocities and offsets
        hvo2 = torch.tensor(np.hstack([h2, v2, o2]), dtype=torch.float32)
        i12 = torch.tensor(np.hstack(stacked_columns), dtype=torch.float32)

        # add inputs
        previous_input_bars.append(hvo1[:-upcoming_steps])
        upcoming_input_2_bars.append(hvo1[-upcoming_steps:])

        # add outputs
        previous_output_bars.append(hvo2[:-upcoming_steps])
        upcoming_output_2_bars.append(hvo2[-upcoming_steps:])

        # add stacked inputs and outputs
        previous_stacked_input_output_bars.append(i12[:-upcoming_steps])
        upcoming_stacked_input_output_2_bars.append(i12[-upcoming_steps:])

    # Stack the per-segment tensors along a new leading segment dimension
    return (
        torch.stack(previous_input_bars),
        torch.stack(upcoming_input_2_bars),
        torch.stack(previous_stacked_input_output_bars),
        torch.stack(upcoming_stacked_input_output_2_bars),
        torch.stack(previous_output_bars),
        torch.stack(upcoming_output_2_bars),
    )


In [6]:
# Build the training tensors. Instrument 1 contributes hits, velocities and
# offsets as input features, so both feature flags are passed explicitly.
(
    previous_input_bars,
    upcoming_input_2_bars,
    previous_stacked_input_output_bars,
    upcoming_stacked_input_output_2_bars,
    previous_output_bars,
    upcoming_output_2_bars,
) = extract_data(
    song_ids_train,
    instrument1s,
    instrument2s,
    input_bars,
    hop_n_bars,
    input_has_velocity=True,
    input_has_offset=True,
)

100%|██████████| 992/992 [00:11<00:00, 85.14it/s]
78878it [00:08, 9178.43it/s] 


In [7]:
# Shapes of the six extracted tensors, in the order extract_data returned them.
tuple(
    tensor.shape
    for tensor in (
        previous_input_bars,
        upcoming_input_2_bars,
        previous_stacked_input_output_bars,
        upcoming_stacked_input_output_2_bars,
        previous_output_bars,
        upcoming_output_2_bars,
    )
)

(torch.Size([78878, 512, 3]),
 torch.Size([78878, 32, 3]),
 torch.Size([78878, 512, 30]),
 torch.Size([78878, 32, 30]),
 torch.Size([78878, 512, 27]),
 torch.Size([78878, 32, 27]))