In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!unzip ./drive/MyDrive/Exploration/dataset.zip -d "./drive/MyDrive/Exploration/"

Archive:  ./drive/MyDrive/Exploration/dataset.zip
replace ./drive/MyDrive/Exploration/dataset/midi2corpus.py? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [2]:
pip install pytorch-fast-transformers

Collecting pytorch-fast-transformers
  Downloading pytorch-fast-transformers-0.4.0.tar.gz (93 kB)
[?25l[K     |███▌                            | 10 kB 33.0 MB/s eta 0:00:01[K     |███████                         | 20 kB 8.4 MB/s eta 0:00:01[K     |██████████▌                     | 30 kB 7.7 MB/s eta 0:00:01[K     |██████████████                  | 40 kB 7.3 MB/s eta 0:00:01[K     |█████████████████▌              | 51 kB 3.9 MB/s eta 0:00:01[K     |█████████████████████           | 61 kB 4.2 MB/s eta 0:00:01[K     |████████████████████████▌       | 71 kB 4.4 MB/s eta 0:00:01[K     |████████████████████████████    | 81 kB 5.0 MB/s eta 0:00:01[K     |███████████████████████████████▌| 92 kB 5.2 MB/s eta 0:00:01[K     |████████████████████████████████| 93 kB 1.8 MB/s 
Building wheels for collected packages: pytorch-fast-transformers
  Building wheel for pytorch-fast-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for pytorch-fast-transformers: filename=pyto

In [48]:
import pandas as pd
import numpy as np
import time
import torch
import math
import torch.nn as nn
import torch.optim as optim
import torch.nn.utils as utils
from fast_transformers.builders import TransformerEncoderBuilder
from fast_transformers.masking import TriangularCausalMask

In [42]:
torch.cuda.empty_cache()


In [4]:
datapath = './drive/MyDrive/Exploration/dataset/representations/uncond/cp/ailab17k_from-scratch_cp'
dictionary = pd.read_pickle(datapath + '/dictionary.pkl')
events2words, words2events = dictionary

tokens = [len(events2words[key]) for key in events2words.keys()]

def load_train_data():
  return np.load(datapath+'/train_data_linear.npz')

In [50]:
# Loading the training data
data_train = load_train_data()

data_train_x = data_train['x']
data_train_y = data_train['y']
data_train_mask = data_train['mask']
total_batches = len(data_train)

# Setting Hyper Parameters
model_dimension = 512
total_layers = 12
total_heads = 8
batch_size = 4
initial_learning_rate = 1e-4
total_eopchs = 1000
max_grad_norm = 3

# Embedding
embedding_dimension = [128, 256, 64, 32, 512, 128, 128]

# Token types
token_types = ['tempo', 'chord', 'barbeat', 'type', 'pitch', 'duration', 'velocity']

In [51]:
# Transformer Class
class Transformer(nn.Module):
  def __init__(self, tokens):
    super(Transformer, self).__init__()

    self.tokens = tokens
    self.model_dimension = model_dimension
    self.total_layers = total_layers
    self.total_heads = total_heads
    self.embedding_dimension = embedding_dimension
    self.dropout = 0.1
    self.batch_size = batch_size
    self.token_embeddings = nn.ModuleList()
    self.token_linear = nn.ModuleList()
    for i in range(len(token_types)):
      self.token_embeddings.append(Embedding(tokens[i], embedding_dimension[i]))
    # [Embedding(tokens[i], embedding_dimension[i]) for i in range(len(token_types))]
    self.positional_embedding = PositionalEmbedding(self.model_dimension, self.dropout)
    self.input_linear = nn.Linear(np.sum(self.embedding_dimension), self.model_dimension)
    for i in range(len(token_types)):
      self.token_linear.append(nn.Linear(self.model_dimension, tokens[i]))
    # self.token_linear = [nn.Linear(self.model_dimension, tokens[i]) for i in range(len(token_types))]
    self.concat_type_linear = nn.Linear(self.model_dimension + embedding_dimension[token_types.index('type')], self.model_dimension)
    self.loss = nn.CrossEntropyLoss(reduction='none')
    
    self.transformer_encoder = TransformerEncoderBuilder.from_kwargs(
        n_layers = self.total_layers,
        n_heads = self.total_heads,
        query_dimensions = self.model_dimension//self.total_heads,
        value_dimensions = self.model_dimension//self.total_heads,
        feed_forward_dimensions = 2048,
        activation = 'gelu',
        dropout = self.dropout,
        attention_type = "causal-linear"
    ).get()

  # Computes loss
  def get_loss(self, prediction, target,  loss_mask):
    loss = self.loss(prediction, target)
    loss = loss * loss_mask
    return torch.sum(loss) / torch.sum(loss_mask)

  # Training operation for the neural net on the given set of data batch 
  def train_step(self, x, target, loss_mask):
    # Get token embeddings and positional embeddings
    embeddings = []
    for i in range(len(token_types)):
      embeddings.append(self.token_embeddings[i](x[..., i]))
    concat_embedding = torch.cat(embeddings, dim=-1)
    model_embedding = self.input_linear(concat_embedding)
    positional_embedding = self.positional_embedding(model_embedding)

    # Feed the embeddings into our transformer
    attention_mask = TriangularCausalMask(positional_embedding.size(1), device=x.device)
    h = self.transformer_encoder(positional_embedding, attention_mask)

    # Get the predicted token type
    token_type_index = token_types.index('type')
    predicted_token_type = self.token_linear[token_type_index](h)
    target_token_type = self.token_embeddings[token_type_index](target[..., token_type_index])

    # Get the predicted tokens
    predictions = []
    output = self.concat_type_linear(torch.cat([h, target_token_type], dim=-1))
    for index, token_predictor in enumerate(self.token_linear):
      if index != token_type_index:
        predictions.append(token_predictor(output))
      else:
        predictions.append(predicted_token_type)
    predictions = [prediction[:, ...].permute(0, 2, 1) for prediction in predictions]
    losses = [self.get_loss(predictions[i], target[..., i], loss_mask) for i in range(len(token_types))]
    return losses
      

# Embedding Class
class Embedding(nn.Module):
  def __init__(self, num_embeddings, embedding_dimension):
    super(Embedding, self).__init__()
    self.embedding_dimension = embedding_dimension
    self.embed = nn.Embedding(num_embeddings, embedding_dimension)

  def forward(self, x):
    return self.embed(x) * math.sqrt(self.embedding_dimension)

# Positional Embedding Class
class PositionalEmbedding(nn.Module):
  def __init__(self, model_dimension, dropout, max_len=20000):
    super(PositionalEmbedding, self).__init__()
    self.dropout = nn.Dropout(dropout)

    positional_encoding = torch.zeros(max_len, model_dimension)
    position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
    division_term = torch.exp(torch.arange(0, model_dimension, 2).float() * (-math.log(10000.0)/model_dimension))
    positional_encoding[:, 0::2] = torch.sin(position*division_term)
    positional_encoding[:, 1::2] = torch.cos(position*division_term)
    positional_encoding = positional_encoding.unsqueeze(0)
    self.register_buffer('positional_encoding', positional_encoding)

  def forward(self, x):
    x = x + self.positional_encoding[:, :x.size(1), :]
    return self.dropout(x)



In [52]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def get_parameters(model):
  return filter(lambda param: param.requires_grad, model.parameters())

# Defining Neural Network
music_transformer = Transformer(tokens)
music_transformer.cuda()
# Defining Optimizer
optimizer = optim.Adam(get_parameters(music_transformer), lr=initial_learning_rate)

In [55]:
def train(neural_net, epochs):
  neural_net.train()
  batches = len(data_train_x)//neural_net.batch_size

  for epoch in range(epochs):
    start_time = time.time()
    total_loss = 0
    for batch_index in range(batches):
      batch_start_index = neural_net.batch_size*batch_index
      batch_end_index = neural_net.batch_size*(batch_index + 1)
      batch_x = torch.from_numpy(data_train_x[batch_start_index:batch_end_index]).long().cuda()
      batch_y = torch.from_numpy(data_train_y[batch_start_index:batch_end_index]).long().cuda()
      batch_mask = torch.from_numpy(data_train_mask[batch_start_index:batch_end_index]).float().cuda()

      neural_net.zero_grad()
      losses = neural_net.train_step(batch_x, batch_y, batch_mask)
      # print(type(losses))
      loss = (losses[0] + losses[1] + losses[2] + losses[3] + losses[4] + losses[5] + losses[6])/len(token_types)
      loss.backward()
      utils.clip_grad_norm_(get_parameters(neural_net), max_grad_norm)
      optimizer.step()

      print(f'Batch: {batch_index+1}/{batches} | Total Loss: {loss} |' + 
            f'Individual Loss: {losses[0]}, {losses[1]}, {losses[2]}, {losses[3]}, {losses[4]}, {losses[5]}, {losses[6]}') 

      total_loss += loss.item()

    total_loss /= batches
    print(f'Epoch Loss: {total_loss} | Epoch Time: {time.time()-start_time}') 

In [None]:
# torch.cuda.memory_summary(device=None, abbreviated=False)

train(music_transformer, 10)

Batch: 1/406 | Total Loss: 1.3179117441177368 |Individual Loss: 0.8901422619819641, 0.7580288648605347, 1.1125637292861938, 0.4781794846057892, 2.6195569038391113, 1.5319252014160156, 1.8349848985671997
Batch: 2/406 | Total Loss: 1.3172154426574707 |Individual Loss: 0.7894248366355896, 0.5755009651184082, 1.024003505706787, 0.48824501037597656, 2.6859920024871826, 1.6845415830612183, 1.9727989435195923
Batch: 3/406 | Total Loss: 1.2606408596038818 |Individual Loss: 0.678968608379364, 0.6201798915863037, 0.9847992062568665, 0.5003271698951721, 2.6867854595184326, 1.531847596168518, 1.8215785026550293
Batch: 4/406 | Total Loss: 1.3191722631454468 |Individual Loss: 0.6429595947265625, 0.4650804102420807, 0.9715250730514526, 0.4964408278465271, 2.8766372203826904, 1.6475796699523926, 2.133981704711914
Batch: 5/406 | Total Loss: 1.2772626876831055 |Individual Loss: 0.7588629722595215, 0.5873809456825256, 1.0160350799560547, 0.4746311604976654, 2.647578001022339, 1.5854270458221436, 1.870923

total    : 17071734784
free     : 693895168
used     : 16377839616


In [34]:
# Analysing data

print(dictionary[0].keys())
print(dictionary[1].keys())
print(tokens)


print(f'Train X: {(data_train_x.shape)} | Train Y: {(data_train_y.shape)} | Mask: {(data_train_mask.shape)}')
print(total_batches)

print(data_train_x[0][1])
print(data_train_y[0][1])


dict_keys(['tempo', 'chord', 'bar-beat', 'type', 'pitch', 'duration', 'velocity'])
dict_keys(['tempo', 'chord', 'bar-beat', 'type', 'pitch', 'duration', 'velocity'])
[56, 135, 18, 3, 87, 18, 25]
Train X: (1625, 3584, 7) | Train Y: (1625, 3584, 7) | Mask: (1625, 3584)
5
[12 37  2  1  0  0  0]
[ 0  0  0  2 24 17 12]
