In [None]:
!pip install librosa



In [1]:
import math
import os
from tempfile import TemporaryDirectory
from typing import Tuple
import librosa

import torch
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset
import torchaudio
import torchaudio.functional as F
import torchaudio.transforms as T

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
import pickle
import matplotlib.pyplot as plt

In [3]:
# Mount G Drive and get names of training files

from google.colab import drive
drive.mount('/content/drive')

path_snippets = "/content/drive/MyDrive/transcription_ai_data/snippets"
file_names = pd.read_csv('/'.join([path_snippets, "files.txt"]))
file_names = file_names.sort_values("files")
torch.random.manual_seed(0)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


<torch._C.Generator at 0x7e72a4391e90>

In [4]:
def make_path(string): return '/'.join([path_snippets, string])

In [5]:
bach_sample_audio, sr = librosa.core.load(make_path(file_names.iloc[0]["files"]), sr=None, mono=False)
librosa.display.waveshow(bach_sample_audio, sr=sr)

KeyboardInterrupt: 

In [None]:
bach_sample_audio.shape

(2, 1128960)

In [5]:
class CNNFeatureEncoder(nn.Module):
    def __init__(self, in_channels=1128960, num_hidden = 1063, out_channels=512, kernel_size=3, stride=1, padding=1):
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv1d(in_channels=2, out_channels=1, kernel_size=kernel_size, stride=stride, padding=padding),
            nn.BatchNorm1d(num_hidden),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Conv1d(in_channels=1, out_channels=1, kernel_size=kernel_size, stride=stride, padding=padding),
            nn.BatchNorm1d(out_channels),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
        )

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x (Tensor): A batch of audio signals of shape [B, S=1128960]

        Returns:
            Tensor: A batch of extracted features of shape [B, T=512]
        """
        return self.cnn(x)

In [6]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [7]:
class TransformerModel(nn.Module):

    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.embedding = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.linear = nn.Linear(d_model, ntoken)

        self.init_weights()

    def init_weights(self) -> None:
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: Tensor, src_mask: Tensor = None) -> Tensor:
        """
        Arguments:
            src: Tensor, shape ``[seq_len, batch_size]``
            src_mask: Tensor, shape ``[seq_len, seq_len]``

        Returns:
            output Tensor of shape ``[seq_len, batch_size, ntoken]``
        """
        src = self.embedding(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        if src_mask is None:
            """Generate a square causal mask for the sequence. The masked positions are filled with float('-inf').
            Unmasked positions are filled with float(0.0).
            """
            src_mask = nn.Transformer.generate_square_subsequent_mask(len(src)).to(device)
        output = self.transformer_encoder(src, src_mask)
        output = self.linear(output)
        return output

In [8]:
class TranscriberModel(nn.Module):
    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5):
        super().__init__()
        self.cnn_feature_encoder = CNNFeatureEncoder()
        self.transformer = TransformerModel(ntoken, d_model, nhead, d_hid, nlayers, dropout)

    def forward(self, audio: Tensor) -> Tensor:
        x = self.transformer(self.cnn_feature_encoder(audio))
        return x

In [9]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class TranscriberDataset(Dataset):
    def __init__(self, names):
        self.names = names

    def __len__(self) -> int:
        return len(self.names)

    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor]:
        audio_path = self.names[index] + '.wav'
        score_path = 'music21text/' + self.names[index] + '.txt' #score_path = self.names[index] + '.xml'
        audio, sr = librosa.core.load(make_path(audio_path), sr=None, mono=False)
        audio = np.pad(audio, pad_width=((0,0), (0, 1128960-audio.shape[1])))
        audio = torch.from_numpy(audio)
        with open(make_path(score_path), 'r') as f:
            score = ''.join(f.readlines()).replace('    ', '\t')
        return audio, score

names = file_names[file_names['files'].str.contains('.xml')]
names['files'] = names['files'].str.replace('.xml', '')
names = names.sample(frac=1)['files'].to_list()
names.remove('Liszt_Hungarian_Rhapsodies_6_LiA09M_4')   # by far the largest file
num_data = len(names)
split_idx = num_data // 10

training_data = TranscriberDataset(names[split_idx:])
validation_data = TranscriberDataset(names[:split_idx])

train_dataloader = DataLoader(training_data, shuffle=True)
validation_dataloader = DataLoader(validation_data, shuffle=True)

  names['files'] = names['files'].str.replace('.xml', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  names['files'] = names['files'].str.replace('.xml', '')


In [10]:
# create tokenizer

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re

def custom_analyzer(input_text):
    pad_tabs_and_newlines = input_text.replace('\t', ' \t ').replace('\n', ' \n ')
    return re.split(r' +', pad_tabs_and_newlines)

tokenizer = Tokenizer(num_words=None, filters='', lower=False, analyzer=custom_analyzer)

'''
texts = []
for name in names:
    score_path = 'music21text/' + name + '.txt'
    with open(make_path(score_path), 'r') as f:
        score = ''.join(f.readlines()).replace('    ', '\t')
    texts.append(score)

tokenizer.fit_on_texts(texts)
'''

"\ntexts = []\nfor name in names:\n    score_path = 'music21text/' + name + '.txt'\n    with open(make_path(score_path), 'r') as f:\n        score = ''.join(f.readlines()).replace('    ', '\t')\n    texts.append(score)\n\ntokenizer.fit_on_texts(texts)\n"

In [11]:
import pickle

tokenizer_path = make_path('models/tokenizer.pickle')

'''
# saving
with open(tokenizer_path, 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
'''

# loading
with open(tokenizer_path, 'rb') as handle:
    tokenizer = pickle.load(handle)

In [12]:
pad_max = 7345

transcriber_model = TranscriberModel(ntoken=pad_max, d_model=512, nhead=8, d_hid=256, nlayers=6)
transcriber_model.to('cuda')

optimizer = torch.optim.AdamW(transcriber_model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()



In [13]:
def train_model(model, epochs, dataloader, criterion, optimizer):
    for epoch in range(1,epochs+1):
      epoch_loss = 0
      for wavs,scores in dataloader:
        pred = model(wavs.to('cuda'))
        loss = criterion(pred, pad_sequences(tokenizer.texts_to_sequences(scores), maxlen=pad_max, padding='post').to('cuda'))
        epoch_loss += loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
      print("Epoch {}, average loss: {}".format(epoch, epoch_loss/len(dataloader)))

train_model(transcriber_model, 10, train_dataloader, criterion, optimizer)

model_path = make_path('models/transcriber_model.pt')

torch.save(transcriber_model.state_dict(), model_path)

RuntimeError: running_mean should contain 1 elements not 1063

In [14]:
for data, label in training_data:
    print(data)
    break

tensor([0.0572, 0.0582, 0.0569,  ..., 0.0739, 0.0736, 0.0728])
