In [86]:
import torch
from warpctc_pytorch import CTCLoss # https://github.com/SeanNaren/warp-ctc
import numpy as np
import os
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils.rnn as rnn
import torch.utils.data.dataloader as dataloader
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

from torch.utils.data import TensorDataset
from torchvision import transforms
from torchvision.datasets import MNIST

import matplotlib.pyplot as plt
import time

import pandas as pd

In [2]:
import ctcdecode
# https://github.com/parlance/ctcdecode
# in tf: https://github.com/githubharald/CTCWordBeamSearch

In [3]:
ctc_loss = CTCLoss()
# expected shape of seqLength x batchSize x alphabet_size
probs = torch.FloatTensor([[[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]]]).transpose(0, 1).contiguous()
labels = torch.IntTensor([1, 2])
label_sizes = torch.IntTensor([2])
probs_sizes = torch.IntTensor([2])
probs.requires_grad_(True)  # tells autograd to compute gradients for probs
cost = ctc_loss(probs, labels, probs_sizes, label_sizes)
cost.backward()

In [33]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE

'cuda'

In [2]:
class WSJ():
    """ Load the WSJ speech dataset
        
        Ensure WSJ_PATH is path to directory containing 
        all data files (.npy) provided on Kaggle.
        
        Example usage:
            loader = WSJ()
            trainX, trainY = loader.train
            assert(trainX.shape[0] == 24590)
            
    """
  
    def __init__(self, path):
        self.dev_set = None
        self.train_set = None
        self.test_set = None
        self.path  = path
        
    @property
    def dev(self):
        if self.dev_set is None:
            self.dev_set = load_raw(self.path, 'wsj0_dev')
        return self.dev_set

    @property
    def train(self):
        if self.train_set is None:
            self.train_set = load_raw(self.path, 'wsj0_train')
        return self.train_set
  
    @property
    def test(self):
        if self.test_set is None:
            self.test_set = (np.load(os.path.join(self.path, 'wsj0_test.npy'), encoding='bytes'), None)
        return self.test_set
    
def load_raw(path, name):
    return (
        np.load(os.path.join(path, '{}.npy'.format(name)), encoding='bytes'), 
        np.load(os.path.join(path, '{}_merged_labels.npy'.format(name)), encoding='bytes')
    )

In [6]:
path = "/home/borowis/s3"
wsj = WSJ(path)

In [14]:
import sys
sys.path.append(path)
import phoneme_list as phl

In [36]:
dev = wsj.dev
train = wsj.train

In [32]:
print(dev[0].shape)
print(dev[1].shape)
print(dev[0][0].shape)
print(dev[1][0].shape)
print(dev[0][3].shape)
print(dev[1][3].shape)
print(dev[0][0])
print(dev[1][0])

(1106,)
(1106,)
(440, 40)
(54,)
(482, 40)
(64,)
[[-4.9549413  -5.909959   -4.7054377  ...  0.26314926 -0.00832033
   0.2449565 ]
 [-4.4155927  -7.4320974  -4.8468237  ...  0.09183788 -0.21720076
   0.5789623 ]
 [-4.64845    -5.345671   -3.6078033  ...  0.00744247  0.19980097
  -0.01899004]
 ...
 [-6.1085844  -6.8452053  -5.9429183  ... -1.9091392  -1.709682
  -1.4018598 ]
 [-5.8867598  -6.644912   -4.627789   ... -2.1586275  -1.6964803
  -1.3536029 ]
 [-4.7362947  -5.2249713  -3.899804   ... -2.992228   -2.853492
  -2.5541077 ]]
[36 15  8 19 23 27 18 26 32 33  8 14 40 34 22 44  8 26 22 37 17  8 41 37
 40 37 22 19  9 33 43  8 29 22 28 28 30 41 16 27 12 17  7 28 14 14 22 34
 16 27 12 17  0 36]


In [19]:
phonemes = phl.PHONEME_LIST
phonemes_map = phl.PHONEME_MAP

In [20]:
print([phonemes[ph] for ph in dev[1][0]])

['SIL', 'DH', 'AH', 'F', 'IY', 'M', 'EY', 'L', 'P', 'R', 'AH', 'D', 'UW', 'S', 'IH', 'Z', 'AH', 'L', 'IH', 'T', 'ER', 'AH', 'V', 'T', 'UW', 'T', 'IH', 'F', 'AO', 'R', 'Y', 'AH', 'NG', 'IH', 'N', 'N', 'OW', 'V', 'EH', 'M', 'B', 'ER', 'AE', 'N', 'D', 'D', 'IH', 'S', 'EH', 'M', 'B', 'ER', '+BREATH+', 'SIL']


In [28]:
print("".join([phonemes_map[ph] for ph in dev[1][0]]))

.DhfImElpRhdUsizhlitrhvtUtifoR?hGinnOvembrAnddisembr_.


In [29]:
print(len(phonemes))
print(len(phonemes_map))

46
46


## Model, dataloader

In [103]:
class LinesDataset(Dataset):
    def __init__(self, loader):
        self.x = [torch.tensor(l) for l in (loader[0] if len(loader) == 2 else loader)]
        self.y = [torch.tensor(l) for l in loader[1]] if (len(loader) == 2 and loader[1] is not None) else None
        
    def __getitem__(self, idx):
        data = self.x[idx]
        
        if self.y is not None:
            label = self.y[idx]
            return data, label
          
        else:
            return data
        
    def __len__(self):
        return len(self.x)

# collate fn lets you control the return value of each batch
# for packed_seqs, you want to return your data sorted by length
def collate_lines(seq_list):
    inputs, targets = zip(*seq_list)
    lens = [len(seq) for seq in inputs]
    seq_order = sorted(range(len(lens)), key=lens.__getitem__, reverse=True)
    inputs = [inputs[i] for i in seq_order]
    targets = [targets[i] for i in seq_order]
    return inputs, targets

In [104]:
# Model that takes packed sequences in training
class PackedLanguageModel(nn.Module):
    
    def __init__(self, vocab_size, input_size, hidden_size, nlayers):
        super(PackedLanguageModel,self).__init__()
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.nlayers = nlayers
        self.rnn = nn.GRU(input_size = input_size, hidden_size = hidden_size, num_layers = nlayers)
        self.scoring = nn.Linear(hidden_size, vocab_size)
    
    def forward(self, seq_list): # list N, where N is batch size
        batch_size = len(seq_list)
        lens = [len(s) for s in seq_list] # lens of all sequences (already sorted in descending order)
        packed_input = rnn.pack_sequence(seq_list) # packed version
        hidden = None
        output_packed, hidden = self.rnn(packed_input, hidden)
        output_padded, _ = rnn.pad_packed_sequence(output_packed) # unpacked output (padded)
        output_flatten = torch.cat([output_padded[:lens[i],i] for i in range(batch_size)]) # concatenated output
        scores_flatten = self.scoring(output_flatten) # concatenated logits
        return scores_flatten # return concatenated logits

## Training loop

In [73]:
def train_epoch_packed(model, optimizer, train_loader, val_loader):
    criterion = CTCLoss() # sum instead of averaging, to take into account the different lengths
    criterion = criterion.to(DEVICE)
    batch_id=0
    before = time.time()
    print("Training", len(train_loader), "number of batches")
    for inputs, targets in train_loader: # lists, presorted, preloaded on GPU
        batch_id += 1
        outputs = model(inputs)
        loss = criterion(outputs,torch.cat(targets)) # criterion of the concatenated output
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if batch_id % 100 == 0:
            after = time.time()
            nwords = np.sum(np.array([len(l) for l in inputs]))
            lpw = loss.item() / nwords
            print("Time elapsed: ", after - before)
            print("At batch",batch_id)
            print("Training loss per word:",lpw)
            print("Training perplexity :",np.exp(lpw))
            before = after
    
    val_loss = 0
    batch_id=0
    nwords = 0
    for inputs,targets in val_loader:
        nwords += np.sum(np.array([len(l) for l in inputs]))
        batch_id+=1
        outputs = model(inputs)
        loss = criterion(outputs,torch.cat(targets))
        val_loss+=loss.item()
    val_lpw = val_loss / nwords
    print("\nValidation loss per word:",val_lpw)
    print("Validation perplexity :",np.exp(val_lpw),"\n")
    return val_lpw

In [105]:
model = PackedLanguageModel(len(phonemes), 40, 256, 3)
model = model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001, weight_decay = 1e-6)
train_dataset = LinesDataset(train)
val_dataset = LinesDataset(dev)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64, collate_fn = collate_lines)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=64, collate_fn = collate_lines)

In [89]:
print(len(train_dataset.x))
print(train_dataset.x[0])

24724
tensor([-9.2617, -9.6756, -9.6329,  ..., -3.3868, -2.9438, -3.5909])


In [106]:
for i in range(1):
    train_epoch_packed(model, optimizer, train_loader, val_loader)

Training 387 number of batches


RuntimeError: Expected object of type torch.cuda.FloatTensor but found type torch.FloatTensor for argument #4 'mat1'