In [5]:
import torch
from warpctc_pytorch import CTCLoss # https://github.com/SeanNaren/warp-ctc
import numpy as np
import os
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data.dataloader as dataloader
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

from torch.utils.data import TensorDataset
from torchvision import transforms
from torchvision.datasets import MNIST

import matplotlib.pyplot as plt
import time

import pandas as pd

In [2]:
import ctcdecode
# https://github.com/parlance/ctcdecode
# in tf: https://github.com/githubharald/CTCWordBeamSearch

In [3]:
# Smoke test for the warp-ctc binding: one sequence, two time steps, five symbols.
ctc_loss = CTCLoss()

# expected shape of seqLength x batchSize x alphabet_size
# Written directly as (2, 1, 5) instead of building (1, 2, 5) and transposing.
probs = torch.FloatTensor([
    [[0.1, 0.6, 0.1, 0.1, 0.1]],  # time step 0
    [[0.1, 0.1, 0.6, 0.1, 0.1]],  # time step 1
]).requires_grad_(True)  # tells autograd to compute gradients for probs

labels = torch.IntTensor([1, 2])
label_sizes = torch.IntTensor([labels.numel()])  # 2 target symbols
probs_sizes = torch.IntTensor([probs.size(0)])   # 2 time steps

cost = ctc_loss(probs, labels, probs_sizes, label_sizes)
cost.backward()

In [2]:
class WSJ():
    """ Lazy loader for the WSJ speech dataset.

        ``path`` must be the directory containing all data files (.npy)
        provided on Kaggle. Each split is read from disk the first time
        its property is accessed and is cached on the instance afterwards.

        Example usage:
            loader = WSJ(path)
            trainX, trainY = loader.train
            assert(trainX.shape[0] == 24590)

        Properties:
            dev   -- tuple returned by load_raw(path, 'wsj0_dev')
            train -- tuple returned by load_raw(path, 'wsj0_train')
            test  -- (contents of wsj0_test.npy, None); the test split is
                     loaded without a labels file.
    """

    def __init__(self, path):
        # Nothing is read here; the *_set attributes are per-split caches
        # filled in by the properties below.
        self.dev_set = None
        self.train_set = None
        self.test_set = None
        self.path = path

    @property
    def dev(self):
        """Dev split, loaded on first access."""
        if self.dev_set is None:
            self.dev_set = load_raw(self.path, 'wsj0_dev')
        return self.dev_set

    @property
    def train(self):
        """Train split, loaded on first access."""
        if self.train_set is None:
            self.train_set = load_raw(self.path, 'wsj0_train')
        return self.train_set

    @property
    def test(self):
        """Test split as ``(data, None)``, loaded on first access."""
        if self.test_set is None:
            # The file stores an object array (one variable-length array per
            # utterance), i.e. pickled data. NumPy >= 1.16.3 refuses to load
            # that unless allow_pickle=True is passed explicitly.
            # NOTE: unpickling can execute arbitrary code -- only load files
            # obtained from a trusted source.
            test_data = np.load(
                os.path.join(self.path, 'wsj0_test.npy'),
                encoding='bytes',
                allow_pickle=True,
            )
            self.test_set = (test_data, None)
        return self.test_set
    
def load_raw(path, name):
    """Load one labelled split from ``path``.

    Args:
        path: directory containing the .npy files.
        name: file stem of the split, e.g. 'wsj0_dev'.

    Returns:
        Tuple ``(data, labels)`` read from ``<name>.npy`` and
        ``<name>_merged_labels.npy`` respectively.
    """
    data_file = os.path.join(path, '{}.npy'.format(name))
    labels_file = os.path.join(path, '{}_merged_labels.npy'.format(name))
    # Both files hold object arrays (one variable-length array per utterance),
    # which are stored pickled. NumPy >= 1.16.3 raises ValueError on such
    # files unless allow_pickle=True is given.
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    return (
        np.load(data_file, encoding='bytes', allow_pickle=True),
        np.load(labels_file, encoding='bytes', allow_pickle=True),
    )

In [6]:
# Directory holding the WSJ .npy files. Set the WSJ_PATH environment variable
# to point somewhere else; the original hard-coded location stays the default
# so existing runs behave as before.
path = os.environ.get("WSJ_PATH", "/home/borowis/s3")
wsj = WSJ(path)

In [14]:
import sys
# phoneme_list is imported from the data directory (presumably phoneme_list.py
# is shipped alongside the .npy files), so that directory has to be on sys.path.
# Guarded so that re-running this cell does not pile up duplicate entries.
if path not in sys.path:
    sys.path.append(path)
import phoneme_list as phl

In [7]:
# Dev split as a tuple from load_raw(path, 'wsj0_dev'); the first access to
# the WSJ.dev property reads the files, later accesses reuse the cached tuple.
dev = wsj.dev

In [12]:
# Quick look at the dev split: dev[0] comes from wsj0_dev.npy and dev[1] from
# wsj0_dev_merged_labels.npy. Show both shapes, then the first entry of each.
print(
    dev[0].shape,
    dev[1].shape,
    dev[0][0],
    dev[1][0],
    sep="\n",
)

(1106,)
(1106,)
[[-4.9549413  -5.909959   -4.7054377  ...  0.26314926 -0.00832033
   0.2449565 ]
 [-4.4155927  -7.4320974  -4.8468237  ...  0.09183788 -0.21720076
   0.5789623 ]
 [-4.64845    -5.345671   -3.6078033  ...  0.00744247  0.19980097
  -0.01899004]
 ...
 [-6.1085844  -6.8452053  -5.9429183  ... -1.9091392  -1.709682
  -1.4018598 ]
 [-5.8867598  -6.644912   -4.627789   ... -2.1586275  -1.6964803
  -1.3536029 ]
 [-4.7362947  -5.2249713  -3.899804   ... -2.992228   -2.853492
  -2.5541077 ]]
[36 15  8 19 23 27 18 26 32 33  8 14 40 34 22 44  8 26 22 37 17  8 41 37
 40 37 22 19  9 33 43  8 29 22 28 28 30 41 16 27 12 17  7 28 14 14 22 34
 16 27 12 17  0 36]


In [19]:
# Lookup tables from phoneme_list, indexed below by the integer labels in dev[1].
# Judging by the recorded outputs, PHONEME_LIST yields phoneme names ('SIL', 'DH', ...)
# and PHONEME_MAP yields one-character codes ('.', 'D', ...) -- TODO confirm against phoneme_list.
phonemes = phl.PHONEME_LIST
phonemes_map = phl.PHONEME_MAP

In [20]:
# Translate the integer labels of the first dev entry through the `phonemes` table.
print([phonemes[label_id] for label_id in dev[1][0]])

['SIL', 'DH', 'AH', 'F', 'IY', 'M', 'EY', 'L', 'P', 'R', 'AH', 'D', 'UW', 'S', 'IH', 'Z', 'AH', 'L', 'IH', 'T', 'ER', 'AH', 'V', 'T', 'UW', 'T', 'IH', 'F', 'AO', 'R', 'Y', 'AH', 'NG', 'IH', 'N', 'N', 'OW', 'V', 'EH', 'M', 'B', 'ER', 'AE', 'N', 'D', 'D', 'IH', 'S', 'EH', 'M', 'B', 'ER', '+BREATH+', 'SIL']


In [26]:
# Translate the integer labels of the first dev entry through `phonemes_map`.
# The previous stray `.jo` on the list raised AttributeError (lists have no such
# attribute); printing the list itself matches the recorded output.
# For a compact transcript string use ''.join(...) on the same comprehension.
print([phonemes_map[ph] for ph in dev[1][0]])

['.', 'D', 'h', 'f', 'I', 'm', 'E', 'l', 'p', 'R', 'h', 'd', 'U', 's', 'i', 'z', 'h', 'l', 'i', 't', 'r', 'h', 'v', 't', 'U', 't', 'i', 'f', 'o', 'R', '?', 'h', 'G', 'i', 'n', 'n', 'O', 'v', 'e', 'm', 'b', 'r', 'A', 'n', 'd', 'd', 'i', 's', 'e', 'm', 'b', 'r', '_', '.']
