In [2]:
import phoneme_list as pl
import numpy as np
import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn

import Levenshtein as L
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
from ctcdecode import CTCBeamDecoder
from tensorboardX import SummaryWriter

n_stat = pl.N_STATES
n_phon = pl.N_PHONEMES
p_list = pl.PHONEME_LIST
p_map = pl.PHONEME_MAP

train_data_path = './../data/wsj0_train.npy'
train_label_path = './../data/wsj0_train_merged_labels.npy'

val_data_path = './../data/wsj0_dev.npy'
val_label_path = './../data/wsj0_dev_merged_labels.npy'

test_path = './../data/transformed_test_data.npy'

train_data = np.load(train_data_path, encoding='bytes')
train_label = np.load(train_label_path)

val_data = np.load(val_data_path, encoding='bytes')
val_label = np.load(val_label_path)


# phonome dataset, init dataset with data and label
class PhonDataset(Dataset):
    def __init__(self, phon_list, labels):
        self.data = [torch.tensor(phon) for phon in phon_list]
        self.labels = [torch.tensor(label) for label in labels]

    def __getitem__(self, i):
        data = self.data[i]
        label = self.labels[i]
        data = data.type(torch.float32)
        return data.to(DEVICE), label.to(DEVICE)

    def __len__(self):
        return len(self.labels)


# collate_phon return your data sorted by length
def collate_phon(phon_list):
    inputs, targets = zip(*phon_list)
    lens = [len(phon) for phon in inputs]
    seq_order = sorted(range(len(lens)), key=lens.__getitem__, reverse=True)
    inputs = [inputs[i] for i in seq_order]
    targets = [targets[i] for i in seq_order]
    return inputs, targets


def print_model(model):
    params = model.state_dict()
    keys = params.keys()
    for key in keys:
        print(key + ": ")
        print(torch.max(params[key]))
    return


# phonome dataloader
# Model that takes packed sequences in training
class PackedPhonModel(nn.Module):
    def __init__(self, in_size, hidden_size, out_size, nlayers):
        super(PackedPhonModel, self).__init__()
        self.in_size = in_size
        self.hidden_size = hidden_size
        self.nlayers = nlayers
        self.rnn = nn.LSTM(input_size=in_size, hidden_size=hidden_size, num_layers=nlayers, bidirectional=True)
        self.scoring1 = nn.Linear(2 * hidden_size, 1024)
        self.scoring2 = nn.Linear(1024, out_size)
        self.lsm = nn.LogSoftmax(dim=2)

    def forward(self, phon_list):  # list
        # pack and split the length sorted input into small pieces
        packed_input = rnn.pack_sequence(phon_list)  # packed version

        hidden = None
        output_packed, hidden = self.rnn(packed_input, hidden)

        # get the output with dim 0 corresponding to packed_input
        output_padded, _ = rnn.pad_packed_sequence(output_packed)  # unpacked output (padded)
        scores_flatten = self.scoring1(output_padded)  # concatenated logits
        scores_flatten = self.scoring2(output_padded)
        scores_flatten = self.lsm(scores_flatten)

        return scores_flatten  # return concatenated logits
    
    
def save_ckpt(model, optimizer, val_loss, idx):
    id_name = 'id_' + str(idx)
    path = './../result/' + id_name

    torch.save({
        'exp_id': idx,
        'val_loss': val_loss,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, path)
    
    return path

def load_ckpt(path):
    new_model = PackedPhonModel(40, 512, 47, 4)
    pretrained_ckpt = torch.load(path)
    new_model.load_state_dict(pretrained_ckpt['model_state_dict'])
    
    new_optimizer = torch.optim.Adam(new_model.parameters(), lr=0.001, weight_decay=5e-4)
    new_optimizer.load_state_dict(pretrained_ckpt['optimizer_state_dict'])
    return new_model, new_optimizer


def train(epochs, train_loader, val_loader, model, writer):
    model.train()
    idx = 0
    for e in range(epochs):
        print("begin epoch: ", e)
        for inputs, targets in train_loader:
            idx += 1
            # inputs is a list of 64 frames, each frame is K * 40, with K varies
            # targets is a list of 64 target vectors, each containing T values, T varies
            in_lens = [len(phon) for phon in inputs]
            tar_lens = [len(tar) for tar in targets]

            in_lens = torch.tensor(in_lens)
            tar_lens = torch.tensor(tar_lens)

            packed_targets = torch.cat(targets, dim=0)

            outputs = model(inputs)
            loss = criterion(outputs, packed_targets, in_lens, tar_lens)
            
            writer.add_scalar('train/loss', loss.item(), idx)

            # perform backprop
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
        val_loss = val(model, val_loader, writer, e)
        model.train()
        save_ckpt(model, optimizer, val_loss, e)
            
            
# validation
def val(model, val_loader, writer, ep):
    model.eval()
    cnt = 0
    with torch.no_grad():
        cnt += 1
        avg_loss = 0
        for inputs, targets in val_loader:
            in_lens = [len(phon) for phon in inputs]
            tar_lens = [len(tar) for tar in targets]

            in_lens = torch.tensor(in_lens)
            tar_lens = torch.tensor(tar_lens)

            packed_targets = torch.cat(targets, dim=0)

            outputs = model(inputs)
            loss = criterion(outputs, packed_targets, in_lens, tar_lens)
            avg_loss += loss.item()
        avg_loss /= cnt
        writer.add_scalar('val/loss', avg_loss.item(), ep)
        return avg_loss

ModuleNotFoundError: No module named 'phoneme_list'

In [3]:
model = PackedPhonModel(40, 512, 47, 4)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

train_dataset = PhonDataset(train_data, train_label)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64, collate_fn=collate_phon)

val_dataset = PhonDataset(val_data, val_label)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=64, collate_fn=collate_phon)

criterion = nn.CTCLoss(blank=46)
criterion = criterion.to(DEVICE)
model = model.to(DEVICE)

epochs = 5
writer = SummaryWriter()

In [4]:
train(epochs, train_loader, val_loader, model, writer)

begin epoch:  0


KeyboardInterrupt: 

In [None]:
# load model
path = './../result/id_0'
model = load_ckpt(path)

In [5]:
# tokens is the list of vocab index predicted
# vocab is the vocabulary map
def convert_to_string(tokens, vocab, seq_len):
    print(seq_len)
    return ''.join([vocab[x] for x in tokens[0:seq_len]])

# the list
p_map = pl.PHONEME_MAP
p_map.append('%')
print(p_map)

['_', '+', '~', '!', '-', '@', 'a', 'A', 'h', 'o', 'w', 'y', 'b', 'c', 'd', 'D', 'e', 'r', 'E', 'f', 'g', 'H', 'i', 'I', 'j', 'k', 'l', 'm', 'n', 'G', 'O', 'Y', 'p', 'R', 's', 'S', '.', 't', 'T', 'u', 'U', 'v', 'W', '?', 'z', 'Z', '%']


In [1]:
val_loader2 = DataLoader(val_dataset, shuffle=False, batch_size=1, collate_fn=collate_phon)

classification_result = []

with torch.no_grad():
    for inputs, targets in val_loader2:
        output = model(inputs)
        sp = output.shape
        output = output.reshape((sp[0], sp[2]))
        output = output.cpu().numpy()
        print(output.shape)
        
        probs_seq = torch.FloatTensor([output])
        
        print(probs_seq[0].shape)
        
        decoder = CTCBeamDecoder(p_map, beam_width=100, blank_id=p_map.index('%'))
        output, scores, timesteps, out_seq_len = decoder.decode(probs_seq)
#         print(out_seq_len)
#         print(beam_result.shape)
        for i in range(output.size(0)):
            pred = "".join(p_map[o] for o in output[i, 0, :out_seq_len[i, 0]])
        print(pred)
#         pred_str = convert_to_string(beam_result[0][0], p_map, out_seq_len[0][0])
        

#         print(out_seq_len[0][0])
#         print(pred_str)
#         print(len(pred_str))
#         print(targets)
        
#         true = ''.join([vocab[x] for x in tokens[0:seq_len]])
#         true = convert_to_string(targets, p_map, len(targets))
#         true = convert_to_string(targets, p_map, 10)

#         L.distance(pred_str, true)
        

        


NameError: name 'DataLoader' is not defined

In [28]:
# test
test_data = np.load(test_path, encoding='bytes')
test_len = len(test_data)
test_label = val_label[:test_len]

test_dataset = PhonDataset(test_data, test_label)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=1, collate_fn=collate_phon)

outputs = []

with torch.no_grad():
    for inputs, _ in test_loader:
        output = model(inputs)
        sp = output.shape
        output = output.reshape((sp[0], sp[2]))
        output = output.cpu().numpy()
        outputs.append(output)
        print(output.shape)

np.save("outputs_2.npy", outputs)

(542, 47)
(681, 47)
(646, 47)
(600, 47)
(578, 47)
(1325, 47)
(1086, 47)
(1038, 47)
(877, 47)
(819, 47)
(520, 47)
(739, 47)
(741, 47)
(923, 47)
(719, 47)
(476, 47)
(765, 47)
(429, 47)
(566, 47)
(486, 47)
(484, 47)
(712, 47)
(806, 47)
(799, 47)
(790, 47)
(792, 47)
(522, 47)
(550, 47)
(594, 47)
(563, 47)
(1027, 47)
(664, 47)
(540, 47)
(948, 47)
(1202, 47)
(963, 47)
(269, 47)
(840, 47)
(486, 47)
(771, 47)
(493, 47)
(931, 47)
(462, 47)
(412, 47)
(1419, 47)
(708, 47)
(683, 47)
(904, 47)
(169, 47)
(432, 47)
(582, 47)
(465, 47)
(1242, 47)
(820, 47)
(340, 47)
(838, 47)
(1418, 47)
(1114, 47)
(620, 47)
(665, 47)
(756, 47)
(529, 47)
(344, 47)
(622, 47)
(410, 47)
(1102, 47)
(565, 47)
(646, 47)
(732, 47)
(623, 47)
(704, 47)
(461, 47)
(643, 47)
(619, 47)
(585, 47)
(895, 47)
(1055, 47)
(533, 47)
(503, 47)
(219, 47)
(1093, 47)
(674, 47)
(1187, 47)
(554, 47)
(532, 47)
(183, 47)
(259, 47)
(935, 47)
(813, 47)
(958, 47)
(244, 47)
(803, 47)
(379, 47)
(660, 47)
(644, 47)
(641, 47)
(920, 47)
(300, 47)
(637, 4

In [16]:
for data in test_data:
    print(data.shape)

(542, 40)
(681, 40)
(646, 40)
(600, 40)
(578, 40)
(1325, 40)
(1086, 40)
(1038, 40)
(877, 40)
(819, 40)
(520, 40)
(739, 40)
(741, 40)
(923, 40)
(719, 40)
(476, 40)
(765, 40)
(429, 40)
(566, 40)
(486, 40)
(484, 40)
(712, 40)
(806, 40)
(799, 40)
(790, 40)
(792, 40)
(522, 40)
(550, 40)
(594, 40)
(563, 40)
(1027, 40)
(664, 40)
(540, 40)
(948, 40)
(1202, 40)
(963, 40)
(269, 40)
(840, 40)
(486, 40)
(771, 40)
(493, 40)
(931, 40)
(462, 40)
(412, 40)
(1419, 40)
(708, 40)
(683, 40)
(904, 40)
(169, 40)
(432, 40)
(582, 40)
(465, 40)
(1242, 40)
(820, 40)
(340, 40)
(838, 40)
(1418, 40)
(1114, 40)
(620, 40)
(665, 40)
(756, 40)
(529, 40)
(344, 40)
(622, 40)
(410, 40)
(1102, 40)
(565, 40)
(646, 40)
(732, 40)
(623, 40)
(704, 40)
(461, 40)
(643, 40)
(619, 40)
(585, 40)
(895, 40)
(1055, 40)
(533, 40)
(503, 40)
(219, 40)
(1093, 40)
(674, 40)
(1187, 40)
(554, 40)
(532, 40)
(183, 40)
(259, 40)
(935, 40)
(813, 40)
(958, 40)
(244, 40)
(803, 40)
(379, 40)
(660, 40)
(644, 40)
(641, 40)
(920, 40)
(300, 40)
(637, 4

In [7]:
import Levenshtein as L
L.distance("aaaanfavaffd", "andfavbaffd")

5