In [2]:
# Load packages
import IPython.display as ipd
import os
import json
import random
import pydub
import numpy as np
import torch
# Make CUDA device index 1 the current device for the CUDA work done below.
# NOTE(review): the index is hard-coded; presumably this fails on a host with
# fewer than two visible GPUs — confirm before running elsewhere.
torch.cuda.set_device(1)
# Project-local helpers: the dataset class and the prediction post-processor
from utils import AudioDataset, PostProcess
# Root of the audio data, given relative to the notebook's working directory
root_dir = '../../sdd2/audio'

In [3]:
# Collect the audio and target file paths of the kore word set
from glob import glob


def _paths_by_ctime(directory):
    """Return every entry directly inside `directory`, ordered by os.path.getctime."""
    return sorted(glob(os.path.join(directory, '*')), key=os.path.getctime)


kore_word_dir = os.path.join(root_dir, 'kore_words')
audio_dir = os.path.join(kore_word_dir, 'word_audio_npy')
target_dir = os.path.join(kore_word_dir, 'targets')

# NOTE(review): the two lists are indexed in parallel further down, so element i
# of each is presumably meant to be the same word. That pairing relies on both
# directories having been written in the same order (ctime changes when files
# are copied) — confirm, or pair by file name instead.
audio_list = _paths_by_ctime(audio_dir)
target_list = _paths_by_ctime(target_dir)

# Kept as a notebook-level name: the wildcard pattern of the targets directory
glob_pattern = os.path.join(target_dir, '*')

In [4]:
# Size limits handed to AudioDataset.
# NOTE(review): presumably the maximum audio length (in samples) and the maximum
# target length (in tokens) — confirm against utils.AudioDataset.
n_audio_max = 80000
n_target_max = 9
dataset = AudioDataset(audio_list, target_list, n_audio_max, n_target_max)

# Lookup table consumed by PostProcess; the context manager closes the file
# handle instead of leaving it open until garbage collection.
with open('./lookup.json') as lookup_file:
    lookup_dict = json.load(lookup_file)
postprocessor = PostProcess(lookup_dict)

# Pick one sample at random as a sanity check.
# randrange excludes its upper bound, whereas random.randint(0, len(audio_list))
# could return len(audio_list) and index one past the end of the lists.
random_integer = random.randrange(len(audio_list))
original_word_audio_dir = os.path.join(kore_word_dir, 'kore-sound-vocab-munged')

random_audio, random_target, _ = dataset[random_integer]
# File name minus its last four characters (the extension), which is the stem
# of the matching mp3 in the original audio directory
random_audio_ID = os.path.split(audio_list[random_integer])[-1][:-4]
original_audio_path = os.path.join(original_word_audio_dir, random_audio_ID) + '.mp3'

# Playback rate is read from the original mp3's metadata
rs = int(pydub.utils.mediainfo(original_audio_path)['sample_rate'])

# Show the decoded target text, then play the audio scaled by 2**15
print(postprocessor.target2kana(random_target.cpu().numpy()))
ipd.display(ipd.Audio(random_audio.cpu().numpy()*2**15, rate=rs, autoplay=True))

ハガキ


In [5]:
from torch.utils.data import DataLoader

# --- Train / validation split -------------------------------------------------
n_dataset = len(audio_list)
train_proportion = .9
# Fixed seed for the split so the validation set stays the same across kernel
# restarts; a private RandomState leaves the global NumPy RNG untouched.
split_seed = 0

# Use train_proportion rather than repeating the literal, so changing the
# constant above actually changes the split.
n_train = int(train_proportion*n_dataset)
n_val = n_dataset - n_train

inds = np.arange(n_dataset)
np.random.RandomState(split_seed).shuffle(inds)
inds_train = inds[:n_train]
inds_val = inds[n_train:]

# Convert each path list to an array once, then index it with both index sets
audio_array = np.array(audio_list)
target_array = np.array(target_list)

audio_list_train = audio_array[inds_train].tolist()
audio_list_val = audio_array[inds_val].tolist()

target_list_train = target_array[inds_train].tolist()
target_list_val = target_array[inds_val].tolist()

# torch.utils.data.Subset could split a single dataset into train and validation
# parts, but it is unclear how to keep the validation part from being randomly
# subsampled in that case, so two separate AudioDataset instances are built.
dataset_train = AudioDataset(audio_list_train, target_list_train, n_audio_max, n_target_max)
dataset_val = AudioDataset(audio_list_val, target_list_val, n_audio_max, n_target_max)

batch_size_train = 32
batch_size_val = 64

# Only the training loader shuffles; validation is iterated in a fixed order
train_loader = DataLoader(dataset_train, batch_size=batch_size_train, shuffle=True)
val_loader = DataLoader(dataset_val, batch_size=batch_size_val)


In [13]:
from models import Wav2Letter

# Number of output classes of the network
# NOTE(review): presumably the kana vocabulary plus the CTC blank — confirm against lookup.json
n_class = 79

# Build the network and move its parameters to the current CUDA device
model = Wav2Letter(n_class).cuda()

# CTC loss summed (not averaged) over each batch; the training loop divides the
# accumulated sum by the number of samples when reporting
criterion = torch.nn.CTCLoss(reduction='sum')

# Adam over all model parameters
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

In [None]:
n_epoch = 100

# Each epoch: one optimisation pass over train_loader, then one evaluation pass
# over val_loader. Prints: epoch, mean training loss per sample, mean validation
# loss per sample, mean length-normalised Levenshtein distance on validation.
for e in range(n_epoch) :

    # ---- training ------------------------------------------------------------
    total_training_loss = 0
    model.train()
    for batch in train_loader :

        optimizer.zero_grad()

        audio = batch[0]
        targets = batch[1]
        target_lengths = batch[2]
        current_batch_size = audio.size()[0]
        output = model(audio)

        # One entry per sample in the batch, each equal to the length of the
        # model's last output dimension (the prediction length set by the model)
        input_lengths = torch.full(size=(current_batch_size,), fill_value=output.size()[-1], dtype=torch.long)

        # criterion(input, target, input_lengths, target_lengths); the two
        # transposes move the last dimension to the front and the batch
        # dimension to second place.
        # NOTE(review): CTCLoss expects log-probabilities — confirm that
        # Wav2Letter ends with a log_softmax over the class dimension.
        loss = criterion(output.transpose(1, 2).transpose(0, 1), targets, input_lengths, target_lengths)
        total_training_loss += loss.item()

        loss.backward()
        optimizer.step()

    # ---- validation ----------------------------------------------------------
    total_val_loss = 0
    total_lev_dist = 0
    model.eval()
    # No gradients are needed for evaluation; disabling autograd avoids building
    # a graph for every validation batch and the memory that goes with it.
    with torch.no_grad():
        for batch in val_loader :

            audio = batch[0]
            targets = batch[1]
            target_lengths = batch[2]
            current_batch_size = audio.size()[0]
            output = model(audio)

            input_lengths = torch.full(size=(current_batch_size,), fill_value=output.size()[-1], dtype=torch.long)
            loss = criterion(output.transpose(1, 2).transpose(0, 1), targets, input_lengths, target_lengths)

            total_val_loss += loss.item()

            targets = targets.cpu().numpy().astype('int')
            # Index of the highest-scoring entry along dim 1 for every position
            outmax = torch.argmax(output, dim=1).cpu().numpy()
            for i, vec in enumerate(outmax):

                original = postprocessor.target2kana(targets[i])
                predicted = postprocessor.target2kana(vec, refine = True)
                lev_dist = postprocessor.levenshtein(original, predicted)
                # Normalise by the reference length; the max() guards against a
                # ZeroDivisionError if a reference ever decodes to an empty string
                total_lev_dist += lev_dist/max(len(original), 1)

    av_lev_dist = total_lev_dist/n_val

    print(e,total_training_loss/n_train,total_val_loss/n_val,av_lev_dist)

0 71.81408316872337 15.971818825239332 1.0
1 15.271030953996478 14.599568922651033 1.0
2 13.540140659424281 12.215242057889451 0.9903450194769058
3 10.280377270870174 9.372267785175813 0.8833611574846975
4 7.306078665597099 6.310129181570521 0.5939793836287997
5 5.1630429128105435 5.814781889493557 0.47332922595860855
6 3.864707091124469 4.228687751273281 0.341983332008374
7 3.1253039641371463 4.824508794361044 0.4162426795982724
8 2.596619992795813 3.1832226353615076 0.2607666216180408
9 2.224714691784917 4.494454818496322 0.34447823091395746
10 1.8246403104962576 4.208029567100767 0.3373062246601481
11 1.7001371893236943 2.9175983741008777 0.2311603996078121
12 1.4947259934802224 2.824275683879056 0.2503378646951273
13 1.359673075242476 3.6334721050994823 0.2647772742931343
14 1.1567154132368835 2.9194875599346894 0.2319937991891247
15 1.000760946114563 2.689672180328624 0.20583313458939498
16 0.8588113979417451 2.6972066404823467 0.19907120332829856
17 0.8741563942082959 3.182364984