# Transcript to speech vector: TRAIN

In [1]:
# Stretch the classic-notebook ".container" element to the full browser width.
# `display`/`HTML` are imported from the public IPython.display module; the
# IPython.core.display path is deprecated for `display`.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
# Load the autoreload extension and use mode 2, so imported modules are
# reloaded before each cell is executed.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [3]:
from glob import glob
import os
import time
# NOTE: the star import below binds many names (including `random` and `np`);
# the explicit `import random` / `import numpy as np` further down must stay
# AFTER it so that they win. Do not reorder this block.
from matplotlib.pylab import *
import librosa
import torch
from epoch_time import epoch_time
from tqdm.auto import tqdm
from OpenASR_convert_reference_transcript import txt_to_stm
import pandas as pd
import numpy as np
from padarray import padarray
from to_samples import to_samples
from torch.utils.data import TensorDataset, DataLoader
import audioread
import random
import soundfile as sf

In [4]:
# Show the PyTorch build this notebook is running against.
torch.__version__

'1.7.0.dev20200823'

In [5]:
# Run configuration.
stage = 'NIST'            # top-level directory searched for transcripts
sample_rate = 16000       # Hz; audio is loaded at this rate
window = sample_rate      # samples per training window (one second at sample_rate)
# Hidden-layer width of the detector.
# NOTE(review): at 16000 each Linear weight matrix is 16000 x 16000 float32
# (~977 MiB); the recorded training run ended with CUDA out of memory.
H = window

In [6]:
# Collect every transcript under the stage directory in a stable, sorted order
# (sorted() already returns a list).
transcript_pattern = f'{stage}/*amharic/build/transcription/*.txt'
transcripts = sorted(glob(transcript_pattern))
len(transcripts)

122

In [7]:
# Derive the audio path of each transcript: same tree, /audio/ instead of
# /transcription/, .wav instead of .txt.
audio_files = [
    transcript.replace('/transcription/', '/audio/').replace('.txt', '.wav')
    for transcript in transcripts
]

In [8]:
# Build parallel lists of audio windows (_X_samples) and per-sample speech
# masks (_Y_samples) from the first four transcripts.
_X_samples = []
_Y_samples = []

for transcript_file in tqdm(transcripts[0:4]):
    audio_file = transcript_file.replace('/transcription/', '/audio/').replace('.txt', '.wav')
    if not os.path.exists(audio_file):
        print('missing', audio_file)
        continue

    # The base name is split on "_": everything before the last token is the
    # file id, and the last token without its extension is the channel.
    name_tokens = os.path.basename(transcript_file).split("_")
    file = "_".join(name_tokens[:-1])
    channel = name_tokens[-1].split(".")[-2]

    transcript_df = pd.read_csv(transcript_file, sep="\n", header=None, names=["content"])
    result = txt_to_stm(transcript_df, file, channel)

    # Only 5-field rows are used; their last two fields are parsed as floats
    # and scaled by sample_rate below, i.e. treated as start/end times in
    # seconds -- TODO confirm against txt_to_stm.
    speeches = [(float(row[-2]), float(row[-1])) for row in result if len(row) == 5]

    x_np, sr = librosa.load(audio_file, sr=sample_rate)

    # Target: same length and dtype as the audio, 1 inside speech, 0 elsewhere.
    y_np = np.zeros(x_np.shape[0], dtype=x_np.dtype)
    for start_s, end_s in speeches:
        y_np[int(start_s * sample_rate):int(end_s * sample_rate)] = 1

    # Cut signal and mask with identical arguments so the windows stay aligned.
    # NOTE(review): the meaning of the third argument (600) is defined by
    # to_samples and is not visible here.
    _X_samples.extend(to_samples(x_np, window, 600))
    _Y_samples.extend(to_samples(y_np, window, 600))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [9]:
# Pair each audio window with its mask, keep the first 50 pairs, then shuffle
# that subset in place.
XY_samples = list(zip(_X_samples, _Y_samples))[0:50]
random.shuffle(XY_samples)

In [10]:
# Split the shuffled pairs back into parallel input / target lists.
X_samples = [pair[0] for pair in XY_samples]
Y_samples = [pair[1] for pair in XY_samples]
len(X_samples)

50

In [11]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [12]:
# Stack the windows into a single ndarray before handing them to torch:
# building a tensor directly from a Python list of arrays converts element by
# element and is very slow. The result is float32, as torch.Tensor(...) gave.
tensor_x = torch.as_tensor(np.asarray(X_samples), dtype=torch.float32).to(device)

In [13]:
# Stack the masks into a single ndarray before handing them to torch:
# building a tensor directly from a Python list of arrays converts element by
# element and is very slow. The result is float32, as torch.Tensor(...) gave.
tensor_y = torch.as_tensor(np.asarray(Y_samples), dtype=torch.float32).to(device)

In [14]:
# Index-aligned (input window, target mask) pairs.
full_dataset = TensorDataset(tensor_x, tensor_y)

In [15]:
# Sanity check: display the shape of the target tensor.
tensor_y.shape

torch.Size([50, 16000])

In [16]:
# The network maps one window of samples to a mask of the same length.
D_in = D_out = window
(D_in, H, D_out)

(16000, 16000, 16000)

In [17]:
# Two-layer fully connected detector: window of samples -> per-sample output.
# The final ReLU keeps outputs non-negative.
# The model is placed on `device` (which falls back to the CPU) rather than
# calling .cuda() unconditionally, so the notebook also runs without a GPU.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
    torch.nn.ReLU()
).to(device)

In [18]:
# Resume from an existing checkpoint when one is present.
model_fn = 'speech_detector.pt'
if os.path.exists(model_fn):
    # Deserialize onto the CPU: a checkpoint written from a GPU would otherwise
    # be restored onto that GPU, which fails on CPU-only hosts and temporarily
    # holds a second copy of the weights in GPU memory. load_state_dict then
    # copies the values into the model's parameters on whatever device they
    # live on.
    model.load_state_dict(torch.load(model_fn, map_location='cpu'))

In [19]:
# Optimisation settings.
best_loss = float("inf")
learning_rate = 0.0001
# Squared error summed (not averaged) over every element of the batch.
criterion = torch.nn.MSELoss(reduction="sum")

In [20]:
def trainer(device, model, iterator, optimizer, criterion, clip=1):
    """Run one training epoch and return the mean loss per batch.

    Args:
        device: torch device each batch is moved to before the forward pass.
            Tensors that already live there are returned unchanged by ``.to``,
            so pre-placed datasets keep working.
        model: module to train; switched to training mode.
        iterator: iterable of batches where ``batch[0]`` is the input and
            ``batch[1]`` the target.
        optimizer: optimizer stepped once per batch.
        criterion: callable ``criterion(prediction, target)`` returning a
            scalar loss tensor.
        clip: maximum total gradient norm, applied before every step.

    Returns:
        Sum of the per-batch losses divided by the number of batches, or
        ``nan`` when the iterator yields no batches (the mean is undefined).
    """
    model.train()
    epoch_loss = 0.0
    n_batches = 0
    for batch in iterator:
        x = batch[0].to(device)
        y = batch[1].to(device)
        optimizer.zero_grad()
        y_pred = model(x)
        loss = criterion(y_pred, y)
        loss.backward()
        # Rescale gradients so their global norm does not exceed `clip`.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
        n_batches += 1

    if n_batches == 0:
        return float('nan')
    return epoch_loss / n_batches

In [21]:
def evaluator(device, model, iterator, criterion):
    """Compute the mean loss per batch without updating the model.

    Args:
        device: torch device each batch is moved to before the forward pass.
            Tensors that already live there are returned unchanged by ``.to``.
        model: module to evaluate; switched to evaluation mode.
        iterator: iterable of batches where ``batch[0]`` is the input and
            ``batch[1]`` the target.
        criterion: callable ``criterion(prediction, target)`` returning a
            scalar loss tensor.

    Returns:
        Sum of the per-batch losses divided by the number of batches, or
        ``nan`` when the iterator yields no batches. ``nan`` never compares
        lower than a previous best, so an empty validation set cannot trigger
        a checkpoint.
    """
    model.eval()
    epoch_loss = 0.0
    n_batches = 0
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch in iterator:
            x = batch[0].to(device)
            y = batch[1].to(device)
            y_pred = model(x)
            epoch_loss += criterion(y_pred, y).item()
            n_batches += 1

    if n_batches == 0:
        return float('nan')
    return epoch_loss / n_batches

In [22]:
# Lowest validation loss seen so far; any finite loss beats the initial value.
best_valid_loss = float("inf")

In [23]:
# Initial step size handed to the optimizer.
learning_rate = 0.0001

In [24]:
# Upper bound on the number of training epochs.
epochs = 10_000
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [25]:
error_nn = []              # mean training loss of every completed epoch
n_passes_not_saved = 0     # epochs since the validation loss last improved

# Split ONCE, before training. Re-drawing the split every epoch lets samples
# used for validation in one epoch be trained on in the next, so the validation
# loss (and the "best" checkpoint chosen from it) would not measure held-out
# performance.
train_size = int(0.8 * len(full_dataset))
test_size = len(full_dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(full_dataset, [train_size, test_size])
# With a fixed split, shuffle the training batches so their order still
# changes from epoch to epoch.
train_iterator = torch.utils.data.DataLoader(train_dataset,
                                             batch_size=8, shuffle=True, num_workers=0)
valid_iterator = torch.utils.data.DataLoader(test_dataset,
                                             batch_size=8, shuffle=False, num_workers=0)

for epoch in range(epochs):

    start_time = time.time()

    train_loss = trainer(device, model, train_iterator, optimizer, criterion)
    valid_loss = evaluator(device, model, valid_iterator, criterion)
    error_nn.append(train_loss)

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s Train Loss: {train_loss:.3f} Val. Loss: {valid_loss:.3f}')

    # Checkpoint only when the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), model_fn)
        print('saved', model_fn)
        print()
        n_passes_not_saved = 0
    else:
        n_passes_not_saved += 1

    # After more than 50 epochs without improvement, shrink the learning rate
    # of every parameter group slightly and restart the patience counter.
    if n_passes_not_saved > 50:
        for g in optimizer.param_groups:
            g['lr'] *= 0.998
        print(f"learning rate is now {g['lr']:.8f}")
        n_passes_not_saved = 0

RuntimeError: CUDA out of memory. Tried to allocate 978.00 MiB (GPU 0; 10.76 GiB total capacity; 8.59 GiB already allocated; 759.50 MiB free; 8.62 GiB reserved in total by PyTorch)

In [None]:
# Training-loss curve: one point per completed epoch. Labelled so the figure
# can be read on its own; the trailing ';' suppresses the text repr.
plot(error_nn)
xlabel('epoch')
ylabel('training loss')
title('Speech detector: training loss per epoch');

In [None]:
# Save the weights as they are now under a separate "<name>_last<ext>" file.
# Writing them to model_fn would overwrite the checkpoint that the training
# loop stores there whenever the validation loss improves.
_ckpt_root, _ckpt_ext = os.path.splitext(model_fn)
last_model_fn = f'{_ckpt_root}_last{_ckpt_ext}'
torch.save(model.state_dict(), last_model_fn)