In [1]:
import datetime
import os

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from dataset import AudioDataset
from model import SimpleConvModel

In [2]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('Torch version: ', torch.__version__)
print('Device: ', device)

print(torch.backends.cudnn.version())
# Let cuDNN benchmark and pick convolution algorithms for the input shapes it sees.
torch.backends.cudnn.benchmark = True

# Dataset location. Set the SPEECH_COMMANDS_DIR environment variable to point the
# notebook at another copy of the data without editing this cell.
root_dir = os.environ.get('SPEECH_COMMANDS_DIR',
                          '/home/apelykh/datasets/speech_commands_v0.02')
seed = 123

# Seed the global NumPy and PyTorch generators so random draws (e.g. the shuffling
# order of the DataLoaders) start from the same state on every run. The same
# `seed` is also handed to AudioDataset in the cells below.
np.random.seed(seed)
torch.manual_seed(seed)

Torch version:  1.4.0
Device:  cuda
7603


### Supervised classification

Let's build a supervised classifier to distinguish between 10 voice commands.

In [14]:
# Build the train and validation splits from the same root directory and seed
# (train first, then val).
d_train, d_val = (
    AudioDataset(root_dir, mode=split, seed=seed) for split in ('train', 'val')
)

# Number of samples per mini-batch, shared by every DataLoader in this notebook.
batch_size = 256

def collate_fn(batch):
    """Merge a list of dataset samples into one mini-batch.

    Args:
        batch: sequence of samples, each a mapping with an 'mfcc' array and a
            'label' value. All 'mfcc' arrays must share one shape.

    Returns:
        A `(features, labels)` tuple of float32 tensors. `features` has shape
        `(len(batch), 1, *mfcc_shape)` (a singleton axis is inserted in front
        of every sample) and `labels` has shape `(len(batch),)`.
    """
    if not batch:
        # Nothing to stack; keep returning a pair of empty tensors.
        return torch.Tensor([]), torch.Tensor([])

    # Stack into one contiguous ndarray before handing it to torch: building a
    # tensor from a Python list of ndarrays converts element by element and is
    # much slower.
    features = np.stack([np.expand_dims(sample['mfcc'], axis=0) for sample in batch])
    labels = [sample['label'] for sample in batch]

    return torch.from_numpy(features).float(), torch.Tensor(labels)


# The host-to-device copies in this notebook are issued with `non_blocking=True`;
# they can only overlap with computation when the source batch lives in pinned
# (page-locked) host memory, so ask the loaders for pinned batches on CUDA.
pin_memory = device.type == 'cuda'

# Shuffled mini-batch iterators over the train and validation splits.
train_loader = DataLoader(dataset=d_train, batch_size=batch_size,
                          collate_fn=collate_fn, shuffle=True,
                          pin_memory=pin_memory)
val_loader = DataLoader(dataset=d_val, batch_size=batch_size,
                        collate_fn=collate_fn, shuffle=True,
                        pin_memory=pin_memory)

In [15]:
# Report the number of samples and of mini-batches for each split, one per line.
print('\n'.join(
    '{} {}'.format(caption, count)
    for caption, count in (
        ('Train set length: ', len(d_train)),
        ('Number of train batches: ', len(train_loader)),
        ('Val set length: ', len(d_val)),
        ('Number of val batches: ', len(val_loader)),
    )
))

Train set length:  30837
Number of train batches:  121
Val set length:  3854
Number of val batches:  16


In [16]:
# Instantiate the classifier, then move it to the selected device.
model = SimpleConvModel()
model = model.to(device)

In [17]:
# Sanity check: draw one training batch and print the shapes coming out of the
# DataLoader, then the shape of the model output for that batch.
features, labels = next(iter(train_loader))
print(features.shape, labels.shape, sep='\n')

out = model(features.to(device, non_blocking=True))
print(out.shape)

torch.Size([256, 1, 20, 44])
torch.Size([256])
torch.Size([256, 10])


In [18]:
# Cross-entropy objective, placed on the training device.
loss_func = nn.CrossEntropyLoss()
loss_func = loss_func.to(device)

# Adam over all model parameters; betas and eps are spelled out explicitly.
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=1e-4,
    betas=(0.9, 0.999),
    eps=1e-08,
)

### Training the model

In [19]:
# Training schedule: the loop runs epochs start_epoch .. start_epoch + num_epochs - 1.
start_epoch, num_epochs = 0, 5

# To continue from saved weights, uncomment the lines below and point `weights`
# at the saved state dict:
# weights = './weights/weights_file'
# model.load_state_dict(torch.load(weights, map_location=device))

In [20]:
# Timestamp used as the run's folder name. Formatted without spaces or colons
# so the path is valid on every filesystem and needs no quoting in a shell.
timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
# Weights are saved in a folder named after the current timestamp.
weights_dir = os.path.join('./weights', timestamp)
# exist_ok avoids the check-then-create race and makes the cell safe to re-run.
os.makedirs(weights_dir, exist_ok=True)

# Per-epoch loss histories filled in by the training loop.
train_loss = []
val_loss = []

In [None]:
# Training is very slow, probably because of the heavy feature calculation in the Dataset
# Option 1: load the data into memory first and get it from there while training
# Option 2: pre-compute features and store them on the drive
# Option 3: write an async queue, where batches will be stored and taken from while training (?)

# Low GPU utilization, TODO: find the reason

# One iteration per epoch: a full pass over the training set, a full pass over
# the validation set, then a checkpoint of the model weights.
for epoch in range(start_epoch, start_epoch + num_epochs):
    # ---- training pass ----
    # Make sure layers with train/eval-specific behaviour are in training mode
    # (the validation pass below switches the model to eval mode).
    model.train()
    epoch_train_loss = []
    for i, (features, labels) in enumerate(train_loader):
        features = features.to(device, non_blocking=True)
        # The loss expects integer class indices; collate_fn yields float labels.
        labels = labels.long().to(device, non_blocking=True)

        model.zero_grad()
        out = model(features)
        loss = loss_func(out, labels)
        loss.backward()
        optimizer.step()

        # Read the scalar back once and reuse it for both the history and the log line.
        batch_loss = loss.item()
        epoch_train_loss.append(batch_loss)

        print('Epoch: {:2d}\tBatch: {:2d}/{:d}\tTrain loss: {:.3e}'
              .format(epoch + 1, i + 1, len(train_loader), batch_loss))

    avg_train_loss = np.mean(epoch_train_loss)
    train_loss.append(avg_train_loss)
    print('-' * 50)
    print('Epoch: {:2d}\tTrain loss: {:.3f}'.format(epoch + 1, avg_train_loss))

    # ---- validation pass ----
    # Evaluate on the whole validation split rather than on one random batch, so
    # the reported value is comparable from epoch to epoch.
    model.eval()
    val_loss_sum = 0.0
    val_samples = 0
    with torch.no_grad():
        for features, labels in val_loader:
            labels = labels.long().to(device)
            out = model(features.to(device))
            # loss_func returns the batch mean; weight it by the batch size so a
            # smaller final batch does not skew the average.
            n_samples = labels.size(0)
            val_loss_sum += loss_func(out, labels).item() * n_samples
            val_samples += n_samples
    epoch_val_loss = val_loss_sum / val_samples if val_samples else float('nan')
    val_loss.append(epoch_val_loss)

    print('Epoch: {:2d}\tValidation loss: {:.3f}'.format(epoch + 1, epoch_val_loss))
    print('-' * 50)

    # Checkpoint name carries the batch size, the 0-based epoch index and the validation loss.
    weights_file = os.path.join(weights_dir, 'simpleConv_bs{}_{:04d}_{:.3f}.pt'.format(
        batch_size, epoch, epoch_val_loss))
    torch.save(model.state_dict(), weights_file)

### Plotting loss values

In [None]:
# Loss curves, one point per completed epoch. `train_loss` and `val_loss` only
# hold the epochs of the current run, so the x values are offset by
# `start_epoch` to stay aligned with the axis limits when training is resumed.
fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(list(range(start_epoch, start_epoch + len(train_loss))), train_loss,
        label='Train loss')
ax.plot(list(range(start_epoch, start_epoch + len(val_loss))), val_loss,
        label='Validation loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_xlim(start_epoch, start_epoch + num_epochs)
ax.legend()
fig.tight_layout()
# Keep the figure next to the checkpoints of this run.
fig.savefig(os.path.join(weights_dir, 'loss.png'))

### Testing the model

In [None]:
# Restore saved weights into the model before testing.
# NOTE(review): this path looks like a placeholder; presumably it should point
# at one of the .pt checkpoints written by the training loop. Confirm.
# NOTE(review): the model is not switched to eval mode here; consider calling
# model.eval() before running inference on the test set.
weights = './weights/weight_file.pt'
state_dict = torch.load(weights, map_location=device)
model.load_state_dict(state_dict)

In [None]:
# Test split, constructed with the same root directory and seed as the
# train/val datasets.
d_test = AudioDataset(root_dir, mode='test', seed=seed)
# Use the DataLoader class imported at the top of the notebook; no module named
# `data` is ever imported.
test_loader = DataLoader(dataset=d_test, batch_size=batch_size,
                         collate_fn=collate_fn, shuffle=True)