**Frame-Level Classification of Speech**

Author: Brian Yan

Implemented for Kaggle: https://www.kaggle.com/c/11-785-s20-hw1p2/overview

Adapted from the MNIST MLP example provided by the Deep Learning course at CMU.

In [0]:
## Run this code to ensure high ram instance in Google Colab

# a = []
# while(1):
#     a.append('1')

In [0]:
version = 'm6'

**SETUP**

Mounting drive, installing packages, downloading dataset from Kaggle

In [0]:
# !mkdir -p ~/.kaggle
# !cp kaggle.json ~/.kaggle/
# !chmod 600 ~/.kaggle/kaggle.json
# !ls ~/.kaggle

In [0]:
# !ls -l ~/.kaggle
# !cat ~/.kaggle/kaggle.json

In [0]:
# !pip install -q kaggle
# !pip install -q kaggle-cli

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
root_path = '/content/gdrive/My Drive/Frame-Level Classification of Speech/'  #change dir to your project folder

In [0]:
# !kaggle competitions download -c 11-785-s20-hw1p2 -p /content/gdrive/My\ Drive/Frame-Level\ Classification\ of\ Speech

In [0]:
# import os
# os.chdir(root_path)  #change dir
# !ls

In [0]:
# !unzip dev_labels.npy.zip
# !unzip dev.npy.zip
# !unzip test.npy.zip
# !unzip train.npy.zip
# !unzip hw1p2_sample_submission.csv.zip

In [0]:
# !rm dev_labels.npy.zip
# !rm dev.npy.zip
# !rm test.npy.zip
# !rm train.npy.zip
# !rm hw1p2_sample_submission.csv.zip

In [0]:
# !unzip train_labels.npy.zip
# !rm train_labels.npy.zip

In [0]:
# !pip3 install http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl 

In [0]:
# !pip3 install torchvision

In [0]:
import numpy as np
import torch
import sys
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils import data
from torchvision import transforms
from torchvision.datasets import MNIST

import matplotlib.pyplot as plt
import time

In [5]:
cuda = torch.cuda.is_available()
cuda

True

**DATA LOADER**

Custom data loader class. The dataset is composed of frames of audio, and a single frame does not contain enough context for predictions, so it is preferable to use a window of frames in this task. This implementation uses k = 9 context frames on each side of the target frame, for a window size of 2k + 1 = 19 frames.

In [0]:
from torch.utils.data import DataLoader, Dataset, TensorDataset

In [0]:
train_x = np.load(root_path + "/train.npy", allow_pickle=True)

In [0]:
train_y = np.load(root_path + "/train_labels.npy", allow_pickle=True)

In [0]:
# Build a flat (utterance, frame) lookup table so that one integer index can
# address any frame of any utterance when served through a DataLoader.
def flatten_frames(x_list):
    """Return every (utterance index, frame index) pair in ``x_list``.

    ``x_list`` is a sequence of arrays; the first dimension of each array is
    treated as its number of frames. Pairs are listed utterance by utterance,
    frames in ascending order. An utterance with zero frames contributes
    no pairs.
    """
    return [
        (utt_idx, frame_idx)
        for utt_idx, utterance in enumerate(x_list)
        for frame_idx in range(utterance.shape[0])
    ]

In [0]:
class myDataset(Dataset):
    """Frame-level dataset that serves each frame together with its context.

    Every item is one target frame plus ``k`` frames on either side, flattened
    into a single 1-D float tensor. Positions that would fall outside the
    utterance are filled by repeating the first / last frame (index clipping).

    Args:
        x: sequence of per-utterance arrays, frames along axis 0.
        y: per-utterance label sequences indexed as ``y[utterance][frame]``,
           or ``None`` when no labels are available.
        k: number of context frames on each side of the target frame.
    """

    def __init__(self, x, y, k = 9):
        self.x, self.y, self.k = x, y, k
        # One (utterance, frame) entry per servable item.
        self.idx_to_frame = flatten_frames(x)

    def __len__(self):
        """Total number of frames across all utterances."""
        return len(self.idx_to_frame)

    def __getitem__(self, idx):
        """Return ``(flattened_window, label)`` for flat index ``idx``.

        The label is ``-1`` when the dataset was built without labels.
        """
        utt, center = self.idx_to_frame[idx]
        window = range(center - self.k, center + self.k + 1)
        # mode='clip' pins out-of-range indices to the utterance's edge frames.
        context = self.x[utt].take(window, mode='clip', axis=0)
        features = torch.from_numpy(context.reshape(-1)).float()
        label = -1 if self.y is None else self.y[utt][center]
        return features, label

In [0]:
k = 9

In [0]:
dev_dataset = myDataset(train_x, train_y, k)

In [0]:
# Background loader workers are only requested when a GPU is available;
# otherwise loading stays in the main process (num_workers=0).
num_workers = 8 if cuda else 0
# GPU runs: large shuffled batches with pinned host memory.
# CPU runs: small shuffled batches with the DataLoader defaults.
train_loader_args = dict(shuffle=True, batch_size=1024, num_workers=num_workers, pin_memory=True) if cuda\
                    else dict(shuffle=True, batch_size=64)
# NOTE(review): despite its name, `dev_dataset` wraps train_x / train_y.
train_loader = data.DataLoader(dev_dataset, **train_loader_args)

In [0]:
# Held-out evaluation split: the `test_*` names here refer to the dev.npy /
# dev_labels.npy files, not to the unlabeled test.npy used for predictions.
test_x = np.load(root_path + "/dev.npy", allow_pickle=True)
test_y = np.load(root_path + "/dev_labels.npy", allow_pickle=True)
# Same context half-width k as the training dataset.
test_dataset = myDataset(test_x, test_y, k)
# NOTE(review): shuffle=True is not required for evaluation — confirm it is intentional.
test_loader_args = dict(shuffle=True, batch_size=1000, num_workers=num_workers, pin_memory=True) if cuda\
                    else dict(shuffle=True, batch_size=1000)
test_loader = data.DataLoader(test_dataset, **test_loader_args)

**MODEL**

The implemented MLP is a deep NN with batch normalization and dropout at each hidden layer. Dropout at 10% boosted accuracy on the validation set by 3%, achieving 65+% accuracy.

The layer sizes were chosen based on the input size. The first layers are wider to capture more information from the features, while the later layers can be narrower. The depth of the model boosts accuracy.

In [0]:
# Input is a flattened window of (2k + 1) frames.
# NOTE(review): 40 is presumably the per-frame feature dimension — confirm against the data.
in_size = (2 * k + 1) * 40
# NOTE(review): 138 is presumably the number of target classes — confirm against the labels.
out_size = 138

# MLP with four hidden blocks of Linear -> BatchNorm1d -> ReLU -> Dropout(0.1),
# with widths 2*in_size, in_size, in_size, in_size // 2, followed by a final
# Linear layer producing out_size raw scores (no softmax here).
# The position of each layer in this Sequential determines its state_dict key,
# so reordering layers would break loading of previously saved checkpoints.
model = nn.Sequential(
                # Hidden block 1: widen to twice the input size.
                nn.Linear(in_size, in_size*2),
                nn.BatchNorm1d(in_size*2),
                nn.ReLU(),
                nn.Dropout(.1),

                # Hidden block 2: back down to the input size.
                nn.Linear(in_size*2, in_size),
                nn.BatchNorm1d(in_size),
                nn.ReLU(),
                nn.Dropout(.1),

                # Hidden block 3: same width.
                nn.Linear(in_size, in_size),
                nn.BatchNorm1d(in_size),
                nn.ReLU(),
                nn.Dropout(.1),

                # Hidden block 4: narrow to half the input size.
                nn.Linear(in_size, in_size // 2),
                nn.BatchNorm1d(in_size // 2),
                nn.ReLU(),
                nn.Dropout(.1),

                # Output layer: one score per class.
                nn.Linear(in_size // 2, out_size)
            )

In [0]:
save_path = root_path + version+ ".pt"

In [91]:
# Resume from the checkpoint at save_path. map_location='cpu' lets a checkpoint
# that was saved from a CUDA model load on a CPU-only runtime as well;
# load_state_dict then copies the values into the model's parameters on
# whatever device they currently live.
model.load_state_dict(torch.load(save_path, map_location='cpu'))

<All keys matched successfully>

In [92]:
# Choose the device once and move the model there. An unconditional
# model.cuda() would raise on a runtime without a GPU, defeating the CPU
# fallback used everywhere else in this notebook.
device = torch.device("cuda" if cuda else "cpu")
model.to(device)
# Build the optimizer only after the model is on its final device, so it
# tracks the parameters that are actually trained.
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()
print(model, device)

Sequential(
  (0): Linear(in_features=760, out_features=1520, bias=True)
  (1): BatchNorm1d(1520, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU()
  (3): Dropout(p=0.1, inplace=False)
  (4): Linear(in_features=1520, out_features=760, bias=True)
  (5): BatchNorm1d(760, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (6): ReLU()
  (7): Dropout(p=0.1, inplace=False)
  (8): Linear(in_features=760, out_features=760, bias=True)
  (9): BatchNorm1d(760, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (10): ReLU()
  (11): Dropout(p=0.1, inplace=False)
  (12): Linear(in_features=760, out_features=380, bias=True)
  (13): BatchNorm1d(380, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (14): ReLU()
  (15): Dropout(p=0.1, inplace=False)
  (16): Linear(in_features=380, out_features=138, bias=True)
) cuda


In [93]:
!nvidia-smi

Sat Feb  8 03:56:15 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.48.02    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P0    35W / 250W |    889MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
+-------

**Train**

In [0]:
def train_epoch(model, train_loader, criterion, optimizer):
    """Train ``model`` for one pass over ``train_loader``.

    Batches are moved to the module-level ``device`` before the forward pass.
    Prints the mean per-batch loss and the wall-clock time of the pass.

    Returns:
        The sum of per-batch losses divided by the number of batches.
    """
    model.train()

    loss_sum = 0.0
    started = time.time()

    for inputs, labels in train_loader:
        # Model and data must live on the same device.
        inputs, labels = inputs.to(device), labels.to(device)

        # Clear stale gradients: .backward() accumulates into .grad.
        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), labels)
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()

    elapsed = time.time() - started

    mean_loss = loss_sum / len(train_loader)
    print('Training Loss: ', mean_loss, 'Time: ', elapsed, 's')
    return mean_loss

In [0]:
def test_model(model, test_loader, criterion):
    """Evaluate ``model`` on ``test_loader`` without tracking gradients.

    Batches are moved to the module-level ``device``. Prints and returns the
    mean per-batch loss and the accuracy in percent.

    Returns:
        ``(mean_loss, accuracy_percent)``.
    """
    model.eval()

    loss_sum = 0.0
    seen = 0.0
    hits = 0.0

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            scores = model(inputs)

            # Index of the highest score along the class dimension.
            predicted = torch.max(scores.data, 1)[1]
            seen += labels.size(0)
            hits += (predicted == labels).sum().item()

            loss_sum += criterion(scores, labels).detach().item()

    mean_loss = loss_sum / len(test_loader)
    acc = (hits/seen)*100.0
    print('Testing Loss: ', mean_loss)
    print('Testing Accuracy: ', acc, '%')
    return mean_loss, acc

In [0]:
# Number of full passes over the training loader.
n_epochs = 100
# Per-epoch history: mean training loss, mean evaluation loss, evaluation accuracy (%).
Train_loss = []
Test_loss = []
Test_acc = []

for i in range(n_epochs):
    train_loss = train_epoch(model, train_loader, criterion, optimizer)
    # Evaluation currently runs after every epoch; the disabled condition
    # below would restrict it to every fifth epoch.
    # if i % 5 == 0:
    test_loss, test_acc = test_model(model, test_loader, criterion)
    Train_loss.append(train_loss)
    Test_loss.append(test_loss)
    Test_acc.append(test_acc)
    print('='*20)
    # Checkpoint after each epoch; every save overwrites the same save_path.
    torch.save(model.state_dict(), save_path)

In [0]:
torch.save(model.state_dict(), save_path)

**PREDICTIONS**

In [0]:
def predict(model, predict_loader):
    """Run inference over ``predict_loader`` and collect ids and predictions.

    Each batch may be either ``(data, target)`` or ``(data, target, idx_val)``.
    When the loader supplies ``idx_val`` it is used as the ids of that batch;
    otherwise ids are assigned sequentially in iteration order (0, 1, 2, ...),
    which matches the row order only if the loader does not shuffle.
    The ``target`` entry is ignored.

    Inputs are moved to the device holding the model's parameters, so the
    function does not depend on a module-level ``device``.

    Args:
        model: a ``torch.nn.Module`` producing one score per class.
        predict_loader: iterable of batches as described above.

    Returns:
        ``(ids, preds)``: two lists with one NumPy array per batch, holding the
        ids and the arg-max class index of every sample in that batch.
    """
    model.eval()

    # None for a parameter-free model, in which case inputs are left in place.
    first_param = next(model.parameters(), None)

    ids = []
    preds = []
    next_id = 0  # running offset for synthesised sequential ids
    with torch.no_grad():
        for batch in predict_loader:
            features = batch[0]
            if first_param is not None:
                features = features.to(first_param.device)

            outputs = model(features)
            _, predicted = torch.max(outputs, 1)

            batch_size = predicted.size(0)
            if len(batch) >= 3:
                batch_ids = batch[2].cpu().numpy()
            else:
                batch_ids = torch.arange(next_id, next_id + batch_size).numpy()
            next_id += batch_size

            ids.append(batch_ids)
            preds.append(predicted.cpu().numpy())

    return ids, preds

In [0]:
# Unlabeled data to predict on; passing y=None makes myDataset return -1 as a placeholder label.
pred_x = np.load(root_path + "/test.npy", allow_pickle=True)
pred_dataset = myDataset(pred_x, None, k)
# shuffle=False keeps batches in dataset order, so predictions can be matched
# to rows by position.
pred_loader_args = dict(shuffle=False, batch_size=1000, num_workers=0, pin_memory=True) if cuda\
                    else dict(shuffle=False, batch_size=1000)
pred_loader = data.DataLoader(pred_dataset, **pred_loader_args)

In [0]:
to_csv = predict(model, pred_loader)

In [0]:
id_a, pred_a = to_csv

In [0]:
# id_a / pred_a arrive as one array per batch: the first print shows the
# number of batches, the second the number of individual predictions.
print(len(id_a), len(pred_a))
# Flatten the per-batch arrays into flat per-sample lists.
id_a = [val for sublist in id_a for val in sublist]
pred_a = [val for sublist in pred_a for val in sublist]
print(len(id_a), len(pred_a))

224 224
223592 223592


In [0]:
import csv
import operator

# Write the submission file: a header row followed by one "row index, predicted
# label" row per prediction.
# newline='' is what the csv module requires of files it writes to; without it
# some platforms emit an extra carriage return per row.
with open(root_path + version + '.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    # The header is written unconditionally, so an empty prediction list still
    # yields a well-formed CSV.
    writer.writerow(['id', 'label'])
    for i, label in enumerate(pred_a):
        writer.writerow([i, label])