In [None]:
# Install (or upgrade to) the pinned torchtext 0.8.0 release from inside the notebook.
# NOTE(review): presumably pinned because later cells use `build_vocab_from_iterator`
# and `vocab.stoi` as they exist in this release — confirm before bumping the version.
!pip install -U torchtext==0.8.0

In [None]:
# Mount Google Drive so the dataset and checkpoint paths below are reachable.
from google.colab import drive
drive.mount('/content/gdrive')

# Base folders: the Colab working directory, the project folder on Drive, and the
# UrbanSound8K dataset folder inside it.
CONTENT_FOLDER = "/content"
DRIVE_FOLDER = "/content/gdrive/MyDrive/miniproj"
DATASET_FOLDER = F'{DRIVE_FOLDER}/UrbanSound8K'

# Checkpoint file name and its full path on Drive (used for torch.save / torch.load).
model_save_name = 'classifier.pt'
MODEL_PATH = F"{DRIVE_FOLDER}/{model_save_name}" 

# Switches for the download/extract cell: set to True to fetch and/or unpack the archive.
DOWNLOAD_DATASET = False
EXTRACT_DATASET = False

# DATASET_OPTIONS = ['GET_WAVEFORM', 'SAVE_CSV', 'LOAD_CSV', 'SKIP']
#   GET_WAVEFORM - load waveforms from the audio files
#   SAVE_CSV     - load waveforms from the audio files and also write them to waveforms.csv
#   LOAD_CSV     - read waveforms.csv instead of the audio files
#   SKIP         - do not load or split any data
DATASET_OPTION = 'GET_WAVEFORM'

In [None]:
import os.path
import tarfile

%cd /content/gdrive/MyDrive/miniproj

if DOWNLOAD_DATASET:
    ! wget https://goo.gl/8hY5ER -O dataset.tar.gz

if EXTRACT_DATASET:
    tar = tarfile.open('dataset.tar.gz', 'r:gz')
    tar.extractall()
    tar.close()

%cd /content

In [None]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class TransformerModel(nn.Module):
    """Sequence model: embedding -> positional encoding -> Transformer encoder -> linear head.

    Args:
        ntoken: vocabulary size (embedding rows and output logits per position).
        ninp: embedding / model dimension.
        nhead: number of attention heads per encoder layer.
        nhid: feed-forward dimension inside each encoder layer.
        nlayers: number of stacked encoder layers.
        dropout: dropout probability passed to the positional encoder and encoder layers.
    """

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super().__init__()
        self.model_type = 'Transformer'
        # Sub-modules are created in this exact order so that seeded random
        # initialisation yields the same weights as before.
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        single_layer = nn.TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = nn.TransformerEncoder(single_layer, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def generate_square_subsequent_mask(self, sz):
        """Return a (sz, sz) float mask: 0.0 on and below the diagonal, -inf above it."""
        visible = torch.tril(torch.ones(sz, sz)) == 1
        blank = torch.zeros(sz, sz, dtype=torch.float)
        return blank.masked_fill(~visible, float('-inf'))

    def init_weights(self):
        """Uniformly initialise embedding and decoder weights in [-0.1, 0.1]; zero the decoder bias."""
        bound = 0.1
        nn.init.uniform_(self.encoder.weight, -bound, bound)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -bound, bound)

    def forward(self, src, src_mask):
        """Embed `src`, add positional encodings, encode under `src_mask`, and project to logits."""
        embedded = self.encoder(src) * math.sqrt(self.ninp)
        encoded = self.transformer_encoder(self.pos_encoder(embedded), src_mask)
        return self.decoder(encoded)

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    Args:
        d_model: size of the last (feature) dimension of the inputs; may be odd or even.
        dropout: dropout probability applied after adding the encodings.
        max_len: number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        # Even feature columns take the sine, odd columns the cosine.
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even column, so the
        # cosine term is trimmed to match instead of raising a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # Stored as (max_len, 1, d_model) so it broadcasts over the batch dimension in forward().
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return dropout(x + encodings for the first x.size(0) positions)."""
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [None]:
# NOTE(review): this only binds a Python variable named CUDA_LAUNCH_BLOCKING; it does not
# modify os.environ, and no visible cell reads the variable. If synchronous CUDA launches
# were intended for debugging, the environment variable would have to be set instead
# (e.g. via os.environ) before CUDA is initialised — confirm intent.
CUDA_LAUNCH_BLOCKING="1"

import io
import torch
import pandas as pd
from torchtext.vocab import build_vocab_from_iterator
import numpy as np
import librosa
import librosa.display
from tqdm import tqdm_notebook as tqdm

# Sentinel values that get_waveform() inserts at the start (SOF) and appends at the
# end (EOF) of every waveform.
# NOTE(review): presumably chosen to lie outside the range of real sample values — confirm.
SOF = 299.
EOF = 99.

def get_waveform(idx, rows):
    """Load the audio clip described by ``rows[idx]`` and wrap it in SOF/EOF markers.

    ``rows[idx][0]`` is used as the file name and ``rows[idx][1]`` as the fold number.
    The clip is loaded at 8000 Hz, rounded to 4 decimals, and returned as an array
    that starts with SOF and ends with EOF.
    """
    entry = rows[idx]
    clip_name, fold_id = entry[0], entry[1]
    clip_path = f'{DATASET_FOLDER}/audio/fold{fold_id}/{clip_name}'
    samples, _rate = librosa.load(clip_path, sr=8000)
    rounded = np.around(samples, 4)
    # Append the end marker first, then put the start marker in front.
    return np.insert(np.append(rounded, EOF), 0, SOF)

def get_audio_data_from_csv(category='dog_bark'):
    """Load every waveform of one class listed in the UrbanSound8K metadata CSV.

    Args:
        category: value of the ``class`` column to select.

    Returns:
        A list with one array per matching metadata row (in file order), each produced
        by ``get_waveform``.

    Raises:
        KeyError: if no metadata row has the requested class.
    """
    df = pd.read_csv(F'{DATASET_FOLDER}/metadata/UrbanSound8K.csv')

    # Select only the rows of the requested class; there is no need to materialise
    # an array for every class when a single one is used.
    selected = df.loc[df['class'] == category, ['slice_file_name', 'fold']]
    if selected.empty:
        # Same exception type the previous group lookup raised for an unknown class.
        raise KeyError(category)
    rows = selected.to_numpy()

    return [get_waveform(idx, rows) for idx in tqdm(range(len(rows)))]

In [None]:
# Optional: save data to csv
import csv

if DATASET_OPTION == 'GET_WAVEFORM' or DATASET_OPTION == 'SAVE_CSV':
    data = get_audio_data_from_csv()

    if DATASET_OPTION == 'SAVE_CSV':
        # One waveform per CSV row; newline='' lets the csv module control line endings.
        with open(F"{DRIVE_FOLDER}/waveforms.csv", "w+", newline='') as my_csv:
            csvWriter = csv.writer(my_csv, delimiter=',')
            csvWriter.writerows(data)
elif DATASET_OPTION == 'LOAD_CSV':
    # Rows have different lengths and there is no header row, so read them with the
    # csv module and rebuild one float array per waveform. The result is bound to
    # `data`, which is the name the split below (and later cells) expect.
    with open(F"{DRIVE_FOLDER}/waveforms.csv", newline='') as my_csv:
        data = [np.array(row, dtype=float)
                for row in csv.reader(my_csv, delimiter=',') if row]

if DATASET_OPTION != 'SKIP':
    # 80/20 train/validation split; the validation part starts at index b so that
    # no sample is dropped between the two parts.
    b = int(len(data)*0.8)
    train_data = data[0:b]
    val_data = data[b:]

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Build the vocabulary from all of `data` (the train/validation split is taken from this
# same list), passing each waveform as one sequence of tokens.
vocab = build_vocab_from_iterator(iter(data))

def waveform_detokenizer(iter):
    """Map each item to ``round(item / 10000 - 1, 4)`` and return the results as a list of arrays."""
    restored = []
    for item in iter:
        scaled = np.array(item) / 10000
        restored.append(np.round(scaled - 1, 4))
    return restored

def data_process(iter):
    """Convert rows of tokens to vocabulary indices and concatenate them into one 1-D long tensor.

    Rows that produce an empty tensor are skipped before concatenation.
    """
    encoded_rows = []
    for row in iter:
        ids = torch.tensor([vocab[token] for token in row], dtype=torch.long)
        if ids.numel() > 0:
            encoded_rows.append(ids)
    return torch.cat(tuple(encoded_rows))

def batchify(data, bsz):
    """Arrange a 1-D tensor into ``bsz`` parallel columns and move it to ``device``.

    Trailing elements that do not fill a whole row across all columns are dropped.
    The result has shape (len(data) // bsz, bsz).
    """
    rows_per_column = data.size(0) // bsz
    trimmed = data.narrow(0, 0, rows_per_column * bsz)
    columns = trimmed.view(bsz, -1).t().contiguous()
    return columns.to(device)

def get_batch(source, i):
    """Return (inputs, targets) for the chunk of ``source`` starting at row ``i``.

    The chunk has at most ``bptt`` rows; targets are the rows shifted by one
    position, flattened to 1-D.
    """
    stop = i + min(bptt, len(source) - 1 - i)
    inputs = source[i:stop]
    shifted = source[i + 1:stop + 1]
    return inputs, shifted.reshape(-1)

# Chunk length used by get_batch, and the number of parallel columns for each split.
bptt = 35
batch_size = 32
eval_batch_size = 16

# Encode each split with the vocabulary, then lay it out in batch columns on `device`.
# NOTE: this rebinds train_data / val_data from lists of waveforms to tensors, so the
# cell must not be re-run without re-running the split cell first.
train_data = batchify(data_process(iter(train_data)), batch_size)
val_data = batchify(data_process(iter(val_data)), eval_batch_size)

In [None]:
import os.path

ntokens = len(vocab.stoi) # the size of vocabulary
emsize = 200 # embedding dimension
nhid = 200 # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 32 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2 # the number of heads in the multiheadattention models
dropout = 0.2 # the dropout value

model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)

# Restore model weights from an existing checkpoint, if there is one.
if os.path.isfile(MODEL_PATH):
    print('loading checkpoint...')
    # map_location lets a checkpoint saved on a GPU runtime load on a CPU-only runtime.
    checkpoint = torch.load(MODEL_PATH, map_location=device)
    # load_state_dict copies into the existing parameters, which already live on `device`.
    model.load_state_dict(checkpoint['model_state_dict'])

In [None]:
criterion = nn.CrossEntropyLoss()
lr = 0.1 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
# Multiply the learning rate by 0.1 every 5 scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 5, gamma=0.1)

# Restore optimizer state from an existing checkpoint, if there is one.
if os.path.isfile(MODEL_PATH):
    print('loading checkpoint...')
    # map_location lets a checkpoint saved on a GPU runtime load on a CPU-only runtime.
    checkpoint = torch.load(MODEL_PATH, map_location=device)
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

import time
def train():
    """Run one training pass over ``train_data`` and print a progress line every ``log_interval`` batches.

    Reads the notebook globals ``model``, ``train_data``, ``bptt``, ``optimizer``,
    ``criterion``, ``scheduler``, ``ntokens``, ``device`` and ``epoch``.
    """
    model.train() # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    log_interval = 3000
    src_mask = model.generate_square_subsequent_mask(bptt).to(device)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        optimizer.zero_grad()
        # A chunk shorter than bptt needs a mask of matching size.
        if data.size(0) != bptt:
            src_mask = model.generate_square_subsequent_mask(data.size(0)).to(device)
        output = model(data, src_mask)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        # Clip the global gradient norm to 0.5 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            # get_last_lr() reports the rate currently in use; get_lr() is only meant
            # to be called from inside scheduler.step().
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(train_data) // bptt, scheduler.get_last_lr()[0],
                    elapsed * 1000 / log_interval,
                    cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

def evaluate(eval_model, data_source):
    """Return the average loss of ``eval_model`` over ``data_source``.

    Args:
        eval_model: model to evaluate; it is switched to eval mode.
        data_source: batchified tensor as produced by ``batchify``.

    Returns:
        The sum of per-chunk losses, each weighted by its chunk length, divided by
        ``len(data_source) - 1``.
    """
    eval_model.eval()
    total_loss = 0.
    # Masks come from the model being evaluated, not from the global `model`.
    src_mask = eval_model.generate_square_subsequent_mask(bptt).to(device)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            # A chunk shorter than bptt needs a mask of matching size.
            if data.size(0) != bptt:
                src_mask = eval_model.generate_square_subsequent_mask(data.size(0)).to(device)
            output = eval_model(data, src_mask)
            output_flat = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output_flat, targets).item()
    return total_loss / (len(data_source) - 1)

In [None]:
best_val_loss = float("inf")
epochs = 10 # The number of epochs
best_model = None
epoch_losses = np.array([])
epoch = 1

# Resume bookkeeping from an existing checkpoint, if there is one.
if os.path.isfile(MODEL_PATH):
    print('loading checkpoint...')
    # map_location lets a checkpoint saved on a GPU runtime load on a CPU-only runtime.
    checkpoint = torch.load(MODEL_PATH, map_location=device)
    # The checkpoint is written after its epoch has been trained and evaluated, so
    # training continues with the following epoch instead of repeating that one.
    epoch = checkpoint['epoch'] + 1
    saved_loss = checkpoint['loss']

    if saved_loss < best_val_loss:
        best_val_loss = saved_loss

while epoch < epochs + 1:
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(model, val_data)
    epoch_losses = np.append(epoch_losses, val_loss)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)

    # Persist a checkpoint whenever the validation loss improves.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # NOTE: this is a reference to the live model, not a snapshot; the file
        # written below is what actually preserves the best weights.
        best_model = model
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': val_loss,
        }, MODEL_PATH)

    epoch = epoch + 1
    scheduler.step()

In [None]:
import matplotlib.pyplot as plt
# Validation loss recorded at the end of each epoch executed in this run.
fig, ax = plt.subplots()
ax.plot(epoch_losses)
ax.set(title='Validation loss per epoch',
       xlabel='Epoch (index within this run)',
       ylabel='Validation loss')
plt.show()