1. Convert the additional data to WAV format, resampled at 22.05 kHz - the conversion code is available on the local computer
2. Convert audio to hdf5 format

In [None]:
import pandas as pd
import numpy as np


import h5py
import librosa

from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import os
import time
import random

In [None]:
SEED = 42


def random_seed(SEED=SEED):
    """Seed every random number generator used in this notebook.

    Sets the PYTHONHASHSEED environment variable, seeds Python's `random`,
    NumPy and PyTorch (CPU and all CUDA devices), and switches cuDNN to
    deterministic mode with benchmarking disabled.

    Args:
      SEED: int, the seed value (defaults to the notebook-wide SEED)
    """
    os.environ['PYTHONHASHSEED'] = str(SEED)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(SEED)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


random_seed()

In [None]:
df_train = pd.read_csv("../input/giz-data/giz_data/Train.csv")
sample_submission = pd.read_csv("../input/giz-data/giz_data/SampleSubmission.csv")

In [None]:
ORIGINAL_AUDIO_PATH = "../input/giz-data/giz_data/audio_files/"
# Named AdditionalUtterances
ADDITIONAL_AUDIO_PATH_1 = "../input/giz-data/AdditionalUtterancesConverted/AdditionalUtterancesConverted/"
ADDITIONAL_AUDIO_PATH_2 = "../input/giz-data/AdditionalUtterancesConverted_2/AdditionalUtterancesConverted_2/"

In [None]:
# Rewrite the "audio_files/" part of every `fn` value so it points at
# ORIGINAL_AUDIO_PATH.
# regex=False: the pattern is a plain substring, so keep the match literal
# regardless of which default for `regex` the installed pandas version uses.
df_train["fn"] = df_train.fn.str.replace("audio_files/", ORIGINAL_AUDIO_PATH, regex=False)
sample_submission["fn"] = sample_submission.fn.str.replace("audio_files/", ORIGINAL_AUDIO_PATH, regex=False)

In [None]:
df_test = sample_submission[["fn"]].copy()

In [None]:
df_train["fold"] = -1

In [None]:
# fold == -1: the row is always used for training; any other value is the fold in which the row is held out for evaluation

In [None]:
# Assign every original training row to one of 3 stratified validation folds.
# Folds are numbered 1..3 (enumerate index + 1); -1 stays reserved for
# "training only".
# sss = StratifiedShuffleSplit(n_splits = 5, random_state = SEED, test_size = 0.2)
kfold = StratifiedKFold(n_splits = 3, shuffle = True, random_state = SEED)

for fold, (train_idx, valid_idx) in enumerate(kfold.split(df_train, df_train.label.values)):
    
    # NOTE(review): valid_idx holds positions while .loc selects by label;
    # this assumes df_train still has its default 0..n-1 index — confirm.
#     df_train.loc[valid_idx, "fold"] = df_train.loc[valid_idx, "fold"].apply(lambda row: fold + 1 if row == -1 else row)
    df_train.loc[valid_idx, "fold"] = fold + 1 

In [None]:
df_train.fold.value_counts()

In [None]:
# Problems with the additional audio data: similar voicings/duplicates, the sample rate was initially different, and the file type was initially mp3
df_train["weight"] = 0.65

### Add additional audio details to existing dataframe

In [None]:
# Root folders of the two additional datasets; each entry below a root is
# treated as a label name by the collection loops that follow.
# sorted(): os.listdir returns entries in arbitrary order, so sort to keep the
# order of the collected rows reproducible between runs and machines.
base_path_1 = os.path.join(ADDITIONAL_AUDIO_PATH_1, "latest_keywords")
additional_labels_1 = sorted(os.listdir(base_path_1))

base_path_2 = os.path.join(ADDITIONAL_AUDIO_PATH_2, "nlp_keywords")
additional_labels_2 = sorted(os.listdir(base_path_2))

In [None]:
new_fns = []
new_labels = []
# batch = []

In [None]:
# Walk both additional-data roots: every entry found in <root>/<label>/
# contributes one file path to new_fns and its label to new_labels.
for _base_path, _labels in ((base_path_1, additional_labels_1),
                            (base_path_2, additional_labels_2)):
    for label in _labels:
        # sorted(): os.listdir order is arbitrary; sorting keeps the row
        # order (and everything seeded downstream) reproducible.
        samples_in_label = sorted(os.listdir(os.path.join(_base_path, label)))

        for sample in samples_in_label:
            file_path = os.path.join(_base_path, label, sample)
            new_fns.append(file_path)
            new_labels.append(label)

In [None]:
# Build the frame column-wise straight from the two parallel lists; there is
# no need to go through a transposed 2-D NumPy string array first.
df_new = pd.DataFrame({"fn": new_fns, "label": new_labels}, columns = ["fn", "label"])

# df_new["batch"] = batch

In [None]:
# # kept all batch 2 data

# df_new["id_"] = df_new.fn.apply(lambda row: row.split("/")[-1].split("_")[0])
# to_keep = df_new[df_new.batch == 1][["label", "id_"]].drop_duplicates().index.append(df_new[df_new.batch == 2].index)
# df_new = df_new.iloc[to_keep].reset_index(drop = True)
# df_new = df_new.drop("id_", axis = 1)

In [None]:
# df_new.drop("batch", axis = 1, inplace = True)

In [None]:
# Use all for training - identified by -1
df_new["fold"] = -1
df_new["weight"] = 0.35

In [None]:
df_train.shape, df_new.shape

## New df to be used for training

In [None]:
# Concat
df_train_new = pd.concat([df_train, df_new], axis = 0).reset_index(drop = True)

In [None]:
# Sort into a canonical order first so that the seeded shuffle below produces
# the same permutation regardless of the order the rows arrived in.
# "fn" breaks ties inside a label; sorting on "label" alone would leave the
# within-label order dependent on the incoming row order.
df_train_new = df_train_new.sort_values(["label", "fn"]).reset_index(drop = True)
# Shuffle with the notebook-wide SEED rather than a second hard-coded 42
df_train_new = df_train_new.sample(frac=1, random_state = SEED).reset_index(drop = True)

In [None]:
# audio_id = file name without directory and extension.
# splitext (instead of slicing off the last 4 characters) also copes with
# extensions that are not exactly three letters long.
df_train_new["audio_id"] = df_train_new.fn.apply(lambda row: os.path.splitext(os.path.basename(row))[0])

In [None]:
# audio_id = file name without directory and extension.
# splitext (instead of slicing off the last 4 characters) also copes with
# extensions that are not exactly three letters long.
df_test["audio_id"] = df_test.fn.apply(lambda row: os.path.splitext(os.path.basename(row))[0])

### Encode labels

In [None]:
# Quick Label encoding
df_train_new["target"] = df_train_new.label.astype("category").cat.codes

In [None]:
df_train_new.shape, df_test.shape

In [None]:
df_train_new.head()

## CONST

In [None]:
# config/arguments/parameters
# Notebook-wide settings. Most of them are handed positionally to the model
# constructor in the model-building cell; SAMPLE_RATE and CLIP_SAMPLES are
# also used when the audio is packed into HDF5.

# Sampling rate (Hz) passed to librosa when loading audio; 22050 was a previous setting.
SAMPLE_RATE = 16000 #22050
# Every waveform is padded/truncated to this many samples (3 x SAMPLE_RATE).
CLIP_SAMPLES = SAMPLE_RATE * 3 #SAMPLE_RATE * 30
# NOTE(review): hard-coded; presumably the number of distinct labels — confirm against the encoded targets.
NUM_CLASSES = 193 #10
# WORKSPACE_PATH = , HDF5 PATH SHOULD BE DYNAMIC
# When True, the training samplers are created with a doubled batch size.
MIXUP = False
WINDOW_SIZE = 512 #1024# 2048
HOP_SIZE = 160 #320
MEL_BINS = 64
FMIN = 50#20
# Floor division by 2.0 makes this a float (8000.0 for a 16 kHz sample rate).
FMAX = SAMPLE_RATE // 2.0
FREEZE_BASE = True

BATCH_SIZE = 64 # * 2 if mixup

# NOTE(review): not referenced in the visible cells, and it lists 'mixup'
# while MIXUP is False — confirm which of the two is authoritative.
AUGMENTATION = ['mixup']

# Name of the model class; it is resolved to the class object when the model is built.
MODEL_TYPE = "Transfer_Cnn14" #"Cnn14_DecisionLevelAtt"  #"Cnn14"
# Checkpoint handed to model.load_from_pretrain().
PRETRAINED_CHECKPOINT_PATH = "../input/pretrained-pann/Cnn14_16k_mAP0.438.pth"
# PRETRAINED_CHECKPOINT_PATH = "../input/resnet38-0-434/ResNet38_0_434.pth"
# PRETRAINED_CHECKPOINT_PATH = "../input/wavegram-logmel-cnn14-0-439/Wavegram_Logmel_Cnn14_0_439.pth"


from torch import cuda
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = 'cuda' if cuda.is_available() else 'cpu'

# Utils

In [None]:
def traverse_folder(fd):
    """Recursively list every file below ``fd``.

    Args:
      fd: str, directory to walk

    Returns:
      (names, paths): two parallel lists, the bare file names and the
      matching paths (containing directory joined with the name), in the
      order os.walk produces them.
    """
    names, paths = [], []

    for folder, _subdirs, filenames in os.walk(fd):
        names.extend(filenames)
        paths.extend(os.path.join(folder, filename) for filename in filenames)

    return names, paths

In [None]:
def create_folder(fd):
    """Create directory ``fd`` (including missing parents) if it is absent.

    Calling it for a directory that already exists is a no-op.

    Args:
      fd: str, directory path to create
    """
    # exist_ok=True replaces a separate os.path.exists() check, which could
    # race with another process creating the directory in between.
    os.makedirs(fd, exist_ok=True)

In [None]:
def pad_truncate_sequence(x, max_len):
    """Force a 1-D sequence to exactly ``max_len`` elements.

    Sequences shorter than ``max_len`` get zeros appended at the end;
    anything else is cut down to its first ``max_len`` elements.
    """
    missing = max_len - len(x)
    if missing > 0:
        return np.concatenate((x, np.zeros(missing)))
    return x[:max_len]

In [None]:
def float32_to_int16(x):
    """Convert a float waveform to int16 samples.

    Values are scaled by 32767. If the largest magnitude in ``x`` exceeds 1,
    the whole signal is first divided by that peak so it fits the int16
    range. The input array is never modified.

    Args:
      x: array-like of floats

    Returns:
      np.ndarray of dtype int16 with the same shape as ``x``
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to scale; np.max would raise on an empty array.
        return x.astype(np.int16)

    peak = np.max(np.abs(x))
    if peak > 1.:
        # Divide out of place so the caller's array is left untouched.
        x = x / peak
    return (x * 32767.).astype(np.int16)

In [None]:
def int16_to_float32(x):
    """Scale int16 samples back to float32 by dividing by 32767."""
    scaled = x / 32767.
    return scaled.astype(np.float32)

In [None]:
def to_one_hot(k, classes_num):
    """Return a float vector of length ``classes_num`` that is 1 at ``k``.

    All other positions are 0.
    """
    one_hot = np.zeros(classes_num)
    one_hot[k] = 1
    return one_hot

In [None]:
def move_data_to_device(x, device):
    """Turn an array into a tensor on ``device``, chosen by its dtype name.

    Arrays whose dtype name contains 'float' become ``torch.Tensor``, those
    containing 'int' become ``torch.LongTensor``. Anything else (for example
    string arrays) is returned unchanged and is not moved.
    """
    dtype_name = str(x.dtype)

    if 'float' in dtype_name:
        tensor_cls = torch.Tensor
    elif 'int' in dtype_name:
        tensor_cls = torch.LongTensor
    else:
        return x

    return tensor_cls(x).to(device)

In [None]:
def collate_fn(list_data_dict):
    """Stack a list of per-sample dicts into one dict of arrays.

    Args:
      list_data_dict: list of dicts sharing the same keys, e.g.
        [{'audio_name': str, 'waveform': (clip_samples,), ...}, ...]

    Returns:
      dict with the keys of the first sample, each mapped to an np.array
      holding that field for every sample, e.g.
        {'audio_name': (batch_size,), 'waveform': (batch_size, clip_samples), ...}
    """
    first_sample = list_data_dict[0]
    return {
        key: np.array([sample[key] for sample in list_data_dict])
        for key in first_sample
    }

In [None]:
def pack_audio_files_to_hdf5(df, workspace, train = True):
    """Load every clip listed in ``df`` and pack the waveforms into one HDF5 file.

    Each file is loaded with librosa at SAMPLE_RATE (mono), padded/truncated
    to CLIP_SAMPLES samples and stored as int16.

    Args:
      df: DataFrame with the columns ``audio_id`` and ``fn`` (audio file
        path); when ``train`` is true it must also provide ``target`` and
        ``fold``
      workspace: str, directory under which ``features/waveform_train.h5``
        (train) or ``features/waveform_test.h5`` (otherwise) is written; the
        file is opened in 'w' mode
      train: bool, whether the ``target`` and ``fold`` datasets are stored too
    """
    file_name = 'waveform_train.h5' if train else 'waveform_test.h5'
    packed_hdf5_path = os.path.join(workspace, 'features', file_name)

    # Create folder
    create_folder(os.path.dirname(packed_hdf5_path))

    audio_names = df.audio_id.values
    audio_paths = df.fn.values
    audios_num = len(audio_names)

    if train:
        targets = df.target.values
        folds = df.fold.values

    feature_time = time.time()
    with h5py.File(packed_hdf5_path, 'w') as hf:
        # Fixed-width 80-byte strings.
        # NOTE(review): longer names would not fit — confirm ids stay below that.
        hf.create_dataset(
            name='audio_name',
            shape=(audios_num,),
            dtype='S80')

        hf.create_dataset(
            name='waveform',
            shape=(audios_num, CLIP_SAMPLES),
            dtype=np.int16)

        if train:
            hf.create_dataset(
                name='target',
                shape=(audios_num,),
                dtype=np.int32)

            hf.create_dataset(
                name='fold',
                shape=(audios_num,),
                dtype=np.int16)

        for n, (audio_name, audio_path) in enumerate(zip(audio_names, audio_paths)):
            print(n)

            # The returned sample rate is not needed: it equals SAMPLE_RATE.
            (audio, _) = librosa.core.load(audio_path, sr = SAMPLE_RATE, mono=True, res_type = "kaiser_fast")
            audio = pad_truncate_sequence(audio, CLIP_SAMPLES)

            hf['audio_name'][n] = audio_name.encode()
            hf['waveform'][n] = float32_to_int16(audio)

            if train:
                hf['target'][n] = targets[n]
                hf['fold'][n] = folds[n]

    print('Write hdf5 to {}'.format(packed_hdf5_path))
    print('Time: {:.3f} s'.format(time.time() - feature_time))

In [None]:
pack_audio_files_to_hdf5(df_train_new, "./", train = True)

In [None]:
pack_audio_files_to_hdf5(df_test, "./", train = False)

In [None]:
class AudioDataset(object):
    def __init__(self, train = True):
        """Map-style dataset that is indexed with a meta dict, not an int.

        Each lookup reads one clip from a packed HDF5 file. This class is
        used by DataLoader together with the batch samplers, which produce
        the meta dicts.

        Args:
          train: bool, when True the 'target' dataset is read as well
        """
        self.train = train

    def __getitem__(self, meta):
        """Load the waveform (and, in train mode, the target) of one clip.

        Args:
          meta: {
            'hdf5_path': str,
            'index_in_hdf5': int}
        Returns:
          data_dict: {
            'audio_name': str,
            'waveform': (clip_samples,) float32,
            'target': int32 label, only present when self.train is True}
        """
        hdf5_path = meta['hdf5_path']
        row = meta['index_in_hdf5']

        with h5py.File(hdf5_path, 'r') as hf:
            data_dict = {
                'audio_name': hf['audio_name'][row].decode(),
                'waveform': int16_to_float32(hf['waveform'][row]),
            }
            if self.train:
                data_dict['target'] = hf['target'][row].astype(np.int32)

        return data_dict


In [None]:
class TrainSampler1(object):
    def __init__(self, hdf5_path, holdout_fold, batch_size, random_seed = 1234):
        """Endless, reshuffling batch sampler over the training rows.

        Args:
          hdf5_path: string, packed HDF5 file; its 'fold' dataset is read here
          holdout_fold: int-like, rows whose fold equals this value are excluded
          batch_size: int, number of entries in every yielded batch
          random_seed: int, seed of the private RandomState used for shuffling
        """
        self.hdf5_path = hdf5_path
        self.batch_size = batch_size
        self.random_state = np.random.RandomState(random_seed)

        with h5py.File(hdf5_path, 'r') as hf:
            self.folds = hf['fold'][:].astype(np.float32)

        # Rows of the HDF5 file that do not belong to the holdout fold.
        self.indexes = np.where(self.folds != int(holdout_fold))[0]
        self.audios_num = len(self.indexes)

        # Shuffle indexes
        self.random_state.shuffle(self.indexes)

        self.pointer = 0

    def __len__(self):
        """Number of batches needed to cover every training row once.

        Note that ``__iter__`` itself never stops.
        """
        return int(np.ceil(len(self.indexes) / self.batch_size))

    def __iter__(self):
        """Generate batch meta for training, forever.

        The rows are walked in shuffled order; once all of them have been
        handed out they are reshuffled and the walk restarts, so a batch may
        span two passes.

        Returns:
          batch_meta: e.g.: [
            {'hdf5_path': 'xx/waveform_train.h5',
             'index_in_hdf5': 15734},
            ...]
        """
        batch_size = self.batch_size

        while True:
            batch_meta = []
            for _ in range(batch_size):
                # Read the entry BEFORE advancing/reshuffling: emitting
                # self.indexes[self.pointer] afterwards would skip the current
                # row (and, after a reshuffle, pick from the new permutation).
                index = self.indexes[self.pointer]
                self.pointer += 1

                # Shuffle indexes and reset pointer
                if self.pointer >= self.audios_num:
                    self.pointer = 0
                    self.random_state.shuffle(self.indexes)

                batch_meta.append({
                    'hdf5_path': self.hdf5_path,
                    'index_in_hdf5': index
                })

            yield batch_meta

    def state_dict(self):
        """Return the current permutation and position for checkpointing."""
        state = {
            # Copy: self.indexes is shuffled in place, which would otherwise
            # silently change a state captured earlier.
            'indexes': self.indexes.copy(),
            'pointer': self.pointer
        }
        return state

    def load_state_dict(self, state):
        """Restore permutation and position from ``state_dict()`` output."""
        self.indexes = state['indexes']
        self.pointer = state['pointer']


class EvaluateSampler(object):
    def __init__(self, hdf5_path, holdout_fold, batch_size, random_seed=1234):
        """Sequential batch sampler over the rows of the holdout fold.

        Args:
          hdf5_path: string, packed HDF5 file; its 'fold' dataset is read here
          holdout_fold: int-like, only rows whose fold equals this value are kept
          batch_size: int, maximum number of entries per batch
          random_seed: int, accepted but not used by this sampler
        """
        self.hdf5_path = hdf5_path
        self.batch_size = batch_size

        with h5py.File(hdf5_path, 'r') as hf:
            self.folds = hf['fold'][:].astype(np.float32)

        self.indexes = np.where(self.folds == int(holdout_fold))[0]
        self.audios_num = len(self.indexes)

    def __len__(self):
        """Number of batches per pass (the last one may be smaller)."""
        return int(np.ceil(len(self.indexes) / self.batch_size))

    def __iter__(self):
        """Yield the batch meta of one in-order pass over the holdout rows.

        Returns:
          batch_meta: e.g.: [
            {'hdf5_path': 'xx/waveform_train.h5',
             'index_in_hdf5': 15734},
            ...]
        """
        batch_size = self.batch_size
        start = 0

        while start < self.audios_num:
            stop = min(start + batch_size, self.audios_num)
            yield [
                {'hdf5_path': self.hdf5_path, 'index_in_hdf5': self.indexes[i]}
                for i in range(start, stop)
            ]
            start += batch_size

In [None]:
class TestSampler(object):
    def __init__(self, hdf5_path, batch_size, random_seed=1234):
        """Sequential batch sampler over every row of a packed HDF5 file.

        Args:
          hdf5_path: string, packed HDF5 file; its 'audio_name' dataset is
            read here to learn how many rows there are
          batch_size: int, maximum number of entries per batch
          random_seed: int, accepted but not used by this sampler
        """
        self.hdf5_path = hdf5_path
        self.batch_size = batch_size

        with h5py.File(hdf5_path, 'r') as hf:
            self.audio_name = hf['audio_name'][:]

        self.indexes = list(range(len(self.audio_name)))
        self.audios_num = len(self.indexes)

    def __len__(self):
        """Number of batches per pass (the last one may be smaller)."""
        return int(np.ceil(len(self.indexes) / self.batch_size))

    def __iter__(self):
        """Yield the batch meta of one in-order pass over all rows.

        Returns:
          batch_meta: e.g.: [
            {'hdf5_path': 'xx/waveform_test.h5',
             'index_in_hdf5': 15734},
            ...]
        """
        batch_size = self.batch_size
        start = 0

        while start < self.audios_num:
            stop = min(start + batch_size, self.audios_num)
            yield [
                {'hdf5_path': self.hdf5_path, 'index_in_hdf5': self.indexes[i]}
                for i in range(start, stop)
            ]
            start += batch_size

In [None]:
class TrainSampler2(object):
    def __init__(self, hdf5_path, holdout_fold, batch_size, random_seed=1234):
        """Sequential (unshuffled, single-pass) batch sampler over the training rows.

        Args:
          hdf5_path: string, packed HDF5 file; its 'fold' dataset is read here
          holdout_fold: int-like, rows whose fold equals this value are excluded
          batch_size: int, maximum number of entries per batch
          random_seed: int, accepted but not used by this sampler
        """
        self.hdf5_path = hdf5_path
        self.batch_size = batch_size

        with h5py.File(hdf5_path, 'r') as hf:
            self.folds = hf['fold'][:].astype(np.float32)

        self.indexes = np.where(self.folds != int(holdout_fold))[0]
        self.audios_num = len(self.indexes)

    def __len__(self):
        """Number of batches per pass (the last one may be smaller)."""
        return int(np.ceil(len(self.indexes) / self.batch_size))

    def __iter__(self):
        """Yield the batch meta of one in-order pass over the training rows.

        Returns:
          batch_meta: e.g.: [
            {'hdf5_path': 'xx/waveform_train.h5',
             'index_in_hdf5': 15734},
            ...]
        """
        batch_size = self.batch_size
        start = 0

        while start < self.audios_num:
            stop = min(start + batch_size, self.audios_num)
            yield [
                {'hdf5_path': self.hdf5_path, 'index_in_hdf5': self.indexes[i]}
                for i in range(start, stop)
            ]
            start += batch_size

In [None]:
dataset = AudioDataset()

In [None]:
# Build the samplers on the packed training file, holding out fold 1.
# train_sampler1 yields shuffled batches endlessly; train_sampler2 makes a
# single in-order pass over the same rows. The training batch size is doubled
# when MIXUP is enabled.
train_sampler1 = TrainSampler1(
    hdf5_path="./features/waveform_train.h5", 
    holdout_fold = 1, 
    batch_size = BATCH_SIZE * 2 if MIXUP else BATCH_SIZE)

train_sampler2 = TrainSampler2(
    hdf5_path="./features/waveform_train.h5", 
    holdout_fold = 1, 
    batch_size = BATCH_SIZE * 2 if MIXUP else BATCH_SIZE)

# Rows of fold 1 only, in order.
valid_sampler = EvaluateSampler(
    hdf5_path = "./features/waveform_train.h5", 
    holdout_fold = 1, 
    batch_size = BATCH_SIZE)

# Data loader
# The samplers are passed as batch_sampler, so each of their batches (a list
# of meta dicts) is looked up entry by entry in `dataset` and merged by
# collate_fn.
train_loader1 = DataLoader(dataset=dataset, 
    batch_sampler=train_sampler1, 
                          collate_fn=collate_fn, 
    num_workers=2, pin_memory=True)

train_loader2 = DataLoader(dataset=dataset, 
    batch_sampler=train_sampler2, 
                          collate_fn=collate_fn, 
    num_workers=2, pin_memory=True)

validate_loader = DataLoader(dataset=dataset, 
    batch_sampler=valid_sampler, collate_fn=collate_fn, 
    num_workers=2, pin_memory=True)

In [None]:
# # Data loader
# train_loader = DataLoader(dataset=dataset, 
#     num_workers=2, pin_memory=True)

# validate_loader = DataLoader(dataset=dataset, 
#     num_workers=2, pin_memory=True)

In [None]:
# Sanity check of train_loader2: walk up to 68 batches and record every audio
# name (idds), every target (ts) and the per-batch target arrays (btchs) so
# the following cells can look at counts and duplicates.
ts = []
idds = []
btchs = []
ii = 0  # number of batches fully processed so far
for btch in train_loader2:
    for i, t in zip(btch["audio_name"], btch["target"]):
        ts.append(t)
        idds.append(i)
#     print(btch["target"])
    btchs.append(btch["target"])
    # Stop after the 68th batch (ii counts from 0).
    if ii == 67:
        break
    ii += 1
    
# sample_data_iter = iter(validate_loader)
# sample_output = sample_data_iter.next()    

In [None]:
len(ts)

In [None]:
len(btchs)

In [None]:
pd.Series(idds).duplicated().value_counts()

In [None]:
pd.DataFrame(idds)[pd.Series(idds).duplicated()]

In [None]:
for b in btchs:
    print(len(b))

In [None]:
ts

In [None]:
sample_data_iter = iter(train_loader2)
# Fetch the first batch with the built-in next(): the Python 3 iterator
# protocol only defines __next__, and the `.next()` method is a Python 2-era
# alias that DataLoader iterators are not guaranteed to provide.
sample_output = next(sample_data_iter)

In [None]:
# Print the first batch only.
# NOTE(review): `train_loader` is not assigned in any visible cell (only
# train_loader1 / train_loader2 are, and the cell that would define it is
# commented out) — confirm which loader is meant before running this.
for dt in train_loader:
    print(dt)
    break

In [None]:
# Print the first batch only (batch counter starting at 1).
# NOTE(review): `train_loader` is not assigned in any visible cell (only
# train_loader1 / train_loader2 are, and the cell that would define it is
# commented out) — confirm which loader is meant before running this.
for i, dt in enumerate(train_loader, 1):
    print(dt)
    break

In [None]:
sample_output["waveform"][0]

In [None]:
    # Resolve the model class by name with a plain namespace lookup instead of
    # eval(): a lookup can only return an existing object and never executes
    # the contents of MODEL_TYPE as code.
    Model = globals()[MODEL_TYPE]
    model = Model(SAMPLE_RATE, WINDOW_SIZE, HOP_SIZE, MEL_BINS, FMIN, FMAX, NUM_CLASSES, FREEZE_BASE)
    # Load the pretrained weights, then move the model to the selected device.
    model.load_from_pretrain(PRETRAINED_CHECKPOINT_PATH)
    model.to(DEVICE)

In [None]:
# Run the sample batch through the model in training mode.
model.train()

# Convert every field of the batch for DEVICE; fields whose dtype is neither
# float nor int are handed back unchanged by move_data_to_device.
for key in sample_output.keys():
    sample_output[key] = move_data_to_device(sample_output[key], DEVICE) 

s_out = model(sample_output["waveform"])
s_target = sample_output["target"]

In [None]:
s_out["clipwise_output"]

In [None]:
nn.CrossEntropyLoss().to(DEVICE)(s_out["clipwise_output"], s_target)

In [None]:
# s_out['clipwise_output'].T[0]

In [None]:
sample_output["target"] * s_out['clipwise_output'].T[0]

In [None]:
s_out

In [None]:
sample_output['target'].shape

In [None]:
loss_func(s_out, {'target': sample_output['target']})

In [None]:
# Print the running batch index for every batch the loader produces.
# NOTE(review): `train_loader` is not assigned in any visible cell (only
# train_loader1 / train_loader2 are). With the `break` commented out this
# loop only ends if the loader is finite — train_loader1's sampler never
# stops. Confirm which loader is meant before running this.
for i, j in enumerate(train_loader, 0):
    print(i)
#     break

# Mixup Augmentation

In [None]:
class Mixup(object):
    def __init__(self, mixup_alpha, random_seed=1234):
        """Mixup coefficient generator.

        Args:
          mixup_alpha: float, both shape parameters of the Beta distribution
          random_seed: int, seed of the private RandomState
        """
        self.mixup_alpha = mixup_alpha
        self.random_state = np.random.RandomState(random_seed)

    def get_lambda(self, batch_size):
        """Get mixup random coefficients.

        One value ``lam`` is drawn from Beta(alpha, alpha) per pair of
        samples; the pair receives ``lam`` and ``1 - lam``.

        Args:
          batch_size: int
        Returns:
          mixup_lambdas: (batch_size,) for even batch_size; an odd
            batch_size still produces whole pairs, i.e. batch_size + 1 values
        """
        alpha = self.mixup_alpha
        draws = [
            self.random_state.beta(alpha, alpha, 1)[0]
            for _ in range(0, batch_size, 2)
        ]
        return np.array([coef for lam in draws for coef in (lam, 1. - lam)])


def do_mixup(x, mixup_lambda):
    """Blend every even-indexed entry of x with the odd-indexed one after it.

    Entry 2i is weighted with mixup_lambda[2i], entry 2i + 1 with
    mixup_lambda[2i + 1], and the two are summed.

    Args:
      x: (batch_size * 2, ...)
      mixup_lambda: (batch_size * 2,)
    Returns:
      out: (batch_size, ...)
    """
    # Swap the batch axis to the end so the per-sample weights broadcast
    # along it, then swap back.
    even_part = x[0 :: 2].transpose(0, -1) * mixup_lambda[0 :: 2]
    odd_part = x[1 :: 2].transpose(0, -1) * mixup_lambda[1 :: 2]
    mixed = even_part + odd_part
    return mixed.transpose(0, -1)

# Architecture

In [None]:
!pip install torchlibrosa

In [None]:
from torchlibrosa.stft import Spectrogram, LogmelFilterBank
from torchlibrosa.augmentation import SpecAugmentation

In [None]:
def init_layer(layer):
    """Initialize a Linear or Convolutional layer.

    The weight gets Xavier-uniform values; a bias, if the layer has one, is
    set to zero.
    """
    nn.init.xavier_uniform_(layer.weight)

    bias = getattr(layer, 'bias', None)
    if bias is not None:
        bias.data.fill_(0.)

In [None]:
def init_bn(bn):
    """Initialize a Batchnorm layer: weight (scale) 1, bias (shift) 0."""
    bn.weight.data.fill_(1.)
    bn.bias.data.fill_(0.)

In [None]:
class ConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels):
        """Two 3x3 conv + batch-norm stages, each followed by ReLU, then pooling.

        Args:
          in_channels: int, channels of the input feature map
          out_channels: int, channels produced by both convolutions
        """
        super(ConvBlock, self).__init__()

        # Both convolutions share the same geometry: 3x3 kernel, stride 1,
        # padding 1, no bias.
        conv_kwargs = dict(kernel_size=(3, 3), stride=(1, 1),
                           padding=(1, 1), bias=False)

        self.conv1 = nn.Conv2d(in_channels=in_channels,
                               out_channels=out_channels, **conv_kwargs)
        self.conv2 = nn.Conv2d(in_channels=out_channels,
                               out_channels=out_channels, **conv_kwargs)

        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

        self.init_weight()

    def init_weight(self):
        """Re-initialise both convolutions and both batch-norm layers."""
        for conv in (self.conv1, self.conv2):
            init_layer(conv)
        for bn in (self.bn1, self.bn2):
            init_bn(bn)

    def forward(self, input, pool_size=(2, 2), pool_type='avg'):
        """Apply conv-bn-relu twice, then pool.

        Args:
          input: 4-D feature map fed to the first Conv2d
          pool_size: kernel size of the pooling step
          pool_type: 'max', 'avg', or 'avg+max' (sum of both poolings)
        Raises:
          Exception: if pool_type is none of the three values above
        """
        x = F.relu_(self.bn1(self.conv1(input)))
        x = F.relu_(self.bn2(self.conv2(x)))

        if pool_type == 'max':
            return F.max_pool2d(x, kernel_size=pool_size)
        if pool_type == 'avg':
            return F.avg_pool2d(x, kernel_size=pool_size)
        if pool_type == 'avg+max':
            avg_pooled = F.avg_pool2d(x, kernel_size=pool_size)
            max_pooled = F.max_pool2d(x, kernel_size=pool_size)
            return avg_pooled + max_pooled

        raise Exception('Incorrect argument!')

In [None]:
class Cnn14(nn.Module):
    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, 
        fmax, classes_num):
        """Six-block CNN on log-mel spectrograms computed from raw waveforms.

        Args:
          sample_rate, window_size, hop_size, mel_bins, fmin, fmax: settings
            of the spectrogram / log-mel front end
          classes_num: int, size of the output layer
        """
        super(Cnn14, self).__init__()

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(
            n_fft=window_size, hop_length=hop_size, win_length=window_size,
            window='hann', center=True, pad_mode='reflect',
            freeze_parameters=True)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(
            sr=sample_rate, n_fft=window_size, n_mels=mel_bins,
            fmin=fmin, fmax=fmax, ref=1.0, amin=1e-10, top_db=None,
            freeze_parameters=True)

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=32, time_stripes_num=2,
            freq_drop_width=8, freq_stripes_num=2)

        self.bn0 = nn.BatchNorm2d(64)

        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
        self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
        self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)

        self.fc1 = nn.Linear(2048, 2048, bias=True)
        self.fc_audioset = nn.Linear(2048, classes_num, bias=True)

        self.init_weight()

    def init_weight(self):
        """Initialise the input batch-norm and the two fully connected layers."""
        init_bn(self.bn0)
        for fc in (self.fc1, self.fc_audioset):
            init_layer(fc)

    def forward(self, input, mixup_lambda=None):
        """
        Input: (batch_size, data_length)

        Returns a dict with 'clipwise_output' (sigmoid of the output layer)
        and 'embedding' (the fc1 activations after dropout)."""

        x = self.spectrogram_extractor(input)   # (batch_size, 1, time_steps, freq_bins)
        x = self.logmel_extractor(x)    # (batch_size, 1, time_steps, mel_bins)

        # bn0 normalises over the last axis: swap it into the channel
        # position, normalise, swap back.
        x = self.bn0(x.transpose(1, 3)).transpose(1, 3)

        if self.training:
            x = self.spec_augmenter(x)

            # Mixup on spectrogram
            if mixup_lambda is not None:
                x = do_mixup(x, mixup_lambda)

        # Blocks 1-5 halve both axes (2x2 average pooling); block 6 does not pool.
        pooled_blocks = (self.conv_block1, self.conv_block2, self.conv_block3,
                         self.conv_block4, self.conv_block5)
        for block in pooled_blocks:
            x = block(x, pool_size=(2, 2), pool_type='avg')
            x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block6(x, pool_size=(1, 1), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)

        # Average over the last axis, then combine max and mean over the
        # remaining one.
        x = torch.mean(x, dim=3)
        x = torch.max(x, dim=2)[0] + torch.mean(x, dim=2)

        x = F.dropout(x, p=0.5, training=self.training)
        x = F.relu_(self.fc1(x))
        embedding = F.dropout(x, p=0.5, training=self.training)
        # The output layer reads the fc1 activations before that last dropout.
        clipwise_output = torch.sigmoid(self.fc_audioset(x))

        return {'clipwise_output': clipwise_output, 'embedding': embedding}

In [None]:
class ConvPreWavBlock(nn.Module):
    def __init__(self, in_channels, out_channels):
        """Two 1-D conv + batch-norm stages with ReLU, followed by max pooling.

        The first convolution is a plain kernel-3 convolution, the second one
        uses dilation 2 (padding 2). Neither has a bias.

        Args:
          in_channels: int, channels of the input sequence
          out_channels: int, channels produced by both convolutions
        """
        super(ConvPreWavBlock, self).__init__()

        self.conv1 = nn.Conv1d(in_channels=in_channels,
                               out_channels=out_channels,
                               kernel_size=3, stride=1,
                               padding=1, bias=False)

        self.conv2 = nn.Conv1d(in_channels=out_channels,
                               out_channels=out_channels,
                               kernel_size=3, stride=1, dilation=2,
                               padding=2, bias=False)

        self.bn1 = nn.BatchNorm1d(out_channels)
        self.bn2 = nn.BatchNorm1d(out_channels)

        self.init_weight()

    def init_weight(self):
        """Re-initialise both convolutions and both batch-norm layers."""
        for conv in (self.conv1, self.conv2):
            init_layer(conv)
        for bn in (self.bn1, self.bn2):
            init_bn(bn)

    def forward(self, input, pool_size):
        """Apply conv-bn-relu twice, then max-pool with kernel ``pool_size``."""
        hidden = F.relu_(self.bn1(self.conv1(input)))
        hidden = F.relu_(self.bn2(self.conv2(hidden)))
        return F.max_pool1d(hidden, kernel_size=pool_size)

In [None]:
class Wavegram_Logmel_Cnn14(nn.Module):
    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, 
        fmax, classes_num):
        """CNN that combines a learned "wavegram" branch with a log-mel branch.

        The wavegram branch works on the raw waveform with 1-D convolutions;
        its output is concatenated with the first conv block of the log-mel
        branch along the channel axis.

        Args:
          sample_rate, window_size, hop_size, mel_bins, fmin, fmax: settings
            of the spectrogram / log-mel front end
          classes_num: int, size of the output layer
        """
        super(Wavegram_Logmel_Cnn14, self).__init__()

        # Wavegram branch
        self.pre_conv0 = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=11, stride=5, padding=5, bias=False)
        self.pre_bn0 = nn.BatchNorm1d(64)
        self.pre_block1 = ConvPreWavBlock(64, 64)
        self.pre_block2 = ConvPreWavBlock(64, 128)
        self.pre_block3 = ConvPreWavBlock(128, 128)
        self.pre_block4 = ConvBlock(in_channels=4, out_channels=64)

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(
            n_fft=window_size, hop_length=hop_size, win_length=window_size,
            window='hann', center=True, pad_mode='reflect',
            freeze_parameters=True)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(
            sr=sample_rate, n_fft=window_size, n_mels=mel_bins,
            fmin=fmin, fmax=fmax, ref=1.0, amin=1e-10, top_db=None,
            freeze_parameters=True)

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64, time_stripes_num=2,
            freq_drop_width=8, freq_stripes_num=2)

        self.bn0 = nn.BatchNorm2d(64)

        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
        # 128 input channels: 64 from conv_block1 plus 64 from the wavegram branch.
        self.conv_block2 = ConvBlock(in_channels=128, out_channels=128)
        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
        self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
        self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)

        self.fc1 = nn.Linear(2048, 2048, bias=True)
        self.fc_audioset = nn.Linear(2048, classes_num, bias=True)

        self.init_weight()

    def init_weight(self):
        """Initialise the layers that are not set up by their own block."""
        init_layer(self.pre_conv0)
        init_bn(self.pre_bn0)
        init_bn(self.bn0)
        for fc in (self.fc1, self.fc_audioset):
            init_layer(fc)

    def forward(self, input, mixup_lambda=None):
        """
        Input: (batch_size, data_length)

        Returns a dict with 'clipwise_output' (sigmoid of the output layer)
        and 'embedding' (the fc1 activations after dropout)."""

        # Wavegram
        a1 = self.pre_conv0(input[:, None, :])
        a1 = F.relu_(self.pre_bn0(a1))
        for pre_block in (self.pre_block1, self.pre_block2, self.pre_block3):
            a1 = pre_block(a1, pool_size=4)
        # Split the channel axis into groups of 32 and move the sequence axis
        # in front of that group axis.
        batch, frames = a1.shape[0], a1.shape[-1]
        a1 = a1.reshape((batch, -1, 32, frames)).transpose(2, 3)
        a1 = self.pre_block4(a1, pool_size=(2, 1))

        # Log mel spectrogram
        x = self.spectrogram_extractor(input)   # (batch_size, 1, time_steps, freq_bins)
        x = self.logmel_extractor(x)    # (batch_size, 1, time_steps, mel_bins)

        # bn0 normalises over the last axis: swap it into the channel
        # position, normalise, swap back.
        x = self.bn0(x.transpose(1, 3)).transpose(1, 3)

        if self.training:
            x = self.spec_augmenter(x)

            # Mixup on spectrogram (and on the wavegram features)
            if mixup_lambda is not None:
                x = do_mixup(x, mixup_lambda)
                a1 = do_mixup(a1, mixup_lambda)

        x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')

        # Concatenate Wavegram and Log mel spectrogram along the channel dimension
        x = torch.cat((x, a1), dim=1)
        x = F.dropout(x, p=0.2, training=self.training)

        # Blocks 2-5 use 2x2 average pooling; block 6 does not pool.
        pooled_blocks = (self.conv_block2, self.conv_block3,
                         self.conv_block4, self.conv_block5)
        for block in pooled_blocks:
            x = block(x, pool_size=(2, 2), pool_type='avg')
            x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block6(x, pool_size=(1, 1), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)

        # Average over the last axis, then combine max and mean over the
        # remaining one.
        x = torch.mean(x, dim=3)
        x = torch.max(x, dim=2)[0] + torch.mean(x, dim=2)

        x = F.dropout(x, p=0.5, training=self.training)
        x = F.relu_(self.fc1(x))
        embedding = F.dropout(x, p=0.5, training=self.training)
        # The output layer reads the fc1 activations before that last dropout.
        clipwise_output = torch.sigmoid(self.fc_audioset(x))

        return {'clipwise_output': clipwise_output, 'embedding': embedding}

In [None]:
class Cnn14_16k(nn.Module):
    """CNN14 audio tagger restricted to one fixed 16 kHz front-end setup.

    The network turns a raw waveform into a log-mel spectrogram, runs it
    through six ``ConvBlock`` stages, pools over frequency and time, and
    produces sigmoid class scores together with a 2048-d embedding.
    """

    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num):
        super().__init__()

        # Only this exact front-end configuration is accepted.
        assert sample_rate == 16000
        assert window_size == 512
        assert hop_size == 160
        assert mel_bins == 64
        assert fmin == 50
        assert fmax == 8000

        # Waveform -> spectrogram (parameters frozen, i.e. not trained).
        self.spectrogram_extractor = Spectrogram(
            n_fft=window_size, hop_length=hop_size, win_length=window_size,
            window='hann', center=True, pad_mode='reflect',
            freeze_parameters=True)

        # Spectrogram -> log-mel filter bank (parameters frozen).
        self.logmel_extractor = LogmelFilterBank(
            sr=sample_rate, n_fft=window_size, n_mels=mel_bins,
            fmin=fmin, fmax=fmax, ref=1.0, amin=1e-10, top_db=None,
            freeze_parameters=True)

        # Time/frequency masking, applied in training mode only (see forward).
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64, time_stripes_num=2,
            freq_drop_width=8, freq_stripes_num=2)

        self.bn0 = nn.BatchNorm2d(64)

        # conv_block1 .. conv_block6, each widening the channel dimension.
        channel_plan = (1, 64, 128, 256, 512, 1024, 2048)
        for index, (n_in, n_out) in enumerate(zip(channel_plan, channel_plan[1:]), start=1):
            setattr(self, f'conv_block{index}', ConvBlock(in_channels=n_in, out_channels=n_out))

        self.fc1 = nn.Linear(2048, 2048, bias=True)
        self.fc_audioset = nn.Linear(2048, classes_num, bias=True)

        self.init_weight()

    def init_weight(self):
        """Initialize the input batch-norm and the two fully connected layers."""
        init_bn(self.bn0)
        for linear in (self.fc1, self.fc_audioset):
            init_layer(linear)

    def forward(self, input, mixup_lambda=None):
        """Compute clip-level scores and the embedding for a batch of waveforms.

        Args:
          input: (batch_size, data_length)
          mixup_lambda: mixing coefficients handed to ``do_mixup``; only used
            in training mode, ignored when None.

        Returns:
          dict with 'clipwise_output' (sigmoid of ``fc_audioset``) and
          'embedding' (dropout applied to the ``fc1`` activations).
        """
        spectrogram = self.spectrogram_extractor(input)   # (batch_size, 1, time_steps, freq_bins)
        x = self.logmel_extractor(spectrogram)    # (batch_size, 1, time_steps, mel_bins)

        # Swap dims 1 and 3 so bn0 treats the mel bins as its channel axis,
        # then swap back.
        x = self.bn0(x.transpose(1, 3)).transpose(1, 3)

        if self.training:
            x = self.spec_augmenter(x)
            # Mixup on spectrogram
            if mixup_lambda is not None:
                x = do_mixup(x, mixup_lambda)

        # The first five blocks pool by (2, 2); each is followed by dropout.
        for index in range(1, 6):
            conv_block = getattr(self, f'conv_block{index}')
            x = conv_block(x, pool_size=(2, 2), pool_type='avg')
            x = F.dropout(x, p=0.2, training=self.training)

        # The last block keeps its resolution (pool size 1x1).
        x = self.conv_block6(x, pool_size=(1, 1), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)

        # Average over the last axis, then combine max and mean over axis 2.
        x = torch.mean(x, dim=3)
        pooled = torch.max(x, dim=2)[0] + torch.mean(x, dim=2)

        pooled = F.dropout(pooled, p=0.5, training=self.training)
        hidden = F.relu_(self.fc1(pooled))
        embedding = F.dropout(hidden, p=0.5, training=self.training)
        clipwise_output = torch.sigmoid(self.fc_audioset(hidden))

        return {'clipwise_output': clipwise_output, 'embedding': embedding}

## Custom model head for task

In [None]:
class Transfer_Cnn14(nn.Module):
    """Task-specific classifier head on top of a ``Cnn14_16k`` backbone."""

    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, 
        fmax, classes_num, freeze_base):
        """Classifier for a new task using pretrained Cnn14_16k as a sub module.

        Args:
          sample_rate, window_size, hop_size, mel_bins, fmin, fmax: front-end
            settings forwarded unchanged to ``Cnn14_16k``.
          classes_num: number of output classes of the new task.
          freeze_base: when truthy, every backbone parameter gets
            ``requires_grad = False`` so only ``fc_transfer`` is trained.
        """
        super(Transfer_Cnn14, self).__init__()
        # The backbone keeps a 527-way output layer so that its parameter
        # shapes match the checkpoint loaded in load_from_pretrain.
        audioset_classes_num = 527
        
        self.base = Cnn14_16k(sample_rate, window_size, hop_size, mel_bins, fmin, 
            fmax, audioset_classes_num)

        # Transfer to another task layer
        self.fc_transfer = nn.Linear(2048, classes_num, bias=True)

        if freeze_base:
            # Freeze AudioSet pretrained layers
            for param in self.base.parameters():
                param.requires_grad = False

        self.init_weights()

    def init_weights(self):
        """Initialize the new task layer (the backbone initializes itself)."""
        init_layer(self.fc_transfer)

    def load_from_pretrain(self, pretrained_checkpoint_path):
        """Load backbone weights from a checkpoint file.

        Args:
          pretrained_checkpoint_path: path to a file readable by
            ``torch.load`` whose 'model' entry is a state dict for the
            backbone.
        """
        # map_location='cpu' lets a checkpoint that was saved from GPU tensors
        # load on a machine without CUDA; load_state_dict then copies the
        # values into the parameters on whatever device they live on.
        # NOTE: torch.load unpickles the file, so only load trusted checkpoints.
        checkpoint = torch.load(pretrained_checkpoint_path, map_location='cpu')
        self.base.load_state_dict(checkpoint['model'])

    def forward(self, input, mixup_lambda=None):
        """Input: (batch_size, data_length)

        Returns the backbone's output dict with 'clipwise_output' replaced by
        the log-softmax of the task layer applied to the backbone embedding.
        """
        output_dict = self.base(input, mixup_lambda)
        embedding = output_dict['embedding']

        clipwise_output = torch.log_softmax(self.fc_transfer(embedding), dim=-1)
        output_dict['clipwise_output'] = clipwise_output
 
        return output_dict

In [None]:
# Disabled: a module-level learning rate / optimizer. The optimizer is created
# inside train_evaluate instead.
# LEARNING_RATE = 1e-3
# OPTIMIZER = optim.AdamW(model.parameters(), lr = LEARNING_RATE, weight_decay = 0.1)
# Create the mixup augmenter only when 'mixup' is listed in AUGMENTATION;
# MIXUP_AUGMENTER is left undefined otherwise.
if 'mixup' in AUGMENTATION:
    MIXUP_AUGMENTER = Mixup(mixup_alpha = 1.)

In [None]:
def forward(model, generator, return_input=False, 
    return_target=False):
    """Run a model over every batch of a generator and collect the results.

    The model is switched to eval mode and is NOT switched back; callers
    that continue training must call ``model.train()`` themselves.

    Args: 
      model: object, called as ``model(batch_waveform)``; must return a dict
        containing 'clipwise_output'
      generator: iterable of batch dicts with 'waveform' and 'audio_name'
        entries (and optionally 'target')
      return_input: bool, also collect the batch waveforms
      return_target: bool, also collect the batch targets when a batch has
        a 'target' entry
    Returns:
      dict of arrays concatenated along axis 0 over all batches:
        audio_name: (audios_num,)
        clipwise_output: (audios_num, classes_num)
        (optional) waveform: (audios_num, segment_samples)
        (optional) target: (audios_num, classes_num)
      An empty dict is returned when the generator yields no batches.
    """
    collected = {}
    device = next(model.parameters()).device

    # Inference only: eval mode is loop-invariant, so set it once up front.
    model.eval()

    # Forward data to a model in mini-batches
    for batch_data_dict in generator:
        batch_waveform = move_data_to_device(batch_data_dict['waveform'], device)

        with torch.no_grad():
            batch_output = model(batch_waveform)

        collected.setdefault('audio_name', []).append(batch_data_dict['audio_name'])
        collected.setdefault('clipwise_output', []).append(
            batch_output['clipwise_output'].detach().cpu().numpy())

        if return_input:
            collected.setdefault('waveform', []).append(batch_data_dict['waveform'])

        if return_target and 'target' in batch_data_dict:
            collected.setdefault('target', []).append(batch_data_dict['target'])

    # Merge the per-batch pieces of every collected field into one array.
    return {key: np.concatenate(chunks, axis=0) for key, chunks in collected.items()}


In [None]:
def calculate_accuracy(y_true, y_score):
    """Return the fraction of rows whose argmax agrees in both arrays.

    Args:
      y_true: array whose last-axis argmax is the reference class
      y_score: array whose last-axis argmax is the predicted class
    Returns:
      number of matching rows divided by ``y_true.shape[0]``
    """
    n_samples = y_true.shape[0]
    reference_classes = np.argmax(y_true, axis=-1)
    predicted_classes = np.argmax(y_score, axis=-1)
    n_matches = np.sum(reference_classes == predicted_classes)
    return n_matches / n_samples

In [None]:
# Scratch cell for checking print formatting.
# NOTE(review): the first string has no f-prefix, so it prints the literal
# text "Iteration {iteration}" rather than a formatted value.
print('Iteration {iteration}', end = '\n')
print(5)

In [None]:
from transformers import get_linear_schedule_with_warmup

In [None]:
from sklearn import metrics

# Disabled module-level settings kept from an earlier version of the loop.
# iteration = 0
# stop_iteration = 1000
# Number of training iterations between two validation passes; read as a
# global by train_evaluate below.
validation_cycle = 100
# checkpoint_cycle = 50

# Train on mini batches
def train_evaluate(model, train_loader, valid_loader, loss_func, device, stop_iteration):
    """Train on mini-batches, validating every ``validation_cycle`` iterations.

    Training stops after ``stop_iteration`` batches or when ``train_loader``
    is exhausted, whichever comes first. Progress is printed; nothing is
    returned.

    Args:
      model: called as ``model(waveform, None)``; must return a dict with a
        'clipwise_output' entry
      train_loader: iterable of batch dicts with 'waveform' and 'target'
      valid_loader: passed to ``forward`` for the validation pass
      loss_func: callable ``loss_func(output, target)`` returning the loss
      device: device every batch entry is moved to
      stop_iteration: 1-based iteration count after which training stops
    """
    # AdamW with a fixed learning rate. Adam, SGD and two LR schedules
    # (linear warm-up, OneCycleLR) were tried earlier and are disabled.
    lr = 1e-3
    optimizer = optim.AdamW(model.parameters(), lr = lr, weight_decay = 0.1)

    # Iterations are counted from 1.
    for iteration, batch_data_dict in enumerate(train_loader, start=1):

        print(f'Iteration {iteration}', end = ', ')

        # Move data to device as tensor (the batch dict is updated in place)
        for key, value in batch_data_dict.items():
            batch_data_dict[key] = move_data_to_device(value, device)

        # Train (forward() below leaves the model in eval mode, so the mode is
        # re-enabled on every iteration)
        model.train()

        batch_output_dict = model(batch_data_dict['waveform'], None)
        batch_target = batch_data_dict['target']

        loss = loss_func(batch_output_dict["clipwise_output"], batch_target)
        print(f'loss: {loss}', end = '\n')

        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Evaluate
        if iteration % validation_cycle == 0:
            validation_outputs = forward(model, valid_loader, return_target=True)
            predictions = validation_outputs['clipwise_output']
            references = validation_outputs['target']

            # forward() returns NumPy arrays, so convert back to tensors for
            # the loss; targets are cast to long.
            val_loss = loss_func(torch.tensor(predictions), torch.tensor(references, dtype = torch.long))
            print(f'val_loss: {val_loss}')

        # Stop learning
        if iteration == stop_iteration:
            break

In [None]:
# Cross-validation driver: for each hold-out fold (1..3) build samplers and
# loaders, create a fresh pretrained model and run the iteration-based
# train_evaluate defined above.
for fold in range(1, 4):
    
    # NOTE(review): the mixup branch hard-codes 32 * 2 while the other branch
    # uses BATCH_SIZE, and the validation/test samplers hard-code 32 — confirm
    # whether BATCH_SIZE was intended throughout.
    train_sampler = TrainSampler1(
        hdf5_path="./features/waveform_train.h5", 
        holdout_fold = fold, 
        batch_size = 32 * 2 if MIXUP else BATCH_SIZE
    )

    valid_sampler = EvaluateSampler(
        hdf5_path = "./features/waveform_train.h5", 
        holdout_fold = fold, 
        batch_size = 32
    )
    
    test_sampler = TestSampler(
        hdf5_path = "./features/waveform_test.h5", 
        batch_size = 32
    )   
    
    train_dataset = AudioDataset()
    test_dataset = AudioDataset(train = False)

    # Data loader
    # The training and validation loaders share train_dataset; only the batch
    # sampler differs.
    train_loader = DataLoader(dataset = train_dataset, 
        batch_sampler=train_sampler, collate_fn=collate_fn, 
        num_workers=2, pin_memory=True
    )

    valid_loader = DataLoader(dataset = train_dataset, 
        batch_sampler=valid_sampler, collate_fn=collate_fn, 
        num_workers=2, pin_memory=True
    )    
    
    # Built here but not passed to train_evaluate in this cell.
    test_loader = DataLoader(dataset = test_dataset, 
        batch_sampler=test_sampler, collate_fn=collate_fn, 
        num_workers=2, pin_memory=True
    )  
    
#     if 'mixup' in AUGMENTATION:
#         MIXUP_AUGMENTER = Mixup(mixup_alpha = 1.)

    # Reseed before building the model so every fold starts from the same
    # random state.
    random_seed()
    # Resolve the model class from the name stored in MODEL_TYPE.
    # NOTE: eval executes arbitrary code; MODEL_TYPE must be a trusted constant.
    Model = eval(MODEL_TYPE)
    model = Model(SAMPLE_RATE, WINDOW_SIZE, HOP_SIZE, MEL_BINS, FMIN, FMAX, NUM_CLASSES, FREEZE_BASE)
    model.load_from_pretrain(PRETRAINED_CHECKPOINT_PATH)
    model.to(DEVICE)
    
    loss_func = nn.CrossEntropyLoss().to(DEVICE)

    print(f'Fold {fold}')
    
    train_evaluate(model = model, train_loader = train_loader, valid_loader = valid_loader, loss_func = loss_func, device = DEVICE, stop_iteration = 2000)

In [None]:
# Scratch computation: mean of three hand-entered values, minus 2.
# NOTE(review): presumably per-fold validation losses; the purpose of the
# "- 2" offset is not evident from this cell.
np.mean([1.67, 1.34, 1.54]) - 2

In [None]:
# Recorded results (presumably per-fold validation losses at 1700 iterations): 1.66, 1.66, 1.63

In [None]:
# MIXUP OR MIXUP IN AUGMENTATION

In [None]:
def train_evaluate(model, train_loader, valid_loader, test_loader, loss_fn, lr, epochs, warm_up_prop, device, n_samples_train, n_samples_val, fold):
    """Train for a number of epochs, validate after each, then predict on test data.

    After every epoch the model is evaluated on ``valid_loader``; whenever the
    mean validation loss is not higher than the best seen so far, the weights
    are saved to ``model_{fold}.pt``. Test predictions are made with the
    weights as they are after the last epoch (the saved checkpoint is not
    reloaded).

    Args:
      model: called as ``model(waveforms)``; must return a dict with a
        'clipwise_output' entry
      train_loader, valid_loader: iterables of batch dicts with 'waveform'
        and 'target'; ``len(train_loader)`` sizes the LR schedule
      test_loader: iterable of batch dicts with 'waveform'
      loss_fn: callable ``loss_fn(clipwise_outputs, targets)``
      lr: optimizer learning rate, also the OneCycleLR peak (``max_lr``)
      epochs: number of passes over ``train_loader``
      warm_up_prop: only used to compute the printed warm-up step count
      device: device every batch entry is moved to
      n_samples_train, n_samples_val: denominators of the printed accuracies
      fold: used in the checkpoint file name

    Returns:
      (validation_loss, predictions): the mean validation loss of the last
      epoch, and the row-wise softmax of the test outputs stacked over all
      test batches.
    """
    num_training_steps = epochs * len(train_loader)
    # Only printed for reference: OneCycleLR below takes no warm-up step count.
    num_warmup_steps = int(warm_up_prop * num_training_steps)
    print(num_training_steps, num_warmup_steps)
    optimizer = optim.AdamW(model.parameters(), lr = lr, weight_decay = 0.1)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=lr, total_steps = num_training_steps)

    validation_loss = 0
    # np.inf rather than np.Inf: the capitalized alias was removed in NumPy 2.0.
    validation_loss_min = np.inf

    for epoch in range(epochs):
        # Train
        model.train()
        start_time = time.time()

        batch_losses_train = []
        # Counted as plain Python ints so the accuracy below is also defined
        # when a loader yields no batches.
        n_correct = 0

        for batch_index, batch_data in enumerate(train_loader):

            # batch_data is a dict; send every entry to the device
            for key in batch_data:
                batch_data[key] = move_data_to_device(batch_data[key], device)

            waveforms = batch_data["waveform"]
            targets = batch_data["target"]

            # NOTE(review): random_seed() resets the Python/NumPy/torch RNGs to
            # the same fixed seed before every forward pass, so stochastic
            # layers start from the same random stream on each batch. Kept to
            # reproduce the recorded results; confirm whether reseeding once
            # per run was intended.
            random_seed()
            outputs = model(waveforms)
            clipwise_outputs = outputs["clipwise_output"]

            loss = loss_fn(clipwise_outputs, targets)
            batch_losses_train.append(loss.item())
            _, preds = torch.max(clipwise_outputs, dim = 1)
            n_correct += torch.sum(preds == targets).item()

            loss.backward()
            optimizer.step()
            scheduler.step()
            # Gradients are cleared after the step, ready for the next batch.
            optimizer.zero_grad()

            # NOTE(review): hard-coded cap of 68 batches per epoch — confirm it
            # still matches the sampler / batch size in use.
            if batch_index == 67:
                break

        epoch_loss = np.mean(batch_losses_train)

        # Evaluate
        model.eval()

        batch_losses_val = []
        n_correct_val = 0

        with torch.no_grad():
            for batch_data in valid_loader:

                for key in batch_data:
                    batch_data[key] = move_data_to_device(batch_data[key], device)

                waveforms = batch_data["waveform"]
                targets = batch_data["target"]

                random_seed()
                val_outputs = model(waveforms)
                val_clipwise_outputs = val_outputs["clipwise_output"]

                loss = loss_fn(val_clipwise_outputs, targets)
                batch_losses_val.append(loss.item())
                _, val_preds = torch.max(val_clipwise_outputs, dim = 1)
                n_correct_val += torch.sum(val_preds == targets).item()

        epoch_loss_val = np.mean(batch_losses_val)

        if epoch == epochs - 1:
            # The value reported for this run is the last epoch's validation loss.
            validation_loss = epoch_loss_val

        dt = time.time() - start_time
        # Separate name so the ``lr`` argument is not overwritten.
        current_lr = optimizer.param_groups[0]['lr']
        train_acc = n_correct / n_samples_train
        val_acc = n_correct_val / n_samples_val

        print(f'Epoch {epoch + 1}/{epochs} \t lr={current_lr:.1e} \t t={dt:.0f}s \t loss={epoch_loss:.4f}, acc={train_acc:.4f} \t val_loss={epoch_loss_val:.4f}, val_acc={val_acc:.4f}')

        if epoch_loss_val <= validation_loss_min:
            print(f'Validation loss decreased. Saving model... ')
            torch.save(model.state_dict(), f'model_{fold}.pt')
            validation_loss_min = epoch_loss_val

    # Predict on test set
    # Make the mode explicit instead of relying on the last epoch's eval() call.
    model.eval()
    batch_outputs_test = []

    with torch.no_grad():
        for batch_data in test_loader:
            for key in batch_data:
                batch_data[key] = move_data_to_device(batch_data[key], device)

            waveforms = batch_data["waveform"]

            random_seed()
            test_outputs = model(waveforms)
            # Normalize every row of the model output to sum to 1.
            test_outputs = F.softmax(test_outputs["clipwise_output"], dim = 1)
            test_outputs = test_outputs.cpu().detach().numpy()
            batch_outputs_test.append(test_outputs)

    return validation_loss, np.vstack(batch_outputs_test)

In [None]:
def loss_func(output_dict, target_dict):
    """Return the negated mean of the element-wise product of both arguments.

    Despite the parameter names, both arguments are used directly as tensors
    (not dicts).
    """
    weighted = target_dict * output_dict
    return -weighted.mean()

In [None]:
%%time

# Cross-validation driver for the epoch-based train_evaluate: collects one
# validation loss and one array of test predictions per fold.
validation_losses_per_fold = []
test_predicitons_per_fold = []  

# Folds
for fold in range(1, 4):
    # The training batch size is doubled when MIXUP is set.
    train_sampler = TrainSampler1(
        hdf5_path="./features/waveform_train.h5", 
        holdout_fold = fold, 
        batch_size = BATCH_SIZE * 2 if MIXUP else BATCH_SIZE
    )

    valid_sampler = EvaluateSampler(
        hdf5_path = "./features/waveform_train.h5", 
        holdout_fold = fold, 
        batch_size = BATCH_SIZE
    )
    
    test_sampler = TestSampler(
        hdf5_path = "./features/waveform_test.h5", 
        batch_size = BATCH_SIZE
    )   
    
    train_dataset = AudioDataset()
    test_dataset = AudioDataset(train = False)

    # Data loader
    # The training and validation loaders share train_dataset; only the batch
    # sampler differs.
    train_loader = DataLoader(dataset = train_dataset, 
        batch_sampler=train_sampler, collate_fn=collate_fn, 
        num_workers=2, pin_memory=True
    )

    valid_loader = DataLoader(dataset = train_dataset, 
        batch_sampler=valid_sampler, collate_fn=collate_fn, 
        num_workers=2, pin_memory=True
    )    
    
    test_loader = DataLoader(dataset = test_dataset, 
        batch_sampler=test_sampler, collate_fn=collate_fn, 
        num_workers=2, pin_memory=True
    )  
    
#     if 'mixup' in AUGMENTATION:
#         MIXUP_AUGMENTER = Mixup(mixup_alpha = 1.)

    # Reseed before building the model so every fold starts from the same
    # random state.
    random_seed()
    # Resolve the model class from the name stored in MODEL_TYPE.
    # NOTE: eval executes arbitrary code; MODEL_TYPE must be a trusted constant.
    Model = eval(MODEL_TYPE)
    model = Model(SAMPLE_RATE, WINDOW_SIZE, HOP_SIZE, MEL_BINS, FMIN, FMAX, NUM_CLASSES, FREEZE_BASE)
    model.load_from_pretrain(PRETRAINED_CHECKPOINT_PATH)
    model.to(DEVICE)
    
    loss_fn = nn.CrossEntropyLoss().to(DEVICE)

    print(f'Fold {fold}')

    # Train, evaluate, predict
    # NOTE(review): n_samples_train / n_samples_val are hard-coded and only
    # feed the printed accuracies — confirm they match every fold's split.
    validation_loss, test_prediction = train_evaluate(model, train_loader = train_loader, valid_loader = valid_loader, test_loader = test_loader, loss_fn = loss_fn, lr = 1e-3, epochs = 30, warm_up_prop = 0.1, device = DEVICE, n_samples_train = 4339, n_samples_val = 370, fold = fold)
    # Obtain validation result per fold
    validation_losses_per_fold.append(validation_loss)
    # Obtain test predictions per fold
    test_predicitons_per_fold.append(test_prediction)

    

print("=" * 100)
# Print summary validation result of all runs

print(f'Total avg val_loss={np.mean(validation_losses_per_fold)}, S/Dev={np.std(validation_losses_per_fold)}')    

In [None]:
# Recorded result: Total avg val_loss=1.5729644298553467, S/Dev=0.05230955180110382


In [None]:
# Recorded log line: Epoch 20/20 	 lr=6.7e-09 	 t=26s 	 loss=0.9724, acc=0.7412 	 val_loss=1.6636, val_acc=0.6189

In [None]:
# Ensemble the folds: element-wise mean of the per-fold test predictions,
# wrapped in a DataFrame (one column per model output).
tp = np.mean(test_predicitons_per_fold, axis = 0)
# tp = test_predicitons_per_fold[0]
tp = pd.DataFrame(tp)

In [None]:
# Fresh copy of the sample submission, used as the template for the output
# file (the earlier `sample_submission` frame had its `fn` column rewritten).
ss = pd.read_csv("../input/giz-data/giz_data/SampleSubmission.csv")

In [None]:
# Name the prediction columns with the distinct labels ordered by their
# `target` value.
# NOTE(review): assumes model output column i corresponds to target i — confirm.
tp.columns = df_train_new[["label", "target"]].sort_values("target").drop_duplicates().label.values

In [None]:
# Overwrite every column after the first with the averaged predictions.
# NOTE(review): confirm tp's column order/labels match the submission columns.
ss.iloc[:,1:] = tp

In [None]:
# Sanity check: row sums of the prediction columns for the first rows; these
# should be ~1 because the predictions are softmax outputs averaged over folds.
ss.iloc[:,1:].sum(axis = 1).head()

In [None]:
# Largest predicted value in each row of the prediction columns.
ss.iloc[:,1:].max(axis = 1)

In [None]:
# Write the submission file without the DataFrame index column.
ss.to_csv("cnn14_1.csv", index = False)

In [None]:
?optim.lr_scheduler.OneCycleLR

In [None]:
from sklearn import metrics


def calculate_accuracy(y_true, y_score):
    """Return the fraction of rows whose argmax agrees in both arrays.

    Args:
      y_true: array whose last-axis argmax is the reference class
      y_score: array whose last-axis argmax is the predicted class
    Returns:
      number of matching rows divided by ``y_true.shape[0]``
    """
    n_samples = y_true.shape[0]
    reference_classes = np.argmax(y_true, axis=-1)
    predicted_classes = np.argmax(y_score, axis=-1)
    n_matches = np.sum(reference_classes == predicted_classes)
    return n_matches / n_samples


class Evaluator(object):
    """Computes evaluation statistics for a fixed model."""

    def __init__(self, model):
        """Store the model that ``evaluate`` will run."""
        self.model = model

    def evaluate(self, data_loader):
        """Run the model over ``data_loader`` and return its accuracy.

        Args:
          data_loader: iterable of batches accepted by ``forward``; batches
            must carry a 'target' entry.
        Returns:
          dict with a single key 'accuracy'.
        """
        # Forward
        output_dict = forward(
            model=self.model, 
            generator=data_loader, 
            return_target=True)

        clipwise_output = output_dict['clipwise_output']    # (audios_num, classes_num)
        target = output_dict['target']    # (audios_num, classes_num)

        # The confusion matrix that used to be computed here was never
        # returned or printed, so the dead computation has been dropped.
        accuracy = calculate_accuracy(target, clipwise_output)

        return {'accuracy': accuracy}

In [None]:
# Wrap the current global `model` (the one left over from the last fold run).
evaluator = Evaluator(model=model)

In [None]:
?librosa.feature.melspectrogram

# Model Architecture(s) and requirements/models
If this is moved into a separate module, that module should import its own dependencies.

In [None]:
matplotlib==3.0.3
soundfile==0.10.3.post1
librosa==0.6.3
torch==1.0.1.post2
torchlibrosa==0.0.4

pip install -r requirements.txt

others
#librosa

In [None]:
!pip install torchlibrosa

In [None]:
from torchlibrosa.stft import Spectrogram, LogmelFilterBank
from torchlibrosa.augmentation import SpecAugmentation

In [None]:
def init_layer(layer):
    """Xavier-uniform initialize a layer's weight and zero its bias, if any."""
    nn.init.xavier_uniform_(layer.weight)

    # Layers built with bias=False expose ``bias`` as None; skip those.
    bias = getattr(layer, 'bias', None)
    if bias is not None:
        bias.data.fill_(0.)

In [None]:
def init_bn(bn):
    """Reset a batch-norm layer to the identity affine transform."""
    shift, scale = bn.bias, bn.weight
    shift.data.fill_(0.)
    scale.data.fill_(1.)

In [None]:
class ConvBlock(nn.Module):
    """Two conv -> batch-norm -> ReLU stages followed by 2-D pooling."""

    def __init__(self, in_channels, out_channels):
        super().__init__()

        # Both convolutions share the same geometry: 3x3 kernel, stride 1,
        # padding 1 (spatial size is preserved) and no bias term.
        conv_kwargs = dict(kernel_size=(3, 3), stride=(1, 1),
                           padding=(1, 1), bias=False)
        self.conv1 = nn.Conv2d(in_channels=in_channels,
                               out_channels=out_channels, **conv_kwargs)
        self.conv2 = nn.Conv2d(in_channels=out_channels,
                               out_channels=out_channels, **conv_kwargs)

        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

        self.init_weight()

    def init_weight(self):
        """Initialize both convolutions, then both batch-norm layers."""
        for conv in (self.conv1, self.conv2):
            init_layer(conv)
        for bn in (self.bn1, self.bn2):
            init_bn(bn)

    def forward(self, input, pool_size=(2, 2), pool_type='avg'):
        """Apply both conv stages and pool the result.

        Args:
          input: tensor fed to the first convolution
          pool_size: kernel size of the pooling step
          pool_type: 'avg', 'max' or 'avg+max' (sum of both poolings)

        Raises:
          Exception: if ``pool_type`` is none of the supported values (raised
            after the convolutions have run).
        """
        features = F.relu_(self.bn1(self.conv1(input)))
        features = F.relu_(self.bn2(self.conv2(features)))

        if pool_type == 'avg':
            return F.avg_pool2d(features, kernel_size=pool_size)
        if pool_type == 'max':
            return F.max_pool2d(features, kernel_size=pool_size)
        if pool_type == 'avg+max':
            averaged = F.avg_pool2d(features, kernel_size=pool_size)
            maximum = F.max_pool2d(features, kernel_size=pool_size)
            return averaged + maximum

        raise Exception('Incorrect argument!')

In [None]:
class AttBlock(nn.Module):
    """Attention pooling over the last axis using two 1x1 convolutions.

    ``att`` produces attention logits and ``cla`` produces per-step class
    values; the output is the attention-weighted sum of the class values.
    ``temperature`` and ``bn_att`` are stored/initialized but not used in
    ``forward``.
    """

    def __init__(self, n_in, n_out, activation='linear', temperature=1.):
        super().__init__()

        self.activation = activation
        self.temperature = temperature

        # Both branches are point-wise (kernel size 1) convolutions with bias.
        pointwise = dict(kernel_size=1, stride=1, padding=0, bias=True)
        self.att = nn.Conv1d(in_channels=n_in, out_channels=n_out, **pointwise)
        self.cla = nn.Conv1d(in_channels=n_in, out_channels=n_out, **pointwise)

        self.bn_att = nn.BatchNorm1d(n_out)
        self.init_weights()

    def init_weights(self):
        """Initialize both convolutions and the batch-norm layer."""
        init_layer(self.att)
        init_layer(self.cla)
        init_bn(self.bn_att)

    def forward(self, x):
        """Return ``(pooled, norm_att, cla)`` for ``x``.

        x: (n_samples, n_in, n_time)
        """
        # Logits are clamped to [-10, 10] before the softmax over the last axis.
        attention_logits = torch.clamp(self.att(x), -10, 10)
        norm_att = torch.softmax(attention_logits, dim=-1)
        cla = self.nonlinear_transform(self.cla(x))
        pooled = torch.sum(norm_att * cla, dim=2)
        return pooled, norm_att, cla

    def nonlinear_transform(self, x):
        """Apply the configured activation; returns None for unknown names."""
        if self.activation == 'sigmoid':
            return torch.sigmoid(x)
        if self.activation == 'linear':
            return x
        return None


In [None]:
class Cnn14(nn.Module):
    """CNN14 audio tagger operating on log-mel spectrograms.

    The network turns a raw waveform into a log-mel spectrogram, runs it
    through six ``ConvBlock`` stages, pools over frequency and time, and
    produces sigmoid class scores together with a 2048-d embedding.
    """

    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
                 fmax, classes_num):
        super().__init__()

        # Waveform -> spectrogram (parameters frozen, i.e. not trained).
        self.spectrogram_extractor = Spectrogram(
            n_fft=window_size, hop_length=hop_size, win_length=window_size,
            window='hann', center=True, pad_mode='reflect',
            freeze_parameters=True)

        # Spectrogram -> log-mel filter bank (parameters frozen).
        self.logmel_extractor = LogmelFilterBank(
            sr=sample_rate, n_fft=window_size, n_mels=mel_bins,
            fmin=fmin, fmax=fmax, ref=1.0, amin=1e-10, top_db=None,
            freeze_parameters=True)

        # Time/frequency masking, applied in training mode only (see forward).
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64, time_stripes_num=2,
            freq_drop_width=8, freq_stripes_num=2)

        # NOTE: fixed at 64 features regardless of the mel_bins argument.
        self.bn0 = nn.BatchNorm2d(64)

        # conv_block1 .. conv_block6, each widening the channel dimension.
        channel_plan = (1, 64, 128, 256, 512, 1024, 2048)
        for index, (n_in, n_out) in enumerate(zip(channel_plan, channel_plan[1:]), start=1):
            setattr(self, f'conv_block{index}', ConvBlock(in_channels=n_in, out_channels=n_out))

        self.fc1 = nn.Linear(2048, 2048, bias=True)
        self.fc_audioset = nn.Linear(2048, classes_num, bias=True)

        self.init_weight()

    def init_weight(self):
        """Initialize the input batch-norm and the two fully connected layers."""
        init_bn(self.bn0)
        for linear in (self.fc1, self.fc_audioset):
            init_layer(linear)

    def forward(self, input, mixup_lambda=None):
        """Compute clip-level scores and the embedding for a batch of waveforms.

        Args:
          input: (batch_size, data_length)
          mixup_lambda: mixing coefficients handed to ``do_mixup``; only used
            in training mode, ignored when None.

        Returns:
          dict with 'clipwise_output' (sigmoid of ``fc_audioset``) and
          'embedding' (dropout applied to the ``fc1`` activations).
        """
        spectrogram = self.spectrogram_extractor(input)   # (batch_size, 1, time_steps, freq_bins)
        x = self.logmel_extractor(spectrogram)    # (batch_size, 1, time_steps, mel_bins)

        # Swap dims 1 and 3 so bn0 treats the mel bins as its channel axis,
        # then swap back.
        x = self.bn0(x.transpose(1, 3)).transpose(1, 3)

        if self.training:
            x = self.spec_augmenter(x)
            # Mixup on spectrogram
            if mixup_lambda is not None:
                x = do_mixup(x, mixup_lambda)

        # The first five blocks pool by (2, 2); each is followed by dropout.
        for index in range(1, 6):
            conv_block = getattr(self, f'conv_block{index}')
            x = conv_block(x, pool_size=(2, 2), pool_type='avg')
            x = F.dropout(x, p=0.2, training=self.training)

        # The last block keeps its resolution (pool size 1x1).
        x = self.conv_block6(x, pool_size=(1, 1), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)

        # Average over the last axis, then combine max and mean over axis 2.
        x = torch.mean(x, dim=3)
        pooled = torch.max(x, dim=2)[0] + torch.mean(x, dim=2)

        pooled = F.dropout(pooled, p=0.5, training=self.training)
        hidden = F.relu_(self.fc1(pooled))
        embedding = F.dropout(hidden, p=0.5, training=self.training)
        clipwise_output = torch.sigmoid(self.fc_audioset(hidden))

        return {'clipwise_output': clipwise_output, 'embedding': embedding}

In [None]:
class Transfer_Cnn14(nn.Module):
    """Task-specific classifier head on top of a ``Cnn14`` backbone."""

    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, 
        fmax, classes_num, freeze_base):
        """Classifier for a new task using pretrained Cnn14 as a sub module.

        Args:
          sample_rate, window_size, hop_size, mel_bins, fmin, fmax: front-end
            settings forwarded unchanged to ``Cnn14``.
          classes_num: number of output classes of the new task.
          freeze_base: when truthy, every backbone parameter gets
            ``requires_grad = False`` so only ``fc_transfer`` is trained.
        """
        super(Transfer_Cnn14, self).__init__()
        # The backbone keeps a 527-way output layer so that its parameter
        # shapes match the checkpoint loaded in load_from_pretrain.
        audioset_classes_num = 527
        
        self.base = Cnn14(sample_rate, window_size, hop_size, mel_bins, fmin, 
            fmax, audioset_classes_num)

        # Transfer to another task layer
        self.fc_transfer = nn.Linear(2048, classes_num, bias=True)

        if freeze_base:
            # Freeze AudioSet pretrained layers
            for param in self.base.parameters():
                param.requires_grad = False

        self.init_weights()

    def init_weights(self):
        """Initialize the new task layer (the backbone initializes itself)."""
        init_layer(self.fc_transfer)

    def load_from_pretrain(self, pretrained_checkpoint_path):
        """Load backbone weights from a checkpoint file.

        Args:
          pretrained_checkpoint_path: path to a file readable by
            ``torch.load`` whose 'model' entry is a state dict for the
            backbone.
        """
        # map_location='cpu' lets a checkpoint that was saved from GPU tensors
        # load on a machine without CUDA; load_state_dict then copies the
        # values into the parameters on whatever device they live on.
        # NOTE: torch.load unpickles the file, so only load trusted checkpoints.
        checkpoint = torch.load(pretrained_checkpoint_path, map_location='cpu')
        self.base.load_state_dict(checkpoint['model'])

    def forward(self, input, mixup_lambda=None):
        """Input: (batch_size, data_length)

        Returns the backbone's output dict with 'clipwise_output' replaced by
        the log-softmax of the task layer applied to the backbone embedding.
        """
        output_dict = self.base(input, mixup_lambda)
        embedding = output_dict['embedding']

        clipwise_output = torch.log_softmax(self.fc_transfer(embedding), dim=-1)
        output_dict['clipwise_output'] = clipwise_output
 
        return output_dict