In [119]:
import os
import numpy as np
import pandas as pd
import librosa
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data
from torch.utils.data.sampler import SubsetRandomSampler

import matplotlib.pyplot as plt
%matplotlib inline

In [32]:
# Load the training metadata table (one row per audio clip).
train_df = pd.read_csv('../data/train.csv')

In [33]:
# Preview the first rows to check the columns that were read.
train_df.head()

Unnamed: 0,fname,label,manually_verified
0,00044347.wav,Hi-hat,0
1,001ca53d.wav,Saxophone,1
2,002d256b.wav,Trumpet,0
3,0033e230.wav,Glockenspiel,1
4,00353774.wav,Cello,1


## Dataset

In [37]:
# Map each string label to an integer class id and keep the fitted encoder
# around so ids can be translated back to names via le.classes_.
le = LabelEncoder()
train_df['label_idx'] = le.fit_transform(train_df['label'])
train_df.head()

Unnamed: 0,fname,label,manually_verified,label_idx
0,00044347.wav,Hi-hat,0,23
1,001ca53d.wav,Saxophone,1,30
2,002d256b.wav,Trumpet,0,38
3,0033e230.wav,Glockenspiel,1,19
4,00353774.wav,Cello,1,6


In [38]:
# Class names in encoder order: position i is the name for label_idx == i.
le.classes_

array(['Acoustic_guitar', 'Applause', 'Bark', 'Bass_drum',
       'Burping_or_eructation', 'Bus', 'Cello', 'Chime', 'Clarinet',
       'Computer_keyboard', 'Cough', 'Cowbell', 'Double_bass',
       'Drawer_open_or_close', 'Electric_piano', 'Fart',
       'Finger_snapping', 'Fireworks', 'Flute', 'Glockenspiel', 'Gong',
       'Gunshot_or_gunfire', 'Harmonica', 'Hi-hat', 'Keys_jangling',
       'Knock', 'Laughter', 'Meow', 'Microwave_oven', 'Oboe', 'Saxophone',
       'Scissors', 'Shatter', 'Snare_drum', 'Squeak', 'Tambourine',
       'Tearing', 'Telephone', 'Trumpet', 'Violin_or_fiddle', 'Writing'],
      dtype=object)

In [146]:
# Number of distinct classes; used as the size of the model's output layer.
num_classes = len(le.classes_)
print(num_classes)

41


In [157]:
# Persist the class names so label ids can be decoded without the encoder.
# le.classes_ has dtype=object, which np.save would pickle and np.load would
# then refuse to read unless allow_pickle=True; casting to a plain string
# dtype keeps the file loadable with the default (safe) settings.
np.save('labels.npy', le.classes_.astype(str))

In [158]:
# Read the saved class names back from disk.
# NOTE(review): np.load rejects object-dtype arrays unless allow_pickle=True,
# so this only works if the array on disk has a plain (non-object) dtype.
labels = np.load('labels.npy')

In [162]:
# Load the sample submission and preview it; its label column holds several
# space-separated class names per file.
test_df = pd.read_csv('../data/sample_submission.csv')
test_df.head()

Unnamed: 0,fname,label
0,00063640.wav,Laughter Hi-Hat Flute
1,0013a1db.wav,Laughter Hi-Hat Flute
2,002bb878.wav,Laughter Hi-Hat Flute
3,002d392d.wav,Laughter Hi-Hat Flute
4,00326aa9.wav,Laughter Hi-Hat Flute


In [130]:
def random_crop(y, max_length=176400):
    """Force a 1-D waveform to exactly ``max_length`` samples.

    A waveform longer than ``max_length`` is cut down to a randomly positioned
    window of that length. A shorter waveform is zero-padded, with the
    original samples placed at a random position inside the padded output.
    A waveform that already has ``max_length`` samples keeps its content.

    Randomness comes from NumPy's global RNG, so ``np.random.seed`` controls it.

    Args:
        y: 1-D NumPy array of samples.
        max_length: Target length in samples.

    Returns:
        A 1-D array with ``max_length`` samples.
    """
    surplus = len(y) - max_length
    if surplus > 0:
        # randint's upper bound is exclusive, so "+ 1" is required for the
        # last valid window (the one ending at the final sample) to be drawn.
        offset = np.random.randint(surplus + 1)
        return y[offset:offset + max_length]

    deficit = -surplus
    # Same reasoning for padding: offsets 0..deficit inclusive are all valid,
    # the largest one putting every zero in front of the signal.
    offset = np.random.randint(deficit + 1)
    return np.pad(y, (offset, deficit - offset), 'constant')


class AudioDataset(torch.utils.data.Dataset):
    """Dataset that turns wav files into fixed-size mel spectrogram tensors.

    Each item is computed on the fly: the clip is loaded, randomly cropped or
    zero-padded to ``max_length`` seconds, and converted with
    ``librosa.feature.melspectrogram``. No log scaling or normalisation is
    applied to the spectrogram.

    Args:
        df: DataFrame with an ``fname`` column (file names relative to
            ``wav_dir``) and a ``label_idx`` column (integer class ids).
        wav_dir: Directory containing the wav files.
        sr: Sampling rate passed to ``librosa.load``.
        max_length: Clip length in seconds after cropping/padding.
        window_size: STFT window length in seconds.
        hop_size: STFT hop length in seconds.
        n_mels: Number of mel bands.

    Raises:
        FileNotFoundError: If ``wav_dir`` is not an existing directory.
    """

    def __init__(self, df, wav_dir, sr=None, max_length=4.0, window_size=0.02, hop_size=0.01, n_mels=64):
        # Raise instead of calling exit(): exit() is an interactive helper
        # that would terminate the interpreter/kernel rather than report an
        # error the caller can see and handle.
        if not os.path.isdir(wav_dir):
            raise FileNotFoundError('ERROR: not found %s' % wav_dir)
        self.df = df
        self.wav_dir = wav_dir
        self.sr = sr
        self.max_length = max_length     # sec
        self.window_size = window_size   # sec
        self.hop_size = hop_size         # sec
        self.n_mels = n_mels

    def __len__(self):
        """Return the number of clips (rows of the DataFrame)."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(tensor, label)`` for the clip at position ``index``.

        ``tensor`` is a float tensor laid out as (channel, features, frames)
        with a single channel; ``label`` is the row's ``label_idx`` value.
        """
        # Positional lookup: samplers hand out positions 0..len-1, which only
        # coincide with index labels when the DataFrame has a default index.
        fname = self.df['fname'].iloc[index]
        fpath = os.path.join(self.wav_dir, fname)
        y, sr = librosa.load(fpath, sr=self.sr)
        y = random_crop(y, int(self.max_length * sr))

        # feature: window/hop are given in seconds, librosa wants samples
        n_fft = int(self.window_size * sr)
        hop_length = int(self.hop_size * sr)
        mel = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=self.n_mels)
        # (channel, features, frames): add a leading singleton channel axis
        mel = mel[np.newaxis, :, :]
        tensor = torch.from_numpy(mel).float()

        # label
        label = self.df['label_idx'].iloc[index]

        return tensor, label

- メルスペクトログラムの標準化はどうする？
- ファイルごとに平均0、標準偏差1でよいか？ -> GCommandsPytorch

In [131]:
# Smoke-test the dataset: build it, fetch one item, and check the tensor shape
# and that the integer label maps back to the expected class name.
train_dataset = AudioDataset(train_df, '../data/audio_train')
print(len(train_dataset))
data, target = train_dataset[0]
print(data.size(), target, le.classes_[target])

9473
torch.Size([1, 64, 401]) 23 Hi-hat


## DataLoader

In [132]:
# Build the dataset that the train and validation loaders below both wrap.
train_dataset = AudioDataset(train_df, '../data/audio_train')

In [133]:
seed = 0
valid_size = 0.1
batch_size = 128
num_workers = 0

# Seed both RNGs from the single `seed` setting: NumPy drives the shuffle
# below and random_crop, torch drives the SubsetRandomSampler ordering.
np.random.seed(seed)
torch.manual_seed(seed)

# Split the item positions into a training part and a validation part.
num_train = len(train_dataset)
indices = list(range(num_train))
split = int(valid_size * num_train)
np.random.shuffle(indices)
train_idx, valid_idx = indices[split:], indices[:split]

train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    sampler=train_sampler,
    drop_last=True,
    num_workers=num_workers
)

# NOTE(review): drop_last=True here means the final partial batch
# (len(valid_idx) % batch_size samples) is never evaluated; consider
# drop_last=False for validation if every sample should count.
valid_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    sampler=valid_sampler,
    drop_last=True,
    num_workers=num_workers
)

In [134]:
# Number of batches per epoch for each loader.
len(train_loader), len(valid_loader)

(66, 7)

In [135]:
# Pull a single batch to check its shapes. Use the built-in next(): the
# iterator's `.next()` method is the Python 2 spelling and is not available
# on current DataLoader iterators.
data, target = next(iter(train_loader))
print(data.size())
print(target.size())

torch.Size([128, 1, 64, 401])
torch.Size([128])


## Model

In [150]:
class LeNet2D(nn.Module):
    """LeNet-style 2-D CNN: two conv/ReLU/max-pool blocks and two linear layers.

    Args:
        n_classes: Size of the output layer. When None, the notebook-level
            ``num_classes`` variable is used.
        input_shape: ``(features, frames)`` of one input spectrogram. It fixes
            the input size of the first linear layer; the default (64, 401)
            gives 25220 flattened features.
        verbose: When True, ``forward`` prints the tensor size after every
            stage. Pass False for training to avoid flooding the output.

    Raises:
        ValueError: If ``input_shape`` is too small for the two blocks.
    """

    def __init__(self, n_classes=None, input_shape=(64, 401), verbose=True):
        super(LeNet2D, self).__init__()
        if n_classes is None:
            # Fall back to the notebook-level class count.
            n_classes = num_classes
        self.verbose = verbose

        self.block1 = nn.Sequential(
            nn.Conv2d(1, 20, kernel_size=5),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        self.block2 = nn.Sequential(
            nn.Conv2d(20, 20, kernel_size=5),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.fc1 = nn.Linear(self._flat_features(input_shape), 1000)
        self.fc2 = nn.Linear(1000, n_classes)

    @staticmethod
    def _flat_features(input_shape, channels=20, kernel_size=5, n_blocks=2):
        """Return the flattened feature count after the conv/pool blocks."""
        height, width = input_shape
        for _ in range(n_blocks):
            # Unpadded conv shrinks each side by kernel_size - 1; the 2x2
            # stride-2 max-pool then halves it (rounding down).
            height = (height - kernel_size + 1) // 2
            width = (width - kernel_size + 1) // 2
        if height <= 0 or width <= 0:
            raise ValueError('input_shape %r is too small for the network' % (tuple(input_shape),))
        return channels * height * width

    def _trace(self, x):
        """Print the size of ``x`` when verbose, then hand it back."""
        if self.verbose:
            print(x.size())
        return x

    def forward(self, x):
        """Map a (batch, 1, features, frames) tensor to (batch, n_classes) scores."""
        self._trace(x)
        x = self._trace(self.block1(x))
        x = self._trace(self.block2(x))
        # Flatten everything but the batch axis for the linear layers.
        x = self._trace(x.view(x.size(0), -1))
        x = self._trace(F.relu(self.fc1(x)))
        return self._trace(self.fc2(x))

In [152]:
# Instantiate the network with freshly initialised (untrained) weights.
model = LeNet2D()

In [153]:
# Run one batch through the model as a shape check; the forward pass prints
# the tensor size after each stage.
model(data)

torch.Size([128, 1, 64, 401])
torch.Size([128, 20, 30, 198])
torch.Size([128, 20, 13, 97])
torch.Size([128, 25220])
torch.Size([128, 1000])
torch.Size([128, 41])


tensor([[ 3.4669e-02,  1.0678e-02, -1.6374e-02,  ..., -1.5601e-02,
          7.3538e-03,  2.2586e-02],
        [ 4.0632e-02,  8.4185e-03, -3.2735e-02,  ..., -5.0126e-03,
         -5.3709e-03,  1.9890e-02],
        [ 3.4416e-02,  1.2960e-02, -1.8639e-02,  ..., -1.4152e-02,
          3.2253e-03,  2.2069e-02],
        ...,
        [ 1.9631e-01, -1.0667e-01,  2.7603e-01,  ...,  1.1381e-01,
         -6.7733e-01,  9.8624e-01],
        [ 3.3981e-02,  1.1522e-02, -1.5906e-02,  ..., -1.5315e-02,
          7.8248e-03,  2.0316e-02],
        [ 2.2161e-01,  2.2645e-02, -2.0796e-02,  ..., -1.8583e-01,
         -1.8259e-01,  7.2796e-02]])