https://www.kaggle.com/alanchn31/free-spoken-digits

In [1]:
#from google.colab import drive
#drive.mount("/content/gdrive")  

In [2]:
#!tar xvzf recordings.tar.gz

In [3]:
#!pip install torchaudio

In [3]:
import torch
import torchaudio
import numpy as np

In [8]:
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch import optim
from itertools import islice
from tqdm import tqdm

In [9]:
import pathlib

In [17]:
test_indexes = [] # np.zeros(2*600)
for i in range(60):
    #test_indexes[2*i:2*(i+1)] = np.random.choice(50, 2) + 50*i 
    test_indexes = test_indexes + list(np.random.choice(50, 2)+ 50*i)
train_indexes = [i for i in range(2900)]
train_indexes = list(set(train_indexes) - set(test_indexes))

In [18]:
path_to_data = pathlib.Path('./recordings')
paths = list(path_to_data.rglob('*.wav'))
paths_train = [paths[i] for  i in train_indexes] 
paths_test = [paths[i] for  i in test_indexes]

In [19]:
class AudioMnist(Dataset):
    def __init__(self, path_to_data):
        #self.path_to_data = pathlib.Path(path_to_data)
        #self.paths = list(self.path_to_data.rglob('*.wav'))
        self.paths = path_to_data
        self.featurizer = torchaudio.transforms.MelSpectrogram( \
            sample_rate=16000, n_fft=1024, win_length=1024,hop_length=256, n_mels=80)
        
    def __getitem__(self, index):
        path_to_wav = self.paths[index].as_posix()
        wav, _ = torchaudio.load(path_to_wav)
        mel_spec = self.featurizer(wav).squeeze(dim=0).clamp(1e-5).log()
        label = int(path_to_wav.split('/')[-1].split('_')[0])
        return mel_spec, label
        
    def __len__(self):
        return len(self.paths)

In [20]:
dataset = AudioMnist(paths_train)
dataset_test = AudioMnist(paths_test)

In [21]:
dataset[5]

(tensor([[-10.2543,  -9.1099, -10.1668,  ...,  -7.6816,  -8.2048,  -8.6610],
         [ -9.3566,  -8.5928,  -9.0443,  ...,  -8.1125,  -9.2040,  -8.3888],
         [ -8.7034,  -8.1926,  -9.4329,  ...,  -7.6201,  -8.3900,  -8.2539],
         ...,
         [ -4.3113,  -5.4017,  -8.2199,  ...,  -8.9894,  -8.5764,  -8.2613],
         [ -5.0711,  -6.1382,  -8.4130,  ...,  -8.8574,  -8.6966,  -7.9494],
         [ -6.5580,  -7.4778,  -8.5633,  ...,  -8.9127,  -8.8966,  -8.6079]]),
 4)

In [22]:
def collate_fn_padd(batch):
    '''
    Padds batch of variable length

    note: it converts things ToTensor manually here since the ToTensor transform
    assume it takes in images rather than arbitrary tensors.
    '''
    data = torch.nn.utils.rnn.pad_sequence([item[0].transpose(-1,-2) for item in batch], batch_first=True, padding_value=np.log(1e-5))
    target = torch.LongTensor([item[1] for item in batch])
    
    return data.transpose(-1,-2), target

In [23]:
dataloader = DataLoader(dataset, collate_fn=collate_fn_padd, shuffle=True, batch_size=32)

In [None]:
for i, batch in enumerate(dataloader):
    mel_spec, label = batch
    print(mel_spec.shape)
    if i == 2:
        break

torch.Size([32, 80, 56])
torch.Size([32, 80, 61])
torch.Size([32, 80, 52])


In [24]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.rnn = nn.LSTM(input_size=80, hidden_size=256, batch_first=True)
        self.clf = nn.Linear(256, 10)
        self.s = nn.Softmax(dim=1)
    def forward(self, input):
        output, _ = self.rnn(input.transpose(-1,-2))
        output = self.clf(output[:,-1])
        output = self.s(output)
        return output

In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [26]:
model = Model().to(device)

In [27]:
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

In [29]:
for epoc in range(10):
    for i, batch in tqdm(enumerate(dataloader)):
        mels, labels = batch
        logits = model(mels.to(device))
        loss = criterion(logits,labels.to(device))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    #if i%50 ==0:
    print(loss.item())

88it [00:01, 63.60it/s]


2.1963295936584473


88it [00:01, 61.23it/s]


1.8848272562026978


88it [00:01, 63.72it/s]


1.5498021841049194


88it [00:01, 66.42it/s]


1.52317476272583


88it [00:01, 64.07it/s]


1.4739850759506226


88it [00:01, 65.82it/s]


1.4625533819198608


88it [00:01, 66.24it/s]


1.462689757347107


88it [00:01, 65.09it/s]


1.4631344079971313


88it [00:01, 63.68it/s]


1.4732743501663208


88it [00:01, 62.54it/s]

1.46268892288208





In [32]:
predict = []
target = []
for i in range(len(dataset_test)):
    wav = dataset_test[i][0].unsqueeze(dim = 0).to(device)
    predict.append(int(model(wav).to('cpu').detach().argmax().numpy()))
    target.append(dataset_test[i][1])

In [None]:
#model(wav).detach().argmax().numpy()

array(6, dtype=int64)

In [33]:
from sklearn.metrics import accuracy_score
accuracy_score(predict, target)

0.9833333333333333

In [36]:
torch.save(model.state_dict(), 'weights.pth')

In [None]:
from ipywebrtc import VideoStream, AudioStream, AudioRecorder, CameraStream
from IPython.display import Audio

In [None]:
camera = CameraStream(constraints={'audio': True,'video':False})
recorder = AudioRecorder(stream=camera)
recorder

AudioRecorder(audio=Audio(value=b'', format='webm'), stream=CameraStream(constraints={'audio': True, 'video': …

In [None]:
with open('recording.webm', 'wb') as f:
    f.write(recorder.audio.value)
!C:\ffmpeg\bin\ffmpeg -i recording.webm -ac 1 -f wav file.wav -y -hide_banner -loglevel panic
sig, sr = torchaudio.load("file.wav")

In [None]:
sr

48000

In [None]:
wav = torchaudio.transforms.Resample(sr, 16000)(sig)

In [None]:
torchaudio.save('file1.wav', wav, 16000)

In [None]:
sig, sr = torchaudio.load("file1.wav")
Audio(data=sig, rate=sr)

In [None]:
wav = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=1024, win_length=1024,hop_length=256, n_mels=80)(sig)

In [None]:
int(model(wav).detach().argmax().numpy())

9

In [None]:
wav.shape

torch.Size([1, 80, 98])