In [19]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torchaudio
import numpy as np
import random
import os

In [20]:
# ---- Configuration ---------------------------------------------------------
SEED = 42               # single seed shared by every RNG used in this notebook
SAMPLE_RATE = 16000
DURATION = 1.0          # 1 second clips
NUM_SAMPLES = int(SAMPLE_RATE * DURATION)
BATCH_SIZE = 32
EPOCHS = 10
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# The notebook draws labels with `random`, audio with `np.random`, and model
# weights / loader shuffling with torch. Seed all three so that
# Restart Kernel -> Run All regenerates the same data and initial weights.
# NOTE: some GPU kernels may still be non-deterministic; this only fixes the RNG streams.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [21]:
# Function to generate tone

def generate_tone(frequency,sample_rate=SAMPLE_RATE,duration=DURATION):
  """Synthesize a pure sine tone.

  Args:
    frequency: tone frequency (cycles per unit of `duration`; Hz when
      `duration` is in seconds).
    sample_rate: samples per unit of `duration`.
    duration: clip length.

  Returns:
    1-D float32 NumPy array with int(sample_rate*duration) samples and
    peak amplitude 0.5.
  """
  n_points = int(sample_rate*duration)
  # endpoint=False: sample times cover [0, duration) without including `duration` itself.
  timeline = np.linspace(0, duration, num=n_points, endpoint=False)
  two_pi = 2*np.pi
  phase = two_pi*timeline*frequency
  wave = 0.5*np.sin(phase)
  return wave.astype(np.float32)

# Function to generate noise
def generate_noise(sample_rate=SAMPLE_RATE,duration=DURATION):
  """Draw Gaussian noise (mean 0, standard deviation 0.3).

  Args:
    sample_rate: samples per unit of `duration`.
    duration: clip length.

  Returns:
    1-D float32 NumPy array with int(sample_rate*duration) samples, drawn
    from NumPy's global random state.
  """
  length = int(sample_rate*duration)
  samples = np.random.normal(loc=0, scale=0.3, size=length)
  return samples.astype(np.float32)

# Build one synthetic audio clip for a given label

def create_samples(label):
  """Create one synthetic audio clip for the given class label.

  Args:
    label: 1 for the positive ("wake word") class; any other value is
      treated as the negative class.

  Returns:
    1-D float32 torch tensor. Positive clips are a tone at 440, 550 or 660
    (chosen at random) plus noise scaled by 0.2; negative clips are the
    unscaled noise alone.
  """
  if label != 1:
    # No wake word: noise only.
    waveform = generate_noise()
  else:
    # Wake word: pick the pitch first, then draw the background noise.
    pitch = np.random.choice([440,550,660])
    carrier = generate_tone(frequency=pitch)
    background = generate_noise()
    waveform = carrier + 0.2*background
  return torch.tensor(waveform,dtype=torch.float32)


In [50]:
# Creating Custom Dataset

class WakeWordDataset(Dataset):
  """In-memory dataset of synthetic clips, served as MFCC features.

  All raw waveforms are generated once in the constructor; the MFCC
  transform is applied lazily on every `__getitem__` call.
  """

  def __init__(self,num_samples):
    """Generate `num_samples` clips, each with a label picked at random from {0, 1}."""
    # MFCC front end: 40 coefficients computed from a 40-band mel spectrogram.
    mel_settings = {
        "n_fft": 512,          # FFT window size
        "hop_length": 256,     # step between frames
        "n_mels": 40,          # number of mel filters
    }
    self.mfcc = torchaudio.transforms.MFCC(
        sample_rate=SAMPLE_RATE,
        n_mfcc=40,
        melkwargs=mel_settings,
        log_mels=True,
    )

    self.labels = []
    self.data = []
    for _ in range(num_samples):
      target = random.choice([0,1])
      clip = create_samples(target)
      self.labels.append(target)
      self.data.append(clip)

  def __len__(self):
    """Number of stored clips."""
    return len(self.data)

  def __getitem__(self,index):
    """Return `(features, label)` for the clip at `index`.

    `label` is a float32 scalar tensor. `features` is the MFCC of the stored
    waveform with a singleton dimension inserted so it can feed a Conv2d.
    """
    features = self.mfcc(self.data[index])
    if features.dim() == 2:
      # 2-D MFCC output: prepend a channel axis.
      features = features.unsqueeze(0)
    elif features.shape[0] != 1:
      # Higher-rank output whose leading dim is not 1: insert the axis at position 1.
      features = features.unsqueeze(1)
    target = torch.tensor(self.labels[index],dtype=torch.float32)
    return features,target



In [60]:
# TinyKWS

class TinyKWS(nn.Module):
  """Small CNN binary classifier.

  Two 3x3 convolutions (1 -> 8 -> 16 channels) with ReLU, a 2x2 max-pool
  after the first, global average pooling, and one linear unit followed by
  a sigmoid. Input is a 4-D tensor with a single channel; output has shape
  (batch, 1) with values produced by `torch.sigmoid`.
  """

  def __init__(self):
    super().__init__()
    # Layers are created in forward order so parameter names (cnn.0, cnn.3, fc)
    # and the order of random initialisation stay stable.
    feature_layers = [
        nn.Conv2d(1, 8, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.MaxPool2d(2),
        nn.Conv2d(8, 16, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.AdaptiveAvgPool2d(1),   # collapse each feature map to a single value
    ]
    self.cnn = nn.Sequential(*feature_layers)
    self.fc = nn.Linear(16, 1)

  def forward(self,x):
    """Map a batch of single-channel feature maps to per-sample sigmoid outputs."""
    pooled = self.cnn(x)
    flat = pooled.view(pooled.shape[0], -1)
    score = self.fc(flat)
    return torch.sigmoid(score)

In [61]:
# TRAINING LOOP
def train_model(model, loader, criterion, optimizer,device=DEVICE):
    """Run one training pass over `loader` and return the mean batch loss.

    Args:
        model: module to optimise; switched to training mode.
        loader: iterable of `(inputs, targets)` batches that supports `len()`.
        criterion: loss callable taking `(outputs, targets)`.
        optimizer: optimizer stepping the model's parameters.
        device: device the batches are moved to before the forward pass.

    Returns:
        Sum of the per-batch loss values divided by `len(loader)`.

    Raises:
        ZeroDivisionError: if `len(loader)` is 0.
    """
    model.train()
    batch_losses = []
    for inputs, targets in loader:
        inputs = inputs.to(device)
        # Add a trailing axis to the targets before they are passed to `criterion`.
        targets = targets.to(device).unsqueeze(1)

        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(loader)

# EVALUATION LOOP
def evaluate_model(model, loader, device=None):
  """Compute classification accuracy of `model` over `loader`.

  Outputs above 0.5 are counted as class 1, everything else as class 0.

  Args:
    model: module returning one probability per sample; switched to eval mode.
    loader: iterable of `(inputs, labels)` batches.
    device: device to move each batch to. When None, the device of the
      model's first parameter is used (falling back to the notebook-level
      `DEVICE` for a parameter-free model), so batches always land where
      the weights are.

  Returns:
    Fraction of samples whose thresholded prediction equals the label.

  Raises:
    ZeroDivisionError: if `loader` yields no samples.
  """
  if device is None:
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else DEVICE

  model.eval()
  correct, total = 0, 0
  with torch.no_grad():
    for x, y in loader:
      x, y = x.to(device), y.to(device)
      # Flatten both sides to (batch,) before comparing. A bare squeeze()
      # turns a one-sample batch into a 0-dim tensor, and comparing (B,)
      # predictions with (B, 1) labels would broadcast to (B, B) and
      # over-count matches.
      preds = (model(x).reshape(-1) > 0.5).float()
      labels = y.reshape(-1)
      correct += (preds == labels).sum().item()
      total += labels.numel()
  return correct / total

In [62]:
from tqdm.auto import tqdm

In [70]:
# Independent synthetic splits: each dataset generates its own clips on construction.
train_data=WakeWordDataset(num_samples=1600)
test_data=WakeWordDataset(num_samples=320)

train_loader=DataLoader(train_data,batch_size=BATCH_SIZE,shuffle=True)
# Accuracy does not depend on batch order, so the evaluation loader is not
# shuffled; this also keeps "the first test batch" the same on every pass.
test_loader=DataLoader(test_data,batch_size=BATCH_SIZE,shuffle=False)

model=TinyKWS().to(DEVICE)
# TinyKWS.forward already applies a sigmoid, so plain BCE on probabilities is used.
criterion=nn.BCELoss()
optimizer=optim.Adam(model.parameters(),lr=1e-3)

# One training pass followed by a held-out accuracy check per epoch.
for epoch in tqdm(range(EPOCHS)):
  train_loss=train_model(model=model,loader=train_loader,criterion=criterion,optimizer=optimizer)
  test_acc=evaluate_model(model,test_loader)
  print(f"Epoch {epoch+1}/{EPOCHS}, Train Loss: {train_loss:.4f}, Test Acc: {test_acc:.4f}")

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1/10, Train Loss: 0.5066, Test Acc: 1.0000
Epoch 2/10, Train Loss: 0.1386, Test Acc: 1.0000
Epoch 3/10, Train Loss: 0.0222, Test Acc: 1.0000
Epoch 4/10, Train Loss: 0.0078, Test Acc: 1.0000
Epoch 5/10, Train Loss: 0.0040, Test Acc: 1.0000
Epoch 6/10, Train Loss: 0.0025, Test Acc: 1.0000
Epoch 7/10, Train Loss: 0.0017, Test Acc: 1.0000
Epoch 8/10, Train Loss: 0.0012, Test Acc: 1.0000
Epoch 9/10, Train Loss: 0.0009, Test Acc: 1.0000
Epoch 10/10, Train Loss: 0.0007, Test Acc: 1.0000


In [71]:
# Spot-check: print labels next to thresholded predictions for the first test batch.
model.eval()                      # inference mode, independent of whichever cell ran last
with torch.no_grad():             # no autograd graph is needed just to look at predictions
    for x, y in test_loader:
        x, y = x.to(DEVICE), y.to(DEVICE)
        # squeeze(1) drops only the output column; a bare squeeze() would turn a
        # one-sample batch into a 0-dim tensor, which cannot be sliced below.
        preds = (model(x).squeeze(1) > 0.5).float()
        print("Labels:", y[:10])
        print("Preds :", preds[:10])
        break

Labels: tensor([1., 1., 0., 0., 1., 1., 0., 1., 1., 0.], device='cuda:0')
Preds : tensor([1., 1., 0., 0., 1., 1., 0., 1., 1., 0.], device='cuda:0')
