In [1]:
import torch
import torch.nn as nn


class Attention(nn.Module):
    """Attention pooling that collapses axis 1 of its input.

    A small scoring network (``Linear -> Tanh`` followed by a bias-free
    ``Linear`` with a single output) assigns one score to every position along
    axis 1. The scores are turned into weights with a softmax over that axis and
    the input is summed along it using those weights.

    Args:
        input_size: Size of the last axis of the tensors given to ``forward``.
        hidden_size: Width of the hidden layer of the scoring network.
    """

    def __init__(self, input_size: int, hidden_size: int):
        super().__init__()
        # Scoring network, first stage: project to the hidden width and squash.
        self.fc = nn.Sequential(nn.Linear(input_size, hidden_size), nn.Tanh())
        # Scoring network, second stage: one un-biased score per position.
        self.linear = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the attention-weighted sum of ``x`` over axis 1.

        Args:
            x: Tensor whose last axis has size ``input_size``. KWSNet passes the
                batch-first GRU output here, i.e. ``(batch, time, features)``.

        Returns:
            ``x`` with axis 1 summed out, e.g. ``(batch, features)``.
        """
        scores = self.linear(self.fc(x))
        weights = scores.softmax(dim=1)  # normalise across positions
        return torch.sum(x * weights, dim=1)

class KWSNet(nn.Module):
    """Keyword-spotting network: Conv1d -> bidirectional GRU -> attention -> linear.

    Args:
        params: Mapping that provides the integer entries ``num_features``,
            ``cnn_channels``, ``cnn_kernel_size``, ``gru_hidden_size`` and
            ``attention_hidden_size``. It is also stored as ``self.params``.
    """

    def __init__(self, params):
        super().__init__()
        self.params = params
        kernel_size = params["cnn_kernel_size"]
        # The GRU is bidirectional, so its output has twice the hidden size.
        rnn_features = params["gru_hidden_size"] * 2
        self.cnn = nn.Sequential(
            # padding = kernel_size // 2 keeps the time length unchanged for odd kernel sizes.
            nn.Conv1d(params["num_features"], params["cnn_channels"],
                      kernel_size=kernel_size, padding=kernel_size // 2),
            nn.ReLU(),
        )
        self.rnn = nn.GRU(input_size=params["cnn_channels"], hidden_size=params["gru_hidden_size"],
                          bidirectional=True, batch_first=True)
        self.attention = Attention(rnn_features, params["attention_hidden_size"])
        # Two output classes (1 keyword); `inference` reports the probability of class index 1.
        self.linear = nn.Linear(rnn_features, 1 + 1, bias=False)

    def _logits(self, x):
        """Run the full stack and return the un-normalised class scores, shape (batch, 2)."""
        # Conv1d yields (batch, channels, time); the batch-first GRU wants (batch, time, channels).
        features = self.cnn(x).permute(0, 2, 1)
        rnn_output, _ = self.rnn(features)
        return self.linear(self.attention(rnn_output))

    def forward(self, x):
        """Return per-class log-probabilities.

        Args:
            x: Input of shape ``(batch, num_features, time)`` as required by the
                leading ``nn.Conv1d``.

        Returns:
            Tensor of shape ``(batch, 2)`` holding ``log_softmax`` over the classes.
        """
        return torch.log_softmax(self._logits(x), dim=1)

    def inference(self, x: torch.Tensor, window_size: int, step: int = 50):
        """Slide a window over the time axis and score each position.

        Every window is processed independently: no recurrent state is carried
        from one window to the next. Gradients are not tracked.

        Args:
            x: Input of shape ``(1, num_features, time)``; exactly one example.
            window_size: Window length in time steps. Clamped to the length of
                ``x`` when it is larger.
            step: Hop between consecutive window ends, in time steps
                (default 50, the value that used to be hard-coded).

        Returns:
            List of floats: the softmax probability of class index 1 for each
            window, ordered by window position. Trailing time steps that do not
            fill a complete hop are not scored.

        Raises:
            ValueError: If ``x`` holds more than one example or ``step < 1``.
        """
        if x.shape[0] != 1:
            raise ValueError(
                f"inference expects a single example (batch size 1), got batch size {x.shape[0]}"
            )
        if step < 1:
            raise ValueError(f"step must be a positive integer, got {step}")

        num_steps = x.shape[2]
        window_size = min(window_size, num_steps)
        probs = []
        # Only Python floats are returned, so building an autograd graph would be wasted work.
        with torch.no_grad():
            for end in range(window_size, num_steps + 1, step):
                window = x[:, :, end - window_size:end]
                class_probs = torch.softmax(self._logits(window), dim=1)
                probs.append(class_probs[0, 1].item())
        return probs

In [4]:
# Hyper-parameter dictionary for KWSNet and its training setup.
params = {
    # --- read by KWSNet.__init__ ---
    "num_features": 40,            # in_channels of the first Conv1d
    "cnn_channels": 16,            # Conv1d out_channels == GRU input_size
    "cnn_kernel_size": 51,
    "gru_hidden_size": 64,         # per direction; the GRU is bidirectional
    "attention_hidden_size": 64,
    # --- not read by any code visible in this notebook ---
    "window_size": 100,
    "batch_size": 64,
    "num_workers": 8,
    "lr": 0.001,
    "sample_rate": 16000,
    "num_epochs": 10,
    "noise_variance": 0.05,
    "min_time_stretch": 0.9,
    "max_time_stretch": 1.1,
    "min_shift": -3,
    "max_shift": 3,
    "time_masking": 1,
    "wandb_name": "KWSNet",
    "clip_grad_norm": 15,
    "vocab_size": 120,
    "from_pretrained": False,
    "model_path": "kws_model.pth",
    "start_epoch": 40,
    "path_to_file": "test_double.wav",
}

In [6]:
from torchsummary import summary
# Build the keyword-spotting network from the `params` hyper-parameter dict.
model = KWSNet(params)

In [40]:
# Layer-by-layer summary of KWSNet. The printed label and the shape handed to
# torchsummary come from the same variable, so they can no longer disagree
# (the label used to claim 801 time steps while the trace used a single one).
# NOTE(review): the trace still uses ONE time step, as before; set the second
# entry to the real spectrogram length (e.g. 801) for realistic activation sizes.
summary_input_shape = (params['num_features'], 1)  # (channels, time), without the batch axis
print(f"input shape: (1,{summary_input_shape[0]},{summary_input_shape[1]})")
summary(model, summary_input_shape)

input shape: (1,40,801)
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv1d-1                [-1, 16, 1]          32,656
              ReLU-2                [-1, 16, 1]               0
               GRU-3  [[-1, 1, 128], [-1, 2, 64]]               0
            Linear-4                [-1, 1, 64]           8,256
              Tanh-5                [-1, 1, 64]               0
            Linear-6                 [-1, 1, 1]              64
         Attention-7                  [-1, 128]               0
            Linear-8                    [-1, 2]             256
Total params: 41,232
Trainable params: 41,232
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.12
Params size (MB): 0.16
Estimated Total Size (MB): 0.28
----------------------------------------------------------------


In [32]:
# Smoke-test the forward pass. The leading Conv1d reads its input as
# (batch, channels, length), so this tensor is a batch of 801 examples with
# 40 channels and a single time step each; the result has one row per example.
# NOTE(review): if 801 was meant to be the time axis, the shape should be (1, 40, 801) - confirm.
model(torch.zeros(801,40,1)).shape

torch.Size([801, 2])

In [41]:
# NOTE(review): scratch arithmetic; the notebook does not say what 40 and 0.25 stand for - TODO label or remove.
40*0.25

10.0

# Build the network described in this paper:
https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43969.pdf

In [42]:
import torch
import torch.nn as nn

In [43]:
# All-zero dummy input for shape experiments.
# NOTE(review): n, t, f presumably stand for batch size, time steps and frequency bins - confirm.
n, t, f = 1, 32, 40
X = torch.zeros((n, t, f))

In [44]:
# Conv1d shape check: 16 -> 33 channels, kernel 3, stride 2.
# NOTE: `input` shadows the Python builtin of the same name for the rest of the session.
m = nn.Conv1d(in_channels=16, out_channels=33, kernel_size=3, stride=2)
input = torch.randn((20, 16, 50))
output = m(input)

# Length 50 becomes floor((50 - 3) / 2) + 1 = 24.
output.shape

torch.Size([20, 33, 24])

# TODO: try adding batch normalization and dropout

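Reference paper, "Trigger Word Recognition using LSTM" (local copy; this path only resolves on the author's machine): file:///C:/Users/AT030915/Downloads/Trigger_Word_Recognition_using_LSTM-1.pdf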


In [49]:
class TriggerWord_LSTM2(nn.Module):
    '''
    Trigger word detector - based on paper Trigger_Word_Recognition_using_LSTM.

    Layer stack: Conv1d -> BatchNorm1d -> ReLU -> Dropout -> GRU -> Linear -> Sigmoid.
    Note that, despite the class name, the recurrent layer is an ``nn.GRU``.

    The Conv1d output, shaped (batch, out_channels, conv_time), is fed to the GRU
    as-is, so the GRU's feature size is the convolved time length and (with
    ``batch_first=True``) its sequence axis is the channel axis.
    '''

    def __init__(self, input_freq, input_time, hidden_time, output_time, Conv_p, GRU_p, p_drop=0.8):
        '''
        Create layers of the neural network - note freq/time denote the sizes of
        the 1st and 2nd dimensions respectively.

        Args:
            input_freq: Number of input channels of the Conv1d (size of axis 1).
            input_time: Length of the input's time axis (size of axis 2). The GRU
                input size is derived from it, so ``forward`` only accepts inputs
                of exactly this length.
            hidden_time: Hidden size of the GRU.
            output_time: Number of outputs of the final linear layer.
            Conv_p: Object with attributes ``out_channels``, ``kernel_size`` and ``stride``.
            GRU_p: Object with attributes ``num_layers``, ``batch_first`` and ``dropout``.
            p_drop: Dropout probability applied after the ReLU (default 0.8, the
                value that used to be hard-coded).
        '''
        super().__init__()

        # save parameters here
        self.input_freq = input_freq
        self.input_time = input_time
        self.hidden_time = hidden_time
        self.output_time = output_time
        self.Conv_p = Conv_p
        self.GRU_p = GRU_p
        self.p_drop = p_drop

        # CONV1D
        self.Conv = nn.Conv1d(in_channels=input_freq,
                              out_channels=Conv_p.out_channels,
                              kernel_size=Conv_p.kernel_size,
                              stride=Conv_p.stride)
        # BatchNorm1d normalises per channel of a (batch, channels, length) input,
        # so its feature count must equal the number of Conv1d output channels.
        self.batch = nn.BatchNorm1d(num_features=Conv_p.out_channels)
        self.ReLU = nn.ReLU()
        self.Dropout = nn.Dropout(p=self.p_drop)

        # GRU
        # Find the length of the conv output's last axis by pushing a dummy tensor
        # through the layer; no gradients are needed for this shape probe.
        with torch.no_grad():
            Conv_outsize = self.Conv(torch.ones([1, self.input_freq, self.input_time])).shape[2]

        self.GRU = nn.GRU(input_size=Conv_outsize,
                          hidden_size=hidden_time,
                          num_layers=GRU_p.num_layers,
                          batch_first=GRU_p.batch_first,
                          dropout=GRU_p.dropout)

        # DENSE
        self.Dense = nn.Linear(in_features=hidden_time, out_features=output_time)

        # Sigmoid layer
        self.Sigmoid = nn.Sigmoid()

    def forward(self, xb):
        '''
        Apply the layers to the batch input.

        Args:
            xb: Tensor of shape (batch, input_freq, input_time).

        Returns:
            Sigmoid activations. Axis 1 is dropped when it has size 1, giving
            (batch, output_time) for ``out_channels == 1`` with ``batch_first=True``;
            otherwise the output keeps all three axes.
        '''
        out = self.Conv(xb)

        # batch norm, then ReLU, then dropout
        out = self.batch(out)
        out = self.ReLU(out)
        out = self.Dropout(out)

        out, hidden_state = self.GRU(out)

        out = self.Dense(out).squeeze(1)  # remove 1 singleton dimension - not the batch dimension

        out = self.Sigmoid(out)

        return out
        
    
def get_accuracy(y_true, y_prob, cutoff=0.8):
    """Fraction of samples whose thresholded score equals the label.

    Args:
        y_true: Tensor of labels. Singleton axes are squeezed away; what remains
            must be one-dimensional (a single sample is accepted as well).
        y_prob: Tensor of scores with the same squeezed shape as ``y_true``.
        cutoff: A sample is predicted positive when its score is strictly
            greater than this value.

    Returns:
        Python float in [0, 1]: number of matches divided by number of samples.

    Raises:
        ValueError: If the squeezed inputs are not one-dimensional, differ in
            size, or are empty.
    """
    y_true = y_true.squeeze()
    y_prob = y_prob.squeeze()

    # squeeze() collapses a single-sample input such as shape (1,) or (1, 1)
    # into a 0-dim tensor; give it back its sample axis.
    if y_true.ndim == 0:
        y_true = y_true.reshape(1)
    if y_prob.ndim == 0:
        y_prob = y_prob.reshape(1)

    # Explicit checks instead of `assert`, which is stripped under `python -O`.
    if y_true.ndim != 1 or y_true.size() != y_prob.size():
        raise ValueError(
            "y_true and y_prob must squeeze to 1-D tensors of equal length, got "
            f"{tuple(y_true.shape)} and {tuple(y_prob.shape)}"
        )
    if y_true.size(0) == 0:
        raise ValueError("cannot compute accuracy of an empty batch")

    y_pred = y_prob > cutoff
    return (y_true == y_pred).sum().item() / y_true.size(0)


In [53]:
#### set devices 
class Params:
    """Plain container for the run settings.

    Every constructor argument is stored unchanged as an attribute of the same
    name; nothing is validated or converted.
    """

    def __init__(
        self,
        batch_size,
        test_batch_size,
        number_frequencies,
        number_time_steps,
        epochs,
        lr,
        seed,
        cuda,
        log_interval,
        early_stopper_patience,
        early_stopper_min_delta,
        label_time,
        cutoff,
    ):
        """Store the settings.

        Names are self-explanatory; per the original note, ``seed`` is the
        random seed number and ``log_interval`` is the interval at which the
        weights and biases will be recorded.
        """
        # batching
        self.batch_size, self.test_batch_size = batch_size, test_batch_size
        # optimisation schedule
        self.epochs, self.lr = epochs, lr
        # input dimensions
        self.number_frequencies, self.number_time_steps = number_frequencies, number_time_steps
        # run control
        self.seed, self.cuda, self.log_interval = seed, cuda, log_interval
        # early stopping
        self.early_stopper_patience = early_stopper_patience
        self.early_stopper_min_delta = early_stopper_min_delta
        # labels / decision threshold
        self.label_time, self.cutoff = label_time, cutoff

# Settings used for this run, one keyword per line.
args = Params(
    batch_size=4,
    test_batch_size=4,
    number_frequencies=151,
    number_time_steps=400,
    epochs=200,
    lr=0.01,
    seed=1,
    cuda=False,
    log_interval=200,
    early_stopper_patience=5,
    early_stopper_min_delta=0.01,
    label_time=801,  # changed to same size as input time dimension of spectrogram.
    cutoff=0.1,
)

In [54]:
#model.Con
model= TriggerWord_LSTM2(params)
input_freq = 151
input_time  = 801
print(f"input shape: (1,{input_freq},{input_time})")  
summary(model, (input_freq,input_time))     


TypeError: TriggerWord_LSTM2.__init__() missing 5 required positional arguments: 'input_time', 'hidden_time', 'output_time', 'Conv_p', and 'GRU_p'

In [48]:
# With Learnable Parameters
m = nn.BatchNorm2d(1)
# Without Learnable Parameters
m = nn.BatchNorm2d(100, affine=False)
input = torch.randn(20, 100, 35, 45)
output = m(input)

output.shape

torch.Size([20, 100, 35, 45])