In [3]:
import torch
from torch import Tensor
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as f
import numpy as np
import torchinfo
import math

# Select the compute device once at import time: CUDA when PyTorch reports
# it as available, otherwise the CPU. The choice is echoed to stdout.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

class AudioEmbedding(nn.Module):
    """Convolutional feature embedding with an additive sinusoidal position code.

    Five ``Conv2d -> Tanh -> MaxPool2d`` stages raise the channel count
    1 -> 32 -> 64 -> 128 -> 256 -> 512. Each pooling stage halves (with floor)
    the height axis of the input. The final feature map is rearranged so the
    pooled height axis comes before the channel axis, and a sinusoidal
    positional encoding of matching size is added to it.
    """

    def __init__(self):
        super().__init__()
        # Attribute names and creation order are kept as conv1..conv5 so that
        # state_dict keys and parameter initialisation order do not change.
        self.conv1 = self._conv_block(1, 32)
        self.conv2 = self._conv_block(32, 64)
        self.conv3 = self._conv_block(64, 128)
        self.conv4 = self._conv_block(128, 256)
        self.conv5 = self._conv_block(256, 512)

        self.drop = nn.Dropout(0.1)

    @staticmethod
    def _conv_block(in_channels: int, out_channels: int) -> nn.Sequential:
        """Build one ``Conv2d((3, 1), same) -> Tanh -> MaxPool2d((2, 1))`` stage."""
        return nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                      kernel_size=(3, 1), stride=1, padding="same"),
            nn.Tanh(),
            nn.MaxPool2d(kernel_size=(2, 1), stride=2),
        )

    def getPositionEncoding(self, rows, cols, n=10000):
        """Return a ``(rows, cols)`` sinusoidal positional-encoding matrix.

        For position ``k`` and pair index ``i`` in ``range(cols // 2)``::

            P[k, 2*i]     = sin(k / n ** (2*i / cols))
            P[k, 2*i + 1] = cos(k / n ** (2*i / cols))

        When ``cols`` is odd the last column is left at zero.

        Args:
            rows: Number of positions (matrix rows).
            cols: Encoding width (matrix columns).
            n: Base of the geometric progression of wavelengths.

        Returns:
            A default-dtype tensor of shape ``(rows, cols)`` on the CPU.
        """
        P = torch.zeros((rows, cols))
        half = cols // 2

        # Build the whole table with broadcasting instead of a Python loop
        # over every (row, column-pair) entry.
        positions = torch.arange(rows, dtype=P.dtype).unsqueeze(1)  # (rows, 1)
        exponents = 2 * torch.arange(half, dtype=P.dtype) / cols    # (half,)
        denominators = torch.pow(n, exponents)                      # (half,)
        angles = positions / denominators                           # (rows, half)

        # Even columns take the sine, odd columns the cosine; the explicit
        # stop index keeps a trailing odd column untouched.
        P[:, 0:2 * half:2] = torch.sin(angles)
        P[:, 1:2 * half:2] = torch.cos(angles)
        return P

    def forward(self, data: Tensor) -> Tensor:
        """Embed ``data`` and add the positional encoding.

        Args:
            data: Tensor of shape ``(N, 1, H, W)`` or unbatched ``(1, H, W)``.
                ``W`` must reduce to 1 after the five pooling stages
                (``W == 1`` in the intended use).

        Returns:
            ``(T, 512)`` for unbatched input or a batch of one sample, and
            ``(N, T, 512)`` for ``N > 1``, where ``T`` is ``H`` after five
            floor-halvings (e.g. 8000 -> 250).

        Raises:
            ValueError: If the trailing (width) axis of the convolutional
                feature map is not 1.
        """
        out_32_enc = self.conv1(data)
        out_64_enc = self.conv2(out_32_enc)
        out_128_enc = self.conv3(out_64_enc)
        out_256_enc = self.conv4(out_128_enc)
        out_512_enc = self.conv5(out_256_enc)

        out_drop = self.drop(out_512_enc)
        if out_drop.shape[-1] != 1:
            raise ValueError(
                "expected the feature map to have width 1 after pooling, "
                f"got shape {tuple(out_drop.shape)}"
            )

        # Drop only the width axis (a bare squeeze would also swallow a batch
        # or sequence axis of size 1), then put the sequence axis before the
        # channel axis: (..., C, T, 1) -> (..., T, C).
        conv_feature_embedding = out_drop.squeeze(-1).transpose(-1, -2)

        # Backward compatibility: a batch holding a single sample is returned
        # without its batch axis, exactly as before.
        if conv_feature_embedding.dim() == 3 and conv_feature_embedding.shape[0] == 1:
            conv_feature_embedding = conv_feature_embedding.squeeze(0)

        # Size the encoding from the actual feature map instead of assuming
        # 250 x 512, so other input lengths work too.
        seq_len, channels = conv_feature_embedding.shape[-2:]
        position_embedding = self.getPositionEncoding(seq_len, channels).to(
            device=conv_feature_embedding.device
        )
        return torch.add(conv_feature_embedding, position_embedding)

Using device: cuda
torch.Size([250, 512])


Layer (type:depth-idx)                   Output Shape              Param #
AudioEmbedding                           [250, 512]                --
├─Sequential: 1-1                        [1, 32, 4000, 1]          --
│    └─Conv2d: 2-1                       [1, 32, 8000, 1]          128
│    └─Tanh: 2-2                         [1, 32, 8000, 1]          --
│    └─MaxPool2d: 2-3                    [1, 32, 4000, 1]          --
├─Sequential: 1-2                        [1, 64, 2000, 1]          --
│    └─Conv2d: 2-4                       [1, 64, 4000, 1]          6,208
│    └─Tanh: 2-5                         [1, 64, 4000, 1]          --
│    └─MaxPool2d: 2-6                    [1, 64, 2000, 1]          --
├─Sequential: 1-3                        [1, 128, 1000, 1]         --
│    └─Conv2d: 2-7                       [1, 128, 2000, 1]         24,704
│    └─Tanh: 2-8                         [1, 128, 2000, 1]         --
│    └─MaxPool2d: 2-9                    [1, 128, 1000, 1]         --
├─Seque