# 2022/01/09 Memo & Tests

#### Aim
remember many forgetten architectures of torch and/or others..

#### Get & Add the path

In [1]:
import pathlib
import sys

from pathlib import Path


current_dir: Path = pathlib.Path().cwd().resolve()
project_root: Path = current_dir.parent
data_dir: Path = project_root / "data"

sys.path.append(str(project_root))


In [2]:
import torch
import torch.nn.functional as F


groups = 1
batches = 5
in_channels = 8
out_channels = 16
freq_axis = 128
time_axis = 1500
filters = torch.randn(out_channels, in_channels // groups, 3, 3)
inputs = torch.randn(batches, in_channels, freq_axis, time_axis)
outputs = F.conv2d(inputs, filters, padding=1, stride=(1, 1))

print(inputs.size())
print(outputs.size())

torch.Size([5, 8, 128, 1500])
torch.Size([5, 16, 128, 1500])


In [3]:
from corsmal_challenge.models.convolution import DepthWiseConv2d, PointWiseConv2d


batches = 5
in_channels = 8
expansion = 6
freq_axis = 128
time_axis = 1500

inputs = torch.randn(batches, in_channels, freq_axis, time_axis)


dconv = DepthWiseConv2d(
    in_channels,
    expansion,
    kernel_size=(5, 5),
    stride=(1, 5 - 1),
    bias=True,
    padding=2,
)
pconv = PointWiseConv2d(
    dconv.out_channels,
    dconv.out_channels * 2,
    bias=False,
)


outputs: torch.Tensor = dconv(inputs)
outputs2: torch.Tensor = pconv(outputs)

print(inputs.size())
print(outputs.size())
print(outputs2.size())

torch.Size([5, 8, 128, 1500])
torch.Size([5, 48, 128, 375])
torch.Size([5, 96, 128, 375])


In [4]:
import torchinfo
from corsmal_challenge.models.convolution import InvertedResBlock


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

batches = 5
in_channels = 8
expansion = 6
freq_axis = 128
time_axis = 1500

inputs = torch.randn(batches, in_channels, freq_axis, time_axis).to(device)

irb = InvertedResBlock(
    in_channels,
    bias=True,
    kernel_size=(3, 1),
    expansion=6,
)
irb2 = InvertedResBlock(
    in_channels,
    bias=True,
    kernel_size=(3, 1),
    expansion=6,
)

irb = irb.to(device)
irb2 = irb2.to(device)

outputs: torch.Tensor = irb2(irb(inputs))

print(inputs.size())
print(outputs.size())

torchinfo.summary(irb2, inputs_size=(batches, in_channels, freq_axis, time_axis))

torch.Size([5, 8, 128, 1500])
torch.Size([5, 8, 128, 1500])


Layer (type:depth-idx)                   Param #
InvertedResBlock                         --
├─BatchNorm2d: 1-1                       16
├─PointWiseConv2d: 1-2                   384
├─BatchNorm2d: 1-3                       96
├─DepthWiseConv2d: 1-4                   144
├─BatchNorm2d: 1-5                       96
├─PointWiseConv2d: 1-6                   392
Total params: 1,128
Trainable params: 1,128
Non-trainable params: 0

In [5]:
import torchinfo

from corsmal_challenge.models.convolution import LightCNNEncoder

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

batches = 8
in_channels = 8
expansion = 6
freq_axis = 128
time_axis = 1500
# time_axis = 512

inputs = torch.randn(batches, in_channels, freq_axis, time_axis)
inputs = inputs.to(device)

encoder = LightCNNEncoder(in_channels=in_channels)
encoder = encoder.to(device)

outputs: torch.Tensor = encoder(inputs)

print(inputs.size())
print(outputs.size())

torchinfo.summary(encoder, inputs_size=(batches, in_channels, freq_axis, time_axis))


torch.Size([8, 8, 128, 1500])
torch.Size([8, 1, 256])


Layer (type:depth-idx)                   Param #
LightCNNEncoder                          --
├─SquaredReLU: 1-1                       --
├─BatchNorm2d: 1-2                       16
├─DepthWiseConv2d: 1-3                   72
├─BatchNorm2d: 1-4                       16
├─PointWiseConv2d: 1-5                   144
├─Sequential: 1-6                        --
│    └─InvertedResBlock: 2-1             --
│    │    └─BatchNorm2d: 3-1             32
│    │    └─PointWiseConv2d: 3-2         1,536
│    │    └─BatchNorm2d: 3-3             192
│    │    └─DepthWiseConv2d: 3-4         864
│    │    └─BatchNorm2d: 3-5             192
│    │    └─PointWiseConv2d: 3-6         1,536
│    └─InvertedResBlock: 2-2             --
│    │    └─BatchNorm2d: 3-7             32
│    │    └─PointWiseConv2d: 3-8         1,536
│    │    └─BatchNorm2d: 3-9             192
│    │    └─DepthWiseConv2d: 3-10        864
│    │    └─BatchNorm2d: 3-11            192
│    │    └─PointWiseConv2d: 3-12        1,536
│    └─I

In [6]:
import torch
from torch import nn
from torch.nn import functional as F

from corsmal_challenge.models.transformer import MultiheadedSelfAttention

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

batches = 7
seq_len = 1000
embed_dim = 144
num_heads = 12


a = torch.randn(batches, seq_len, embed_dim)
mha = MultiheadedSelfAttention(embed_dim, num_heads)
a = a.to(device)
mha = mha.to(device)


b: torch.Tensor = mha(a)
print(f"a.size(): {a.size()}")
print(f"b.size(): {b.size()}")

torchinfo.summary(mha, inputs_size=(batches, seq_len, embed_dim))

a.size(): torch.Size([7, 1000, 144])
b.size(): torch.Size([7, 1000, 144])


Layer (type:depth-idx)                   Param #
MultiheadedSelfAttention                 --
├─Linear: 1-1                            62,640
├─Dropout: 1-2                           --
├─Linear: 1-3                            20,880
├─Dropout: 1-4                           --
Total params: 83,520
Trainable params: 83,520
Non-trainable params: 0

In [7]:
class PositionalEmbedding(nn.Module):
    def __init__(
        self,
        embed_dim,
        max_len=8000,
        freq=16000.0,
    ):
        import math
        super(PositionalEmbedding, self).__init__()
        pe = torch.zeros(max_len, embed_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div = torch.exp(torch.arange(0, embed_dim, 2).float() * (-math.log(freq) / embed_dim))
        pe[:, 0::2] = torch.sin(position * div)
        pe[:, 1::2] = torch.cos(position * div)
        pe: torch.Tensor = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer("pe", pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.pe[: x.size(0), :]
        return x


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

batches = 7
seq_len = 1000
embed_dim = 144
num_heads = 12
a = torch.randn(batches, seq_len, embed_dim).to(device)
pose = PositionalEmbedding(embed_dim).to(device)

b = pose(a)


print(f"a.size(): {a.size()}")
print(f"b.size(): {b.size()}")

torchinfo.summary(pose, inputs_size=(batches, seq_len, embed_dim))

a.size(): torch.Size([7, 1000, 144])
b.size(): torch.Size([7, 1000, 144])


Layer (type:depth-idx)                   Param #
PositionalEmbedding                      --
Total params: 0
Trainable params: 0
Non-trainable params: 0

In [8]:
import torch
from torch import nn
from torch.nn import functional as F

from corsmal_challenge.models.transformer import TransformerEncoder

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

batches = 7
seq_len = 1000
embed_dim = 128
num_heads = 8


a = torch.randn(batches, seq_len, embed_dim)
mha = TransformerEncoder(6, embed_dim, num_heads)
a = a.to(device)
mha = mha.to(device)


b: torch.Tensor = mha(a)
print(f"a.size(): {a.size()}")
print(f"b.size(): {b.size()}")

torchinfo.summary(mha, inputs_size=(batches, seq_len, embed_dim))

a.size(): torch.Size([7, 1000, 128])
b.size(): torch.Size([7, 1001, 128])


Layer (type:depth-idx)                             Param #
TransformerEncoder                                 --
├─PositionalEncoding: 1-1                          --
├─MultiheadedSelfAttention: 1-2                    --
│    └─Linear: 2-1                                 49,536
│    └─Dropout: 2-2                                --
│    └─Linear: 2-3                                 16,512
│    └─Dropout: 2-4                                --
├─Sequential: 1-3                                  --
│    └─TransformerEncoderBlock: 2-5                --
│    │    └─LayerNorm: 3-1                         256
│    │    └─MultiheadedSelfAttention: 3-2          66,048
│    │    └─FFN: 3-3                               65,920
│    └─TransformerEncoderBlock: 2-6                --
│    │    └─LayerNorm: 3-4                         256
│    │    └─MultiheadedSelfAttention: 3-5          66,048
│    │    └─FFN: 3-6                               65,920
│    └─TransformerEncoderBlock: 2-7                

In [9]:
import torch
from torch import nn
from torch.nn import functional as F

from corsmal_challenge.models.transformer import TransformerEncoder
from corsmal_challenge.models.audio import LogMelEncoder

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

batches = 7
seq_len = 1000
in_channels = 8
embed_dim = 128
num_heads = 4
num_layers = 6


# a = torch.randn(batches, seq_len, embed_dim)
# classifier = TransformerEncoder(num_layers, embed_dim, num_heads)
a = torch.randn(batches, in_channels, seq_len, embed_dim)
classifier = LogMelEncoder(in_channels, num_layers, embed_dim, num_heads)
a = a.to(device)
classifier = classifier.to(device)


b: torch.Tensor = classifier(a)
print(f"a.size(): {a.size()}")
print(f"b.size(): {b.size()}")

torchinfo.summary(classifier, inputs_size=a.shape)

a.size(): torch.Size([7, 8, 1000, 128])
b.size(): torch.Size([7, 1001, 128])


Layer (type:depth-idx)                                  Param #
LogMelEncoder                                           --
├─BatchNorm2d: 1-1                                      16
├─DepthWiseConv2d: 1-2                                  416
├─PointWiseConv2d: 1-3                                  17
├─TransformerEncoder: 1-4                               --
│    └─PositionalEncoding: 2-1                          --
│    └─MultiheadedSelfAttention: 2-2                    --
│    │    └─Linear: 3-1                                 49,536
│    │    └─Dropout: 3-2                                --
│    │    └─Linear: 3-3                                 16,512
│    │    └─Dropout: 3-4                                --
│    └─Sequential: 2-3                                  --
│    │    └─TransformerEncoderBlock: 3-5                132,224
│    │    └─TransformerEncoderBlock: 3-6                132,224
│    │    └─TransformerEncoderBlock: 3-7                132,224
│    │    └─TransformerEnco

In [10]:
import torchinfo
import torch
from torch import nn
from torch.nn import functional as F

from corsmal_challenge.models.audio import LogMelEncoder  # noqa (E402)
from corsmal_challenge.models.task1_2 import T1Head, T2Head  # noqa (E402)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

batches = 1
seq_len = 500
in_channels = 8
embed_dim = 128
num_heads = 4
num_layers = 4

class TaskChallenger(nn.Module):
    def __init__(self, task_id: int = 1):
        super(TaskChallenger, self).__init__()
        self.task_id = task_id
        self.encoder = LogMelEncoder()
        self.classify_head1 = T1Head()
        self.classify_head2 = T2Head()

    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        x: torch.Tensor = self.encoder(inputs)
        if self.task_id == 1:
            x = self.classify_head1(x[:, 0, :])  # extract embedding of class token
        elif self.task_id == 2:
            x = self.classify_head2(x[:, 0, :])  # extract embedding of class token
        x = x.squeeze(1)
        return x

# a = torch.randn(batches, seq_len, embed_dim)
# model = LiT(num_layers, embed_dim, num_heads)
a = torch.randn(batches, in_channels, seq_len, embed_dim)
# model = LogMelEncoder(in_channels, num_layers, embed_dim, num_heads)
model = TaskChallenger(2).to(device)
a = a.to(device)
model = model.to(device)


b: torch.Tensor = model(a)
print(f"a.size(): {a.size()}")
print(f"b.size(): {b.size()}")

torchinfo.summary(model, inputs_size=a.shape)

a.size(): torch.Size([1, 8, 500, 128])
b.size(): torch.Size([1, 4])


Layer (type:depth-idx)                                       Param #
TaskChallenger                                               --
├─LogMelEncoder: 1-1                                         --
│    └─BatchNorm2d: 2-1                                      16
│    └─DepthWiseConv2d: 2-2                                  416
│    └─PointWiseConv2d: 2-3                                  17
│    └─TransformerEncoder: 2-4                               --
│    │    └─PositionalEncoding: 3-1                          --
│    │    └─MultiheadedSelfAttention: 3-2                    66,048
│    │    └─Sequential: 3-3                                  661,120
├─T1Head: 1-2                                                --
│    └─Linear: 2-5                                           4,128
│    └─Linear: 2-6                                           99
├─T2Head: 1-3                                                --
│    └─Linear: 2-7                                           4,128
│    └─Linear: 2-8 

In [11]:
%%timeit
model(a)

2.24 ms ± 22.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
