In [2]:
%ls

exp.ipynb  [0m[01;34mlibs[0m/  [01;34mscripts[0m/  train.py  [01;34mutils[0m/  [01;34mwandb[0m/


In [1]:
import numpy as np
import pandas as pd

from libs.dataset import BirdClefDataset
from libs.models import BirdNet

  warn(f"Failed to load image Python extension: {e}")


In [7]:
train_files = np.load("../csv/train_files.npy")
train_files.shape
train_files[0]

'../dataset/logmel/eubeat1/XC337595.npy'

In [6]:
train_sample = np.load("../dataset/logmel/abethr1/XC128013.npy")
train_sample.shape

(9, 128, 313)

In [15]:
val_files = np.load("csv/val_files.npy")
val_files.shape

(3284,)

In [16]:
ss = pd.read_csv('data/sample_submission.csv')
birds = list(ss.columns[1:])
bird_label_map = {birds[i]:i for i in range(len(birds))}

train_data = BirdClefDataset(files=train_files, transform=None, bird_label_map=bird_label_map, split='train')

In [5]:
from torchinfo import summary


In [8]:
m = BirdNet()
batch_size = 16
summary(model=m, input_size=(batch_size, 3, 128, 313))

Layer (type:depth-idx)                             Output Shape              Param #
BirdNet                                            [16, 264]                 --
├─EfficientNet: 1-1                                [16, 264]                 --
│    └─Conv2dSame: 2-1                             [16, 32, 64, 157]         864
│    └─BatchNormAct2d: 2-2                         [16, 32, 64, 157]         64
│    │    └─Identity: 3-1                          [16, 32, 64, 157]         --
│    │    └─SiLU: 3-2                              [16, 32, 64, 157]         --
│    └─Sequential: 2-3                             [16, 320, 4, 10]          --
│    │    └─Sequential: 3-3                        [16, 16, 64, 157]         1,448
│    │    └─Sequential: 3-4                        [16, 24, 32, 79]          16,714
│    │    └─Sequential: 3-5                        [16, 40, 16, 40]          46,640
│    │    └─Sequential: 3-6                        [16, 80, 8, 20]           242,930
│    │    └─Sequen

In [9]:
m.in_features

1280

In [3]:
import timm
import torch
import torch.nn as nn
from torchinfo import summary

In [4]:
model_name = 'tf_efficientnet_b0_ns'
m = timm.create_model(model_name, pretrained=True, num_classes=0)

In [7]:
batch_size = 16
summary(model=m, input_size=(batch_size, 3, 128, 313))

Layer (type:depth-idx)                        Output Shape              Param #
EfficientNet                                  [16, 1280]                --
├─Conv2dSame: 1-1                             [16, 32, 64, 157]         864
├─BatchNormAct2d: 1-2                         [16, 32, 64, 157]         64
│    └─Identity: 2-1                          [16, 32, 64, 157]         --
│    └─SiLU: 2-2                              [16, 32, 64, 157]         --
├─Sequential: 1-3                             [16, 320, 4, 10]          --
│    └─Sequential: 2-3                        [16, 16, 64, 157]         --
│    │    └─DepthwiseSeparableConv: 3-1       [16, 16, 64, 157]         1,448
│    └─Sequential: 2-4                        [16, 24, 32, 79]          --
│    │    └─InvertedResidual: 3-2             [16, 24, 32, 79]          6,004
│    │    └─InvertedResidual: 3-3             [16, 24, 32, 79]          10,710
│    └─Sequential: 2-5                        [16, 40, 16, 40]          --
│    │   

In [15]:
class MaxPool(nn.Module):
    def __init__(self, height, width):
        super(MaxPool, self).__init__()
        #self.pool = nn.MaxPool2d((height, width))
        self.pool = nn.MaxPool2d(height)

    def forward(self, x):
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        return x

maxpool = MaxPool(128)
batch_size = 16
summary(model=maxpool, input_size=(batch_size, 3, 128, 313))


Layer (type:depth-idx)                   Output Shape              Param #
MaxPool                                  [16, 3]                   --
├─MaxPool2d: 1-1                         [16, 3, 1, 1]             --
Total params: 0
Trainable params: 0
Non-trainable params: 0
Total mult-adds (M): 0
Input size (MB): 7.69
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 7.69

In [26]:
class BirdNetwMaxpool(nn.Module):
    def __init__(self, model_name:str = 'tf_efficientnet_b0_ns', pretrained:bool = True, output_dim = 264, part_size = 6) -> None:
        super().__init__()
        self.backbone = timm.create_model(model_name, pretrained=pretrained, num_classes=0)
        #print(vars(self.backbone))
        self.in_features = self.backbone.num_features

        self.part_size = part_size
        self.pool = nn.MaxPool1d(part_size)
        self.classifier = nn.Linear(self.in_features, output_dim)
        
    def forward(self, x):
        """
        Returns tensor (bs, num_classes)

        Argument:
        x - tensor (bs, part_size, time_segment, freq)
        """
        
        x = x.view((x.shape[0]*x.shape[1], x.shape[2], x.shape[3], x.shape[4])) #(bs, part_size, 9, time_segment, freq)→(bs*part_size, time_segment, freq)
        x = self.backbone(x) #(bs*part_size, time_segment, freq)→(bs*part_size, in_features)
        x = x.view((-1, self.in_features, self.part_size)) #(bs*part_size, in_features) → (bs, in_features, part_size)
        x = self.pool(x) #(bs, in_features, part_size) → (bs, in_features, 1)
        x = x.view(x.shape[0], -1) #(bs, in_features, 1) → (bs, in_features)
        clipwise_logits = self.classifier(x) #(bs, in_features) → (bs, out_size)

        output_dict = {
            "logit": clipwise_logits, # (batch_size, out_dim)
            'clipwise_output': nn.Softmax(dim = -1)(clipwise_logits)
        }
        return output_dict

In [27]:
model = BirdNetwMaxpool()

batch_size = 16
part_size = 6
inputs = torch.ones((batch_size, part_size, 3, 128, 313))

output_dict = model(inputs)
output_dict['logit'].shape

torch.Size([96, 3, 128, 313])
torch.Size([96, 1280])
torch.Size([16, 1280, 6])
torch.Size([16, 1280, 1])


torch.Size([16, 264])

In [None]:
maxpool = MaxPool(128)
batch_size = 16
summary(model=maxpool, input_size=(batch_size, 3, 128, 313))

In [2]:
from libs.models.BirdNet_SED import *

In [7]:
class BirdNet_SED_(nn.Module):
    def __init__(self, model_name:str = 'tf_efficientnet_b0_ns', pretrained:bool = True, output_dim = 264) -> None:
        super().__init__()
        self.backbone = timm.create_model(model_name, pretrained=pretrained)
        self.in_features = self.backbone.classifier.in_features
        self.backbone.classifier = nn.Sequential(
            nn.Linear(self.in_features, output_dim)
        )
        
        self.bn0 = nn.BatchNorm2d(128)

        layers = list(self.backbone.children())[:-2]
        self.encoder = nn.Sequential(*layers)

        self.fc1 = nn.Linear(self.in_features, self.in_features, bias=True)
        self.att_block = AttBlockV2(
            self.in_features, output_dim, activation="sigmoid")
        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(time_drop_width=2, time_stripes_num=2,
                                               freq_drop_width=2, freq_stripes_num=2)        
        self.init_weight()
        
    def init_weight(self):
        init_layer(self.fc1)
        init_bn(self.bn0)
        
    def forward(self, x):
        # (batch_size, 3, mel_bins, time_steps)
        frames_num = x.shape[3]

        # if self.training:
        #     x = self.spec_augmenter(x)
        
        # (batch_size, channels, freq, frames)
        x = self.encoder(x)

        x = torch.mean(x, dim=2) # (batch_size, channels, frames)

        # channel smoothing
        x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
        x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
        x = x1 + x2

        x = F.dropout(x, p=0.5, training=self.training)
        x = x.transpose(1, 2)
        x = F.relu_(self.fc1(x))
        x = x.transpose(1, 2)
        x = F.dropout(x, p=0.5, training=self.training)
        print(x.shape)
        (clipwise_output, norm_att, segmentwise_output) = self.att_block(x)
        print(norm_att.shape)
        
        logit = torch.sum(norm_att * self.att_block.cla(x), dim=2)
        segmentwise_logit = self.att_block.cla(x).transpose(1, 2)
        segmentwise_output = segmentwise_output.transpose(1, 2)
        
        interpolate_ratio = frames_num // segmentwise_output.size(1)

        # Get framewise output
        framewise_output = interpolate(segmentwise_output,
                                       interpolate_ratio)
        framewise_output = pad_framewise_output(framewise_output, frames_num)

        framewise_logit = interpolate(segmentwise_logit, interpolate_ratio)
        framewise_logit = pad_framewise_output(framewise_logit, frames_num)

        output_dict = {
            "framewise_output": framewise_output, # (batch_size, time_steps, out_dim)
            "segmentwise_output": segmentwise_output, # (batch_size, 4 よくわからん, out_dim])
            "logit": logit, # (batch_size, out_dim)
            "framewise_logit": framewise_logit, # (batch_size, time_steps, out_dim)
            "clipwise_output": clipwise_output # (batch_size, out_dim)
        }

        return output_dict

In [8]:
m = BirdNet_SED_()
inputs = torch.rand(16, 3, 128, 313)
outputs = m(inputs)

torch.Size([16, 1280, 10])
torch.Size([16, 264, 10])


In [3]:
from libs.models.BirdNet_taxonomy import BirdNet_taxonomy
import torch

In [5]:
model = BirdNet_taxonomy()

inputs = torch.rand(16, 3, 128, 313)
outputs = model(inputs)

In [8]:
s_out = outputs['species']
s_out['framewise_output']

tensor([[[0.0155, 0.5371, 0.0241,  ..., 0.5942, 0.9891, 0.9583],
         [0.0155, 0.5371, 0.0241,  ..., 0.5942, 0.9891, 0.9583],
         [0.0155, 0.5371, 0.0241,  ..., 0.5942, 0.9891, 0.9583],
         ...,
         [0.1822, 0.3524, 0.2304,  ..., 0.4277, 0.4316, 0.4074],
         [0.1822, 0.3524, 0.2304,  ..., 0.4277, 0.4316, 0.4074],
         [0.1822, 0.3524, 0.2304,  ..., 0.4277, 0.4316, 0.4074]],

        [[0.5116, 0.9691, 0.7613,  ..., 0.3462, 0.8400, 0.9572],
         [0.5116, 0.9691, 0.7613,  ..., 0.3462, 0.8400, 0.9572],
         [0.5116, 0.9691, 0.7613,  ..., 0.3462, 0.8400, 0.9572],
         ...,
         [0.3667, 0.2811, 0.7408,  ..., 0.6989, 0.8033, 0.6708],
         [0.3667, 0.2811, 0.7408,  ..., 0.6989, 0.8033, 0.6708],
         [0.3667, 0.2811, 0.7408,  ..., 0.6989, 0.8033, 0.6708]],

        [[0.1613, 0.5800, 0.2354,  ..., 0.8117, 0.8698, 0.2846],
         [0.1613, 0.5800, 0.2354,  ..., 0.8117, 0.8698, 0.2846],
         [0.1613, 0.5800, 0.2354,  ..., 0.8117, 0.8698, 0.