# Birdsong Pytorch Baseline: Inference - 2022

## About

Resnet-50 as baseline

This is a notebook for **_inference & submission_**. 

Most of this notebook is based on the [great baseline](https://www.kaggle.com/hidehisaarai1213/inference-pytorch-birdcall-resnet-baseline) shared by @hidehisaarai1213.  
Thank you for sharing!

## Prepare

### import libraries

In [1]:
!pip install fastprogress

import os
import gc
import time
import math
import shutil
import random
import warnings
import typing as tp
from pathlib import Path
from contextlib import contextmanager
import torchvision.models as models
import yaml
from joblib import delayed, Parallel

import cv2
import librosa
import audioread
import soundfile as sf

import numpy as np
import pandas as pd

from fastprogress import progress_bar
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Conv2d, Module, Linear, BatchNorm2d, ReLU
from torch.nn.modules.utils import _pair
import torch.utils.data as data

pd.options.display.max_rows = 500
pd.options.display.max_columns = 500



### define utilities

In [2]:
def set_seed(seed: int = 42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's global RNG and PyTorch (CPU plus the
    current CUDA device), and exports ``PYTHONHASHSEED`` with the same value.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,  # type: ignore
    )
    for apply_seed in seeders:
        apply_seed(seed)
    # The cuDNN flags are intentionally left untouched:
#     torch.backends.cudnn.deterministic = True  # type: ignore
#     torch.backends.cudnn.benchmark = True  # type: ignore
    

@contextmanager
def timer(name: str, verbose: bool = False) -> tp.Iterator[None]:
    """Context manager that measures the wall-clock time of the wrapped block.

    Args:
        name: Label used in the progress messages.
        verbose: When True, print a start message and the elapsed time on
            exit. Defaults to False, which keeps the block silent (the
            messages were disabled in the original implementation).

    Yields:
        None. The body of the ``with`` statement runs at the yield point.
    """
    t0 = time.time()
    if verbose:
        print("[{}] start".format(name))
    try:
        yield
    finally:
        # Report even when the wrapped block raises, so slow failures are visible.
        if verbose:
            print("[{}] done in {:.0f} s".format(name, time.time() - t0))

In [3]:
# logger = get_logger("main.log")
# Seed Python, NumPy and PyTorch RNGs once, before any data or model code runs.
set_seed(1213)

### read data

In [4]:
# Directory layout: every path is resolved relative to the current working directory.
ROOT = Path.cwd()
INPUT_ROOT = ROOT.joinpath("input")
RAW_DATA = INPUT_ROOT.joinpath("birdclef-2022")
TRAIN_AUDIO_DIR = RAW_DATA.joinpath("train_audio")
# TRAIN_RESAMPLED_AUDIO_DIRS = [
#   INPUT_ROOT / "birdsong-resampled-train-audio-{:0>2}".format(i)  for i in range(5)
# ]
TEST_AUDIO_DIR = RAW_DATA.joinpath("test_soundscapes")

In [5]:
# Load the test metadata table (test.csv) from the raw competition data directory.
test = pd.read_csv(RAW_DATA / "test.csv")

In [6]:
test.head()

Unnamed: 0,row_id,file_id,bird,end_time
0,soundscape_453028782_akiapo_5,soundscape_453028782,akiapo,5
1,soundscape_453028782_akiapo_10,soundscape_453028782,akiapo,10
2,soundscape_453028782_akiapo_15,soundscape_453028782,akiapo,15
3,soundscape_453028782_aniani_5,soundscape_453028782,aniani,5
4,soundscape_453028782_aniani_10,soundscape_453028782,aniani,10


In [7]:
# Keep one row per (file_id, end_time) pair: several rows of `test` (one per bird)
# refer to the same audio window, and each window only needs to be scored once.
test_unique = test.loc[:, ['file_id', 'end_time']].drop_duplicates()
test_unique

Unnamed: 0,file_id,end_time
0,soundscape_453028782,5
1,soundscape_453028782,10
2,soundscape_453028782,15


In [8]:
# Write the sample submission first as a fallback, so that a valid
# submission.csv exists even if a later cell fails.
# The path is built from RAW_DATA to stay consistent with the other inputs.
sub = pd.read_csv(RAW_DATA / "sample_submission.csv")
sub.to_csv("submission.csv", index=False)  # this will be overwritten if everything goes well

### set parameters

In [9]:
# Sampling rate (Hz) passed to the audio loader as the resampling target.
TARGET_SR = 32000

# Model selection and constructor parameters consumed by get_model().
model_config = {
    "name": "resnet50",
    "params": {
        "pretrained": True,
        "n_classes": 152,  # BIRD_CODE defines 152 class indices (0..151)
    },
}

# Keyword arguments forwarded to the mel-spectrogram computation.
melspectrogram_parameters = dict(
    n_mels=128,
    fmin=20,
    fmax=16000,
    n_fft=1024,
    hop_length=256,
)



## Definition

### Dataset

In [10]:
# Maps each bird code (the label strings found in the `bird` column of the test
# table) to an integer class index in the range 0..151.
BIRD_CODE = {'afrsil1': 0, 'akekee': 1, 'akepa1': 2, 'akiapo': 3, 'akikik': 4, 
    'amewig': 5, 'aniani': 6, 'apapan': 7, 'arcter': 8, 'barpet': 9, 
    'bcnher': 10, 'belkin1': 11, 'bkbplo': 12, 'bknsti': 13, 'bkwpet': 14, 
    'blkfra': 15, 'blknod': 16, 'bongul': 17, 'brant': 18, 'brnboo': 19, 
    'brnnod': 20, 'brnowl': 21, 'brtcur': 22, 'bubsan': 23, 'buffle': 24, 
    'bulpet': 25, 'burpar': 26, 'buwtea': 27, 'cacgoo1': 28, 'calqua': 29, 
    'cangoo': 30, 'canvas': 31, 'caster1': 32, 'categr': 33, 'chbsan': 34, 
    'chemun': 35, 'chukar': 36, 'cintea': 37, 'comgal1': 38, 'commyn': 39, 
    'compea': 40, 'comsan': 41, 'comwax': 42, 'coopet': 43, 'crehon': 44, 
    'dunlin': 45, 'elepai': 46, 'ercfra': 47, 'eurwig': 48, 'fragul': 49, 
    'gadwal': 50, 'gamqua': 51, 'glwgul': 52, 'gnwtea': 53, 'golphe': 54, 
    'grbher3': 55, 'grefri': 56, 'gresca': 57, 'gryfra': 58, 'gwfgoo': 59, 
    'hawama': 60, 'hawcoo': 61, 'hawcre': 62, 'hawgoo': 63, 'hawhaw': 64, 
    'hawpet1': 65, 'hoomer': 66, 'houfin': 67, 'houspa': 68, 'hudgod': 69, 
    'iiwi': 70, 'incter1': 71, 'jabwar': 72, 'japqua': 73, 'kalphe': 74, 
    'kauama': 75, 'laugul': 76, 'layalb': 77, 'lcspet': 78, 'leasan': 79, 
    'leater1': 80, 'lessca': 81, 'lesyel': 82, 'lobdow': 83, 'lotjae': 84, 
    'madpet': 85, 'magpet1': 86, 'mallar3': 87, 'masboo': 88, 'mauala': 89, 
    'maupar': 90, 'merlin': 91, 'mitpar': 92, 'moudov': 93, 'norcar': 94, 
    'norhar2': 95, 'normoc': 96, 'norpin': 97, 'norsho': 98, 'nutman': 99, 
    'oahama': 100, 'omao': 101, 'osprey': 102, 'pagplo': 103, 'palila': 104, 
    'parjae': 105, 'pecsan': 106, 'peflov': 107, 'perfal': 108, 'pibgre': 109, 
    'pomjae': 110, 'puaioh': 111, 'reccar': 112, 'redava': 113, 'redjun': 114, 
    'redpha1': 115, 'refboo': 116, 'rempar': 117, 'rettro': 118, 'ribgul': 119, 
    'rinduc': 120, 'rinphe': 121, 'rocpig': 122, 'rorpar': 123, 'rudtur': 124, 
    'ruff': 125, 'saffin': 126, 'sander': 127, 'semplo': 128, 'sheowl': 129, 
    'shtsan': 130, 'skylar': 131, 'snogoo': 132, 'sooshe': 133, 'sooter1': 134, 
    'sopsku1': 135, 'sora': 136, 'spodov': 137, 'sposan': 138, 'towsol': 139, 
    'wantat1': 140, 'warwhe1': 141, 'wesmea': 142, 'wessan': 143, 'wetshe': 144, 
    'whfibi': 145, 'whiter': 146, 'whttro': 147, 'wiltur': 148, 'yebcar': 149, 
    'yefcan': 150, 'zebdov': 151}

# Reverse lookup (class index -> bird code), used to turn predicted indices
# back into label strings.
INV_BIRD_CODE = {v: k for k, v in BIRD_CODE.items()}


In [11]:
def mono_to_color(X: np.ndarray,
                  mean=None,
                  std=None,
                  norm_max=None,
                  norm_min=None,
                  eps=1e-6):
    """Convert a single-channel array into a 3-channel uint8 image.

    Code from https://www.kaggle.com/daisukelab/creating-fat2019-preprocessed-data

    The input is replicated on a new last axis, standardized, clipped to
    ``[norm_min, norm_max]`` and rescaled to ``[0, 255]``.

    Args:
        X: Input array; the output has one extra trailing axis of size 3.
        mean: Value subtracted before scaling. Defaults to the mean of ``X``.
        std: Value the centered data is divided by (plus ``eps``). Defaults
            to the standard deviation of the centered data.
        norm_max: Upper clipping bound in standardized units. Defaults to the
            maximum of the standardized data.
        norm_min: Lower clipping bound in standardized units. Defaults to the
            minimum of the standardized data.
        eps: Small constant that guards the division and decides when the
            input is considered constant.

    Returns:
        ``np.uint8`` array of shape ``X.shape + (3,)``. All zeros when the
        standardized data has (almost) no dynamic range.
    """
    # Stack X as [X,X,X]
    X = np.stack([X, X, X], axis=-1)

    # Standardize. Explicit ``is None`` checks are required here: the previous
    # ``value or default`` form silently discarded a caller-supplied 0 / 0.0.
    if mean is None:
        mean = X.mean()
    X = X - mean
    if std is None:
        std = X.std()
    Xstd = X / (std + eps)
    _min, _max = Xstd.min(), Xstd.max()
    if norm_max is None:
        norm_max = _max
    if norm_min is None:
        norm_min = _min
    if (_max - _min) > eps:
        # Normalize to [0, 255]
        V = np.clip(Xstd, norm_min, norm_max)
        V = 255 * (V - norm_min) / (norm_max - norm_min)
        V = V.astype(np.uint8)
    else:
        # Just zero
        V = np.zeros_like(Xstd, dtype=np.uint8)
    return V


In [12]:
class TestDatasetNew(data.Dataset):
    """Dataset that turns 5-second windows of one audio clip into model inputs.

    Each row of ``df`` must provide ``file_id`` and ``end_time`` (seconds).
    Item ``idx`` is the window ``[end_time - 5, end_time)`` of ``clip``,
    converted to a mel spectrogram and rendered as a 3-channel float image.

    Args:
        df: Table of windows to score; rows are accessed by position.
        clip: 1-D waveform, indexed assuming a 32 kHz sampling rate.
        img_size: Height (pixels) the spectrogram image is resized to; the
            width is scaled to keep the aspect ratio.
        melspectrogram_parameters: Extra keyword arguments for
            ``librosa.feature.melspectrogram``. Defaults to none.
    """

    def __init__(self, df: pd.DataFrame, clip: np.ndarray,
                 img_size=224, melspectrogram_parameters=None):
        self.df = df
        self.clip = clip
        self.img_size = img_size
        # Copy into a fresh dict: a mutable ``{}`` default would be shared by
        # every instance created without explicit parameters.
        self.melspectrogram_parameters = dict(melspectrogram_parameters or {})

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx: int):
        """Return ``(image, file_id, end_time)`` for the window at position ``idx``."""
        SR = 32000
        # Positional access: the sampler hands out positions 0..len-1, which
        # only coincide with index labels when the frame index has been reset.
        sample = self.df.iloc[idx]
        # site = sample.site
        # row_id = sample.row_id
        file_id = sample.file_id
        end_time = sample.end_time

        end_seconds = int(end_time)
        start_seconds = int(end_seconds - 5)

        start_index = SR * start_seconds
        end_index = SR * end_seconds

        y = self.clip[start_index:end_index].astype(np.float32)

        # ``y`` is passed by keyword: newer librosa releases no longer accept
        # the audio as a positional argument.
        melspec = librosa.feature.melspectrogram(y=y, sr=SR, **self.melspectrogram_parameters)
        # NOTE(review): the original code also computed
        # ``librosa.power_to_db(melspec)`` into an unused variable and fed the
        # raw power spectrogram to mono_to_color. That behavior is preserved
        # here; confirm which representation the checkpoint was trained on.
        image = mono_to_color(melspec)
        height, width, _ = image.shape
        image = cv2.resize(image, (int(width * self.img_size / height), self.img_size))
        image = np.moveaxis(image, 2, 0)  # HWC -> CHW
        image = (image / 255.0).astype(np.float32)

        return np.array(image), file_id, end_time 

### model


In [13]:
from torchvision import models

import torch as t
import torch.nn as nn
from torch.nn import functional as F


class ResidualBlock(nn.Module):
    """Residual block with two 3x3 convolutions and an optional shortcut module.

    ``left`` is the main branch (conv -> BN -> ReLU -> conv -> BN). ``right``
    is the skip connection: the identity when ``shortcut`` is None, otherwise
    the module passed in. The two branches are summed and passed through ReLU.

    Args:
        inchannel: Number of input channels.
        outchannel: Number of output channels of both convolutions.
        stride: Stride of the first convolution.
        shortcut: Optional module applied to the input on the skip path.
    """

    def __init__(self, inchannel, outchannel, stride=1, shortcut=None):
        # All layers are created here, in the constructor.
        super().__init__()
        main_branch = [
            nn.Conv2d(inchannel, outchannel, kernel_size=3, stride=stride,
                      padding=1, bias=False),
            nn.BatchNorm2d(outchannel),
            nn.ReLU(inplace=True),
            nn.Conv2d(outchannel, outchannel, kernel_size=3, stride=1,
                      padding=1, bias=False),
            nn.BatchNorm2d(outchannel),
        ]
        self.left = nn.Sequential(*main_branch)
        # The shortcut (skip connection) is the defining element of ResNet.
        self.right = shortcut

    def forward(self, x):
        out = self.left(x)
        if self.right is None:
            out += x
        else:
            out += self.right(x)
        return F.relu(out)
    

class ResNet(nn.Module):
    """ResNet-style classifier assembled from ``ResidualBlock`` stages.

    The network is a 7x7 stem followed by four stages of 3, 4, 6 and 3
    residual blocks, global average pooling and a linear classifier.

    NOTE(review): the original comment labeled this variant ResNet-34. The
    stage widths here are 128/256/512/512 - confirm this matches the
    architecture the checkpoint was trained with.

    Args:
        num_classes: Size of the classifier output.
    """

    def __init__(self, num_classes=152):
        super().__init__()
        stem = [
            nn.Conv2d(3, 64, 7, 2, 3, bias=False),
            nn.BatchNorm2d(64),  # 64 is the number of feature channels
            nn.ReLU(inplace=True),
            nn.MaxPool2d(3, 2, 1),
        ]
        self.pre = nn.Sequential(*stem)
        self.layer1 = self._make_layer(64, 128, 3)
        self.layer2 = self._make_layer(128, 256, 4, stride=2)
        self.layer3 = self._make_layer(256, 512, 6, stride=2)
        self.layer4 = self._make_layer(512, 512, 3, stride=2)
        self.adp = nn.AdaptiveAvgPool2d(output_size=1)
        self.classifier = nn.Linear(512, num_classes)

    def _make_layer(self, inchannel, outchannel, block_num, stride=1):
        """Build one stage: a projecting block followed by ``block_num - 1`` plain blocks."""
        projection = nn.Sequential(
            nn.Conv2d(inchannel, outchannel, 1, stride, bias=False),
            nn.BatchNorm2d(outchannel),
        )
        first_block = ResidualBlock(inchannel, outchannel, stride, projection)
        # The remaining blocks keep the channel count, so input and output match.
        remaining = [ResidualBlock(outchannel, outchannel)
                     for _ in range(block_num - 1)]
        return nn.Sequential(first_block, *remaining)

    def forward(self, x):
        """Return a dict with ``logits``, ``multiclass_proba`` and ``multilabel_proba``."""
        features = x
        stages = (self.pre, self.layer1, self.layer2, self.layer3,
                  self.layer4, self.adp)
        for stage in stages:
            features = stage(features)

        # Flatten to (batch, channels) before the linear classifier.
        logits = self.classifier(features.view(features.size(0), -1))
        return {
            "logits": logits,
            "multiclass_proba": F.softmax(logits, dim=1),
            "multilabel_proba": F.sigmoid(logits),
        }

In [14]:
device = torch.device("cpu")
# def get_model(args: tp.Dict): # original code for ensemble
#     # # get resnest50_fast_1s1x64d
    
#     model_1 = YuvNet(args)
#     state_dict = torch.load("../input/bird-fold1-150-v1/bird.pth")
#     model_1.load_state_dict(state_dict)
#     model_1.to(device)
#     model_1.eval()
    
#     model_2 = YuvNet(args)
#     state_dict = torch.load("../input/bird-label-v1/bird.pth")
#     model_2.load_state_dict(state_dict)
#     model_2.to(device)
#     model_2.eval()
    
#     return model_1, model_2 

def get_model(config: dict):
    """Build the classifier described by ``config`` and load its trained weights.

    Args:
        config: Mapping with a ``"name"`` entry (must contain ``"resnet"``)
            and a ``"params"`` entry providing ``"n_classes"``.

    Returns:
        The model in evaluation mode, with weights loaded onto the CPU.

    Raises:
        NotImplementedError: If ``config["name"]`` is not a ResNet variant.
    """
    model_name = config["name"]
    model_params = config["params"]

    if "resnet" not in model_name:
        raise NotImplementedError(f"unsupported model: {model_name!r}")

    # The ResNet class defined in this notebook only accepts ``num_classes``;
    # passing ``base_model_name``/``pretrained`` raised a TypeError.
    # NOTE(review): confirm the checkpoint below was trained with this exact
    # architecture, otherwise load_state_dict will report key mismatches.
    model = ResNet(num_classes=model_params["n_classes"])  # type: ignore
    # map_location keeps loading working on CPU-only machines even if the
    # checkpoint was saved from a GPU.
    state_dict = torch.load(
        "./output/000_ResNet50/fold0/checkpoints/model.last.pth",
        map_location=torch.device("cpu"))
    model.load_state_dict(state_dict)
    # model.to(device)
    model.eval()
    return model

In [15]:
# model_1, model_2 = get_model(model_config)
model_1 = get_model(model_config)

In [16]:
model_1

ResNet(
  (encoder): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): Sequential(
          (0): Conv2d(64, 2

In [17]:
model_2 = get_model(model_config) # for ensemble

## Prediction loop

In [18]:
def prediction_for_clip_new(test_df: pd.DataFrame, 
                        clip: np.ndarray, 
                        model: "tp.Union[ResNet, tp.Sequence[ResNet]]", 
                        mel_params: dict, 
                        threshold=0.5):
    """Predict bird labels for every window of one audio clip.

    Args:
        test_df: Rows (``file_id``, ``end_time``) describing the windows of
            ``clip`` to score, indexed 0..n-1.
        clip: Waveform of the audio file the rows refer to.
        model: A single model, or a list/tuple of models whose
            ``multiclass_proba`` outputs are averaged (ensemble).
        mel_params: Mel-spectrogram keyword arguments for the dataset.
        threshold: Minimum averaged probability for a class to be reported.

    Returns:
        DataFrame with columns ``file_id``, ``end_time`` and ``predictions``;
        ``predictions`` is a space-separated list of bird codes, or
        ``"nocall"`` when no class reaches the threshold.

    Raises:
        ValueError: If ``model`` is an empty sequence.
    """
    # Use the models handed in by the caller instead of the notebook-level
    # globals ``model_1``/``model_2`` the original silently depended on.
    ensemble = [model] if isinstance(model, nn.Module) else list(model)
    if not ensemble:
        raise ValueError("`model` must be a model or a non-empty sequence of models")

    # Feed each model on the device it already lives on. Moving the input to
    # CUDA while the weights stay on the CPU raises a device-mismatch error.
    devices = []
    for member in ensemble:
        member.eval()
        first_param = next(member.parameters(), None)
        devices.append(first_param.device if first_param is not None
                       else torch.device("cpu"))

    dataset = TestDatasetNew(df=test_df, 
                          clip=clip,
                          img_size=224,
                          melspectrogram_parameters=mel_params)
    loader = data.DataLoader(dataset, batch_size=1, shuffle=False)

    # Collect plain records and build the frame once at the end;
    # DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
    records = []
    for image, file_id, end_time in progress_bar(loader):
        with torch.no_grad():
            probas = [
                member(image.to(dev))["multiclass_proba"].cpu().numpy().reshape(-1)
                for member, dev in zip(ensemble, devices)
            ]
        proba = np.mean(probas, axis=0)
        labels = np.argwhere(proba >= threshold).reshape(-1).tolist()

        # batch_size is 1, so the collated batch holds exactly one entry.
        file_id_str = str(file_id[0])
        end_time_int = int(end_time[0])

        if len(labels) == 0:
            print('file_id',file_id_str)
            print('end_time', end_time_int)
            label_string = "nocall"
        else:
            label_string = " ".join(INV_BIRD_CODE[label] for label in labels)

        records.append({"file_id": file_id_str,
                        "end_time": end_time_int,
                        "predictions": label_string})

    return pd.DataFrame(records, columns=['file_id', 'end_time', 'predictions'])

In [19]:
test.head()

Unnamed: 0,row_id,file_id,bird,end_time
0,soundscape_453028782_akiapo_5,soundscape_453028782,akiapo,5
1,soundscape_453028782_akiapo_10,soundscape_453028782,akiapo,10
2,soundscape_453028782_akiapo_15,soundscape_453028782,akiapo,15
3,soundscape_453028782_aniani_5,soundscape_453028782,aniani,5
4,soundscape_453028782_aniani_10,soundscape_453028782,aniani,10


In [20]:
test.end_time.max()

15

In [21]:
def prediction(test_df: pd.DataFrame,
               test_audio: Path,
               model_config: dict,
               mel_params: dict,
               target_sr: int,
               threshold=0.5):
    """Run inference on every audio file referenced by ``test_df``.

    Args:
        test_df: Table with ``file_id`` and ``end_time`` columns.
        test_audio: Directory containing ``<file_id>.ogg`` files.
        model_config: Configuration passed to ``get_model``.
        mel_params: Mel-spectrogram keyword arguments.
        target_sr: Sampling rate the audio is resampled to on load.
        threshold: Probability threshold forwarded to the per-clip predictor.

    Returns:
        DataFrame with columns ``file_id``, ``end_time`` and ``predictions``
        covering all files; empty (with those columns) when ``test_df``
        references no files.
    """
    model = get_model(model_config)
    unique_file_id = test_df.file_id.unique()

    # Kept from the original: this silences warnings for the whole process,
    # not only for the duration of this call.
    warnings.filterwarnings("ignore")
    columns = ['file_id', 'end_time', 'predictions']
    per_file_frames = []
    for file_id in unique_file_id:
        with timer(f"Loading {file_id}"):
            clip, _ = librosa.load(test_audio / (file_id + ".ogg"),
                                   sr=target_sr,
                                   mono=True,
                                   res_type="kaiser_fast")

        # A boolean mask avoids building a query string, which breaks on
        # file ids that contain quotes.
        test_df_for_file_id = test_df[test_df.file_id == file_id].reset_index(drop=True)
        with timer(f"Prediction on {file_id}"):
            per_file_frames.append(
                prediction_for_clip_new(test_df_for_file_id,
                                        clip=clip,
                                        model=model,
                                        mel_params=mel_params,
                                        threshold=threshold))

    if not per_file_frames:
        # pd.concat raises on an empty list; return an empty, typed result.
        return pd.DataFrame(None, columns=columns)
    # One concat at the end replaces the removed DataFrame.append.
    return pd.concat(per_file_frames, ignore_index=True)

## Prediction

In [33]:
# Score every unique (file_id, end_time) window of the test set.
# NOTE(review): threshold=0.05 is applied to the averaged softmax
# ("multiclass_proba") output and is far below the function default of 0.5;
# confirm this value is intentional.
prediction_dfs = prediction(test_df=test_unique,
                        test_audio=TEST_AUDIO_DIR,
                        model_config=model_config,
                        mel_params=melspectrogram_parameters,
                        target_sr=TARGET_SR,
                        threshold=0.05)

prediction_dfs

Unnamed: 0,file_id,end_time,predictions
0,soundscape_453028782,5,arcter rudtur sander
1,soundscape_453028782,10,dunlin rudtur sander
2,soundscape_453028782,15,dunlin rudtur sander


In [45]:
def _build_submission(test_df: pd.DataFrame, prediction_df: pd.DataFrame) -> pd.DataFrame:
    """Turn per-window predictions into one True/False target per test row.

    Args:
        test_df: Table with ``row_id``, ``file_id``, ``end_time`` and ``bird``.
        prediction_df: Table with ``file_id``, ``end_time`` and a
            space-separated ``predictions`` string per window.

    Returns:
        DataFrame with columns ``row_id`` and ``target``; ``target`` is True
        when the row's bird is among the predictions for its window.
    """
    # Look up predicted birds by window. The previous ``bird in series`` test
    # checked the Series *index*, not its values, so it was always False.
    predicted_birds = {
        (file_id, int(end_time)): set(str(predictions).split(' '))
        for file_id, end_time, predictions in zip(prediction_df['file_id'],
                                                  prediction_df['end_time'],
                                                  prediction_df['predictions'])
    }
    targets = [
        bird in predicted_birds.get((file_id, int(end_time)), ())
        for file_id, end_time, bird in zip(test_df['file_id'],
                                           test_df['end_time'],
                                           test_df['bird'])
    ]
    return pd.DataFrame({"row_id": test_df['row_id'].tolist(), "target": targets},
                        columns=['row_id', 'target'])


submission = _build_submission(test, prediction_dfs)

print(submission)
submission.to_csv("submission.csv", index=False)

                           row_id target
0   soundscape_453028782_akiapo_5  False
1  soundscape_453028782_akiapo_10  False
2  soundscape_453028782_akiapo_15  False
3   soundscape_453028782_aniani_5  False
4  soundscape_453028782_aniani_10  False
5  soundscape_453028782_aniani_15  False
