In [1]:
!pip install -q /kaggle/input/birdclef-extra/python_packages/lightning-2.2.0-py3-none-any.whl --no-deps

!pip install -q /kaggle/input/birdclef-extra/python_packages/colorednoise-2.2.0-py3-none-any.whl

In [2]:
import pandas as pd
import numpy as np
import glob
import os
import matplotlib.pyplot as plt
import random
from typing import List
from torchaudio.transforms import MelSpectrogram, AmplitudeToDB

import torchaudio

import torch

from torch.utils.data import DataLoader,TensorDataset

import lightning as L

import datasets

from torch.utils.data import Dataset, DataLoader,WeightedRandomSampler

from pathlib import Path
import multiprocessing
import colorednoise as cn
import torch.nn as nn
import librosa
from torch.distributions import Beta

import timm
from torchinfo import summary

import torch.nn.functional as F

from torch.optim.lr_scheduler import (
    CosineAnnealingLR,
    CosineAnnealingWarmRestarts,
    ReduceLROnPlateau,
    OneCycleLR,
)
from lightning.pytorch.callbacks  import ModelCheckpoint, EarlyStopping

In [3]:

class AttBlockV2(nn.Module):
    def __init__(self, in_features: int, out_features: int, activation="linear"):
        super().__init__()

        self.activation = activation
        self.att = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True,
        )
        self.cla = nn.Conv1d(
            in_channels=in_features,
            out_channels=out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True,
        )

        self.init_weights()

    def init_weights(self):
        init_layer(self.att)
        init_layer(self.cla)

    def forward(self, x):
        # x: (n_samples, n_in, n_time)
        norm_att = torch.softmax(torch.tanh(self.att(x)), dim=-1)
        cla = self.nonlinear_transform(self.cla(x))
        x = torch.sum(norm_att * cla, dim=2)
        return x, norm_att, cla
    
    def nonlinear_transform(self, x):
        if self.activation == "linear":
            return x
        elif self.activation == "sigmoid":
            return torch.sigmoid(x)

In [4]:
def init_layer(layer):
    '''
    Initialize the parameters of the fully connected layer
    '''
    nn.init.xavier_uniform_(layer.weight) # Initialize the weights and biases of the network layer

    if hasattr(layer, "bias"): # Check if the layer has a bias attribute
        if layer.bias is not None: # and bis is not none
            layer.bias.data.fill_(0.0) # If a bias exists, initialize it to 0

In [5]:
class BirdModelModule(L.LightningModule):

    def __init__(self,sample_rate:int=32000,pretrained_model_name:str='tf_efficientnetv2_s_in21k',class_num:int=182):
        super().__init__()
        self.sample_rate=sample_rate
        self.class_num=class_num


        # load  pretrained model
        pretrained_model = timm.create_model(pretrained_model_name, pretrained=False,in_chans=3)
        pretrained_model.load_state_dict(torch.load('/kaggle/input/birdclef-extra/backbones/tf-efficientnetv2_s_in21k/tf_efficientnetv2_s_in21k_weights.pth'))

        # The last two layers are an adaptive pooling layer and a fully connected layer.
        # Here I choose to replace these two layers. First remove these two layers
        layers = list(pretrained_model.children())[:-2]

        self.encoder = nn.Sequential(*layers) 

        self.in_features=pretrained_model.classifier.in_features 

        # dense layer
        self.fc1 = nn.Linear(in_features=self.in_features, out_features=self.in_features, bias=True)

        self.att_block=AttBlockV2(in_features=self.in_features, out_features=self.class_num, activation="sigmoid")

        # Initialize the weights and biases of the fully connected layer
        init_layer(self.fc1)

        # loss function
        self.loss_function = nn.BCEWithLogitsLoss(reduction="none")


        # freeze
        self.freeze()



    def freeze(self):
        self.encoder.eval()
        # self.fc1.eval()
        for param in self.encoder.parameters():
            param.requires_grad = False
        # for param in self.fc1.parameters():
        #     param.requires_grad = False
        return



    def forward(self,clip):

        # Use the pre-trained model (excluding the last two layers) for calculation
        clip=self.encoder(clip) # feature extractor

        # Calculate the mean of each frequency band and merge them to compress the dimension
        clip = torch.mean(clip, dim=2)

        # channel smoothing
        x1 = F.max_pool1d(clip, kernel_size=3, stride=1, padding=1)
        x2 = F.avg_pool1d(clip, kernel_size=3, stride=1, padding=1)
        x = x1 + x2

        x = F.dropout(x, p=0.3, training=True)

        x = x.transpose(1, 2)

        x = F.relu_(self.fc1(x))

        x = x.transpose(1, 2)

        x = F.dropout(x, p=0.3, training=True)

        target_pred, norm_att, segmentwise_output = self.att_block(x)

        
        return target_pred




    def training_step(self,batch,batch_idx):


        audio_label=batch[0]
        clip=batch[1]
        audio_weights=batch[2]
    

        # mix audio up
        mixup = Mixup(mix_beta=5,mixup_prob=0.7,mixup_double=0.5)

        clip, audio_label,audio_weights=mixup(X=clip,Y=audio_label,weight=audio_weights)

        # Use Compose to combine multiple audio transformation operations. 
        # These operations are applied to the input audio data to enhance the generalization and robustness of the model.
        # clip=self.audio_transforms(clip,sample_rate=self.sample_rate)

        # Convert audio data into mel spectrogram
        clip=mel_transform(sample_rate=self.sample_rate,audio=clip)

        db_transform = torchaudio.transforms.AmplitudeToDB(stype="power", top_db=80)

        clip=db_transform(clip)

        #generalization
        clip=(clip+80)/80

        # Random mask part of the Spectrogram, which helps the model learn to be robust when information is missing in certain time periods.

        time_mask_transform = torchaudio.transforms.TimeMasking(time_mask_param=20, iid_masks=True, p=0.3)

        clip = time_mask_transform(clip)

        # Calculate the first and second order differences of audio or other time series data, usually called delta and delta-delta (also called acceleration) features.
        clip= image_delta(clip)

        # mix audio up
        mixup2 = Mixup2(mix_beta=2, mixup2_prob=0.15)

        clip, audio_label,audio_weights = mixup2(clip, audio_label, audio_weights)

        # predictions
        target_pred=self(clip)

        loss = self.loss_function(torch.logit(target_pred), audio_label)

        loss = loss.sum(dim=1) * audio_weights

        loss = loss.sum()

        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        return loss


    def validation_step(self,batch,batch_idx):
        audio_label=batch[0]
        clip=batch[1]
        audio_weights=batch[2]

        audio_label=audio_label
        clip=clip
        audio_weights=audio_weights

        # convert audio to mel spectrogram
        clip=mel_transform(sample_rate=self.sample_rate,audio=clip)

        db_transform = torchaudio.transforms.AmplitudeToDB(stype="power", top_db=80)

        clip=db_transform(clip)

        #generalization
        clip=(clip+80)/80

        # Calculate the first and second order differences of audio or other time series data, usually called delta and delta-delta (also called acceleration) features.
        clip= image_delta(clip)

        # predictions
        target_pred=self(clip)

        loss = self.loss_function(torch.logit(target_pred), audio_label)

        loss = loss.sum(dim=1) * audio_weights

        loss = loss.sum()

        self.log("val_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        return loss

        

    def configure_optimizers(self):
        model_optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, self.parameters()),
            lr=0.001,
            weight_decay=0.001,
        )
        interval = "epoch"

        lr_scheduler = CosineAnnealingWarmRestarts(
            model_optimizer, T_0=10, T_mult=1, eta_min=1e-6, last_epoch=-1
        )

        return {
            "optimizer": model_optimizer,
            "lr_scheduler": {
                "scheduler": lr_scheduler,
                "interval": interval,
                "monitor": "val_loss",
                "frequency": 1,
            },
        }

    def on_train_epoch_end(self):
        pass


    def on_validation_epoch_end(self):
        pass

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        # If you have only one tensor (feature) in your TensorDataset, batch will be a tuple containing a tensor and an empty tuple (since there are no labels)
        features= batch
        features=features
        predictions = self(features)
        # Because what our model ultimately wants is the probability of an object corresponding to all categories, 
        # the sigmoid function is used here because we want to treat each class as a separate probability, so softmax is not needed
        probabilities = predictions.sigmoid().detach()

        return probabilities

In [6]:
# 1. load checkpoint
class_num=len(np.load('/kaggle/input/birdclef-extra/external_files/3-bird-cates.npy',allow_pickle=True))

model = BirdModelModule.load_from_checkpoint(
    checkpoint_path="/kaggle/input/birdclef-extra/attention-05-19/sed_s21k_43-44-450.98.ckpt",
    class_num=class_num
)

/opt/conda/lib/python3.10/site-packages/lightning/pytorch/utilities/migration/utils.py:56: The loaded checkpoint was produced with Lightning v2.2.2, which is newer than your current Lightning version: v2.2.0
  model = create_fn(


In [7]:
test_audio_dir = '/kaggle/input/birdclef-2024/test_soundscapes/'
pred_files = [test_audio_dir+f for f in sorted(os.listdir(test_audio_dir))]
if len(pred_files) == 1:
    test_audio_dir = '/kaggle/input/birdclef-2024/unlabeled_soundscapes/'
    pred_files = [test_audio_dir+f for f in sorted(os.listdir(test_audio_dir))][:2]

In [8]:
def split_audio(audio: torch.Tensor, segment_length:int):

    '''
    split raw audio tensor into multiple clips with 5 seconds long.

    Parameters:
        audio: the raw audio tensor
        segment_length: the audio length of each 5 seconds

    return:
        parts: list includes all clips
        end_time_list: the list of all clips' end time in seconds
    '''

    length_audio = audio.shape[1]
    parts = []
    end_time_list=[]
    end_time=5
    for i in range(0, length_audio, segment_length):
        part = audio[0][i:i + segment_length]
        if len(part) == segment_length:  # Ensure the fragment lengths are consistent
            parts.append(part)  #Store the raw bytes of audio data
            end_time_list.append(end_time)
            end_time+=5

        

    return parts,end_time_list

In [9]:
# Regarding the data of a single audio, some audio information needs to be paid attention to, such as audio duration and number of channels.


def audio_info(audio: torch.Tensor, sample_rate: int):
    """
    Grab all information of the input audio loaded by torchaudio.

    Parameters:
        audio: Tensor representing the waveform
        sample_rate: Sample rate of the audio file

    Return:
        duration_seconds: Duration of the audio in seconds
        num_channels: Number of audio channels
    """
    # The audio duration time (seconds)
    duration_seconds = audio.shape[1] / sample_rate

    # The number of channels
    num_channels = audio.shape[0]


    return duration_seconds, num_channels

In [10]:
def read_audio(path: str):
    """
    Read an OGG file using torchaudio and return the waveform tensor and sample rate.

    Parameters:
        path: Path to the .ogg file

    Returns:
        waveform: Tensor representing the waveform
        sample_rate: Sample rate of the audio file
    """
    audio, sample_rate = torchaudio.load(path)
    return audio, sample_rate

In [11]:
def pred_transform(batch):
    """
    Transform audio data into normalized mel spectrogram in decibel scale.
    """
#     print('1111')
    n_fft = int(0.04 * 32000)  # Convert window size to sample points
    hop_length = int(0.02 * 32000)  # Convert hop size to sample points
    n_mels = 40  # Number of Mel filters

    # Create Mel Spectrogram transformer
    mel_transformer = MelSpectrogram(
        sample_rate=32000,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        f_min=0,
        f_max=16000
    )
    
    # Create dB transformer
    db_transform = AmplitudeToDB(stype="power", top_db=80)

    melspec_list = []

    for audio_clip in batch:
        # Convert audio clip to tensor and add a new dimension
        audio_clip = audio_clip.unsqueeze(0)

        # Generate Mel Spectrogram
        melspec = mel_transformer(audio_clip)
        
        # Convert Mel Spectrogram to dB
        db_melspec = db_transform(melspec)
        
        # Normalize the spectrogram
        normalized_melspec = (db_melspec + 80) / 80
#         print(normalized_melspec.shape)

        melspec_list.append(normalized_melspec)

    # Stack the list of tensors into a single tensor
    stacked_melspecs = torch.stack(melspec_list)
    
    return stacked_melspecs

In [12]:
def compute_deltas(specgram: torch.Tensor, win_length: int = 5, mode: str = "replicate") -> torch.Tensor:
    """Compute delta coefficients of a tensor, usually a spectrogram.

    Args:
        specgram (Tensor): Tensor of audio of dimension (..., freq, time)
        win_length (int, optional): The window length used for computing delta (Default: 5)
        mode (str, optional): Mode parameter passed to padding (Default: "replicate")

    Returns:
        Tensor: Tensor of deltas of dimension (..., freq, time)
    """
    device = specgram.device  # Get the device of the input tensor
    dtype = specgram.dtype

    # pack batch
    shape = specgram.size()
    specgram = specgram.reshape(1, -1, shape[-1])

    assert win_length >= 3
    n = (win_length - 1) // 2
    denom = n * (n + 1) * (2 * n + 1) / 3

    specgram = torch.nn.functional.pad(specgram, (n, n), mode=mode)

    # Create the kernel tensor, making sure it is on the same device as the input tensor
    kernel = torch.arange(-n, n + 1, 1, dtype=dtype,device=device).repeat(specgram.shape[1], 1, 1)

    output = (
        torch.nn.functional.conv1d(specgram, kernel, groups=specgram.shape[1]) / denom
    )

    # unpack batch
    output = output.reshape(shape)

    return output



def make_delta(input_tensor: torch.Tensor):
    input_tensor = input_tensor.transpose(3, 2)
    input_tensor = compute_deltas(input_tensor)
    input_tensor = input_tensor.transpose(3, 2)
    return input_tensor

In [13]:
def image_delta(batch):
#     batch=torch.stack(batch)
    delta_1 = make_delta(batch)
    delta_2 = make_delta(delta_1)
    x = torch.cat([batch, delta_1, delta_2], dim=1)
    
    return x

In [14]:
from torch.utils.data import TensorDataset, DataLoader

In [15]:
submission=pd.DataFrame(columns=['row_id']+np.load('/kaggle/input/birdclef-extra/ChroNet-05-17/13-2-bird-cates-preprepared.npy',allow_pickle=True).tolist())

In [16]:
model.eval()

with torch.no_grad():
    # 3. Using the model to make predictions
    trainer = L.Trainer(
        accelerator="auto", 
        devices='auto'
    )

INFO: GPU available: False, used: False
INFO: TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs


In [17]:
# audio_clips_list=[]
# clip_names_list=[]

for path in pred_files:
    # read audio as tensor
    audio,sr=read_audio(path=path)

    # get audio corresponding informatino
    duration_seconds,num_channels=audio_info(audio=audio,sample_rate=sr)

    # split audio into multi clips with 5 seconds
    audio_clips,end_time_list=split_audio(audio=audio,segment_length=5*sr)

    # generate each label name for each clip
    soundscape_id=path.split('/')[-1].split('.')[0]
    clip_name=[f'{soundscape_id}_{end_time}' for end_time in end_time_list]

#     audio_clips_list.extend(audio_clips)
    
#     clip_names_list.extend(clip_name)
    
#     print(audio_clips)
#     print(type(audio_clips))
    
#     audio_clips = torch.stack(audio_clips)
#     print(type(audio_clips))

    
    melspec_list=pred_transform(audio_clips)
    
#     print(melspec_list.shape)
#     print(len(melspec_list))
#     print(type(melspec_list))
#     print(type(melspec_list[0][0]))
    
    x=image_delta(melspec_list)
    
#     print(x.shape)
#     print(type(x))
    
#     ds=TensorDataset(x)
    
    dataloader = DataLoader(dataset=x, batch_size=8, shuffle=False, num_workers=3)
    
#     dataset = TensorDataset(x)  # Assuming 'x' is a tensor
#     for i in range(min(len(dataset), 3)):  # Check the first few elements
#         sample = dataset[i]
#         print(type(sample), [s.shape for s in sample])

#     for batch in dataloader:
#         if isinstance(batch, tuple):
#             print([b.shape for b in batch])  # If it's a tuple of tensors
#         else:
#             print(type(batch), batch.shape)
#         break
    
    # prediction
    predictions = trainer.predict(model, dataloaders=dataloader)
    
    # Convert each tensor to a NumPy array and use them as rows of the DataFrame
    data_frames = [pd.DataFrame(tensor.numpy()) for tensor in predictions]
    
    # Merge all DataFrames into one big DataFrame
    # Each tensor forms a block of the DataFrame
    df = pd.concat(data_frames, ignore_index=True)
    
    df.columns=np.load('/kaggle/input/birdclef-extra/ChroNet-05-17/13-2-bird-cates-preprepared.npy',allow_pickle=True).tolist()
    
    # create a new Series
    new_column = pd.Series(clip_name, name='row_id')
    
    df.insert(0,'row_id',new_column)
    
    submission=pd.concat([submission,df],ignore_index=True)
    
#     display(submission)
    
    del df, new_column, data_frames, predictions, dataloader, x, melspec_list,clip_name, audio_clips, audio

2024-05-19 07:06:19.058687: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-19 07:06:19.058818: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-19 07:06:19.237283: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
  self.pid = os.fork()


Predicting: |          | 0/? [00:00<?, ?it/s]

  self.pid = os.fork()
  submission=pd.concat([submission,df],ignore_index=True)
  self.pid = os.fork()


Predicting: |          | 0/? [00:00<?, ?it/s]

  self.pid = os.fork()


In [18]:
submission.to_csv('submission.csv',index=False)

In [19]:
# submission