In [1]:
# basic packages
import os
import json
import pandas as pd
import numpy as np

# Deep Learning framework
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Audio processing
import torchaudio
import torchaudio.transforms as T
import librosa

# Image processing
from PIL import Image
import torchvision.transforms as transforms

# Pre-trained image models
import timm

In [2]:
# Root directory under which every input dataset is found.
INPUT_PATH = '/kaggle/input'
# Competition data, addressed relative to the input root.
DATA_PATH = f'{INPUT_PATH}/birdclef-2023'
# Sub-folder of the competition data that is walked for recordings to segment.
TEST_PATH = os.path.join(DATA_PATH, 'test_soundscapes')

In [3]:
# Full path of every file found anywhere below TEST_PATH, in os.walk order.
to_split = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(TEST_PATH)
    for name in names
]

to_split

['/kaggle/input/birdclef-2023/test_soundscapes/soundscape_29201.ogg']

In [4]:
def split_audio(file_path, duration=5):
    """Map a recording to the sample ranges of its consecutive full-length segments.

    The file is decoded with librosa at its native sampling rate and cut into
    back-to-back windows of ``duration`` seconds. A trailing remainder shorter
    than one window is dropped. No audio is written anywhere; only the index
    ranges are returned.

    Args:
        file_path: Path of the audio file to segment.
        duration: Window length in seconds.

    Returns:
        dict mapping ``"<file name without extension>_<end second>"`` to a
        ``(start, end)`` tuple of sample indices (``end`` exclusive), in
        chronological order.

    Raises:
        ValueError: If ``duration`` does not cover at least one sample.
    """
    # sr=None keeps the native rate, so the indices refer to the file as stored.
    audio, sr = librosa.load(file_path, sr=None)

    duration_samples = int(duration * sr)
    if duration_samples <= 0:
        raise ValueError(
            f"duration must cover at least one sample, got {duration!r}"
        )

    # Drop the directory and whatever extension the file has. Replacing a
    # literal ".ogg" left every other format without a per-segment suffix, so
    # all of its segments were written to the same dictionary key.
    stem = os.path.splitext(os.path.basename(file_path))[0]

    timecodes = {}
    # Only windows that fit entirely inside the recording are kept.
    for index in range(len(audio) // duration_samples):
        start = index * duration_samples
        # The label is the window's end time in seconds, derived from
        # `duration` rather than a fixed step of 5.
        end_second = (index + 1) * duration
        timecodes[f"{stem}_{end_second}"] = (start, start + duration_samples)

    return timecodes

In [5]:
# Segment every recording: {file path: {row_id: (start_sample, end_sample)}}
data = {file: split_audio(file) for file in to_split}

# Flatten into one row per segment. The rows are collected first and the frame
# is built in a single call; enlarging a DataFrame through .loc copies it on
# every insert, which is quadratic in the number of segments.
records = [
    {'row_id': row_id, 'file': file, 'timecodes': timecode}
    for file, segments in data.items()
    for row_id, timecode in segments.items()
]
test_df = pd.DataFrame(records, columns=['row_id', 'file', 'timecodes'])

test_df



Unnamed: 0,row_id,file,timecodes
0,soundscape_29201_5,/kaggle/input/birdclef-2023/test_soundscapes/s...,"(0, 160000)"
1,soundscape_29201_10,/kaggle/input/birdclef-2023/test_soundscapes/s...,"(160000, 320000)"
2,soundscape_29201_15,/kaggle/input/birdclef-2023/test_soundscapes/s...,"(320000, 480000)"
3,soundscape_29201_20,/kaggle/input/birdclef-2023/test_soundscapes/s...,"(480000, 640000)"
4,soundscape_29201_25,/kaggle/input/birdclef-2023/test_soundscapes/s...,"(640000, 800000)"
...,...,...,...
115,soundscape_29201_580,/kaggle/input/birdclef-2023/test_soundscapes/s...,"(18400000, 18560000)"
116,soundscape_29201_585,/kaggle/input/birdclef-2023/test_soundscapes/s...,"(18560000, 18720000)"
117,soundscape_29201_590,/kaggle/input/birdclef-2023/test_soundscapes/s...,"(18720000, 18880000)"
118,soundscape_29201_595,/kaggle/input/birdclef-2023/test_soundscapes/s...,"(18880000, 19040000)"


In [6]:
class BirdDataset(Dataset):
    """Dataset that turns audio segments into normalised mel-spectrogram images.

    Every row of ``df`` names an audio file (``file``), an identifier
    (``row_id``) and a ``(start, end)`` sample range (``timecodes``) expressed
    at the file's own sampling rate.

    Args:
        df: DataFrame with the columns ``file``, ``row_id`` and ``timecodes``.
        sample_rate: Rate the audio is resampled to before the mel transform.
        augment: When True, Gaussian noise (std 0.005) is added to a random
            30% of the clips. It is off by default so that running inference
            twice on the same segment produces the same image.
    """

    def __init__(self, df, sample_rate=32000, augment=False):
        self.file = df['file'].values
        self.row_ids = df['row_id'].values
        self.timecodes = df['timecodes'].values
        self.sample_rate = sample_rate
        self.augment = augment
        self.melspectrogram = T.MelSpectrogram(sample_rate=self.sample_rate, n_mels=128, n_fft=2048, hop_length=512, f_min=500, f_max=15000)
        # Resize/crop to a 224x224 input and normalise with the channel
        # statistics commonly used for ImageNet-pretrained backbones.
        self.preprocess = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    def __len__(self):
        """Return the number of segments."""
        return len(self.row_ids)

    def __getitem__(self, idx):
        """Return ``{"image": tensor, "row_id": str}`` for segment ``idx``."""
        # get start and end timecodes
        start, end = self.timecodes[idx]

        # Decode only the requested window instead of loading the whole
        # recording and slicing it afterwards.
        audio, sr = torchaudio.load(
            self.file[idx],
            frame_offset=int(start),
            num_frames=int(end - start),
        )

        # collapse the channel dimension to mono by averaging the channels
        audio = audio.mean(dim=0)

        if sr != self.sample_rate:
            resampler = T.Resample(sr, self.sample_rate)
            audio = resampler(audio)

        # optional noise augmentation (disabled unless requested)
        if self.augment and np.random.rand() < 0.3:
            audio = audio + torch.randn_like(audio) * 0.005

        # Peak-normalise. A silent segment has a peak of 0; dividing by it
        # would fill the spectrogram with NaNs, so it is left as zeros.
        peak = audio.abs().max()
        if peak > 0:
            audio = audio / peak

        # convert audio to melspectrogram
        melspec = self.melspectrogram(audio)

        # convert melspec to image
        image = Image.fromarray(melspec.numpy()).convert("RGB")
        image = self.preprocess(image)

        return {"image": image, "row_id": self.row_ids[idx]}

In [7]:
# Wrap the segment table in the dataset and serve it 32 segments per batch,
# prepared by 4 worker processes.
test_dataset = BirdDataset(df=test_df)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, num_workers=4)

In [8]:
class BirdModel(nn.Module):
    """timm backbone whose classifier is replaced by a ``num_classes``-way linear head.

    Args:
        model_name: Architecture name passed to ``timm.create_model``.
        num_classes: Number of outputs of the new classification head.
        checkpoint_path: Backbone weights file forwarded to
            ``timm.create_model``. The default is the EfficientNet-B3
            checkpoint; supply a matching file when ``model_name`` is changed.
    """

    def __init__(
        self,
        model_name='tf_efficientnet_b3',
        num_classes=264,
        checkpoint_path='/kaggle/input/tf-efficientnet/pytorch/tf-efficientnet-b3/1/tf_efficientnet_b3_aa-84b4657e.pth',
    ):
        super().__init__()

        self.model = timm.create_model(model_name, checkpoint_path=checkpoint_path)
        self.in_features = self.model.classifier.in_features
        # The head stays wrapped in nn.Sequential so its parameters keep the
        # names "model.classifier.0.*" that previously saved state dicts use.
        self.model.classifier = nn.Sequential(
            nn.Linear(self.in_features, num_classes)
        )

    def forward(self, img):
        """Run the backbone and head on ``img`` and return the raw class scores."""
        return self.model(img)

In [9]:
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cpu')

In [10]:
model = BirdModel()
# Load the saved weights straight onto the device used for inference.
state_dict = torch.load(os.path.join(INPUT_PATH, 'models', 'birdclef.pth'), map_location=device)
model.load_state_dict(state_dict)
# Batches are moved to `device` before the forward pass, so the network has to
# live there as well; otherwise a CUDA run fails with a device mismatch.
model.to(device)
model.eval()

BirdModel(
  (model): EfficientNet(
    (conv_stem): Conv2dSame(3, 40, kernel_size=(3, 3), stride=(2, 2), bias=False)
    (bn1): BatchNormAct2d(
      40, eps=0.001, momentum=0.1, affine=True, track_running_stats=True
      (drop): Identity()
      (act): SiLU(inplace=True)
    )
    (blocks): Sequential(
      (0): Sequential(
        (0): DepthwiseSeparableConv(
          (conv_dw): Conv2d(40, 40, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=40, bias=False)
          (bn1): BatchNormAct2d(
            40, eps=0.001, momentum=0.1, affine=True, track_running_stats=True
            (drop): Identity()
            (act): SiLU(inplace=True)
          )
          (se): SqueezeExcite(
            (conv_reduce): Conv2d(40, 10, kernel_size=(1, 1), stride=(1, 1))
            (act1): SiLU(inplace=True)
            (conv_expand): Conv2d(10, 40, kernel_size=(1, 1), stride=(1, 1))
            (gate): Sigmoid()
          )
          (conv_pw): Conv2d(40, 24, kernel_size=(1, 1), stride=(

In [11]:
with open(os.path.join(INPUT_PATH, "onehotencoding", "one_hot_encoding.json"), "r") as f:
    one_hot_encoding = json.load(f)

# Iterating the decoded JSON yields the labels in the order they appear in the file.
labels = list(one_hot_encoding)

# Empty table with the identifier column followed by one column per label.
csv = pd.DataFrame(columns=['row_id', *labels])
csv

Unnamed: 0,row_id,abethr1,abhori1,abythr1,afbfly1,afdfly1,afecuc1,affeag1,afgfly1,afghor1,...,yebsto1,yeccan1,yefcan,yelbis1,yenspu1,yertin1,yesbar1,yespet1,yetgre1,yewgre1


In [12]:
import time

start = time.time()
result = []

# nn.Module.to moves the parameters in place and does nothing when they are
# already on `device`; the inputs below are sent to the same device.
model.to(device)
model.eval()

with torch.no_grad():
    for batch in test_loader:
        images = batch['image'].to(device)
        logits = model(images)

        # One-hot encode each segment against ITS OWN maximum. The previous
        # version compared row 0 of the batch with the maximum of row i, so
        # almost every segment ended up with an all-zero prediction.
        row_max = logits.max(dim=1, keepdim=True).values
        one_hot = (logits == row_max).int().cpu().tolist()

        for row_id, flags in zip(batch['row_id'], one_hot):
            result.append([row_id] + flags)

# Build the table in one call: this avoids quadratic .loc appends and keeps the
# cell idempotent (re-running it no longer duplicates rows).
csv = pd.DataFrame(result, columns=csv.columns)

print("Time : ", time.time() - start)
csv

Time :  39.28887748718262


Unnamed: 0,row_id,abethr1,abhori1,abythr1,afbfly1,afdfly1,afecuc1,affeag1,afgfly1,afghor1,...,yebsto1,yeccan1,yefcan,yelbis1,yenspu1,yertin1,yesbar1,yespet1,yetgre1,yewgre1
0,soundscape_29201_5,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,soundscape_29201_10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,soundscape_29201_15,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,soundscape_29201_20,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,soundscape_29201_25,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,soundscape_29201_580,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
116,soundscape_29201_585,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
117,soundscape_29201_590,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
118,soundscape_29201_595,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Write the prediction table to the submission file; index=False keeps the
# DataFrame index out of the CSV.
csv.to_csv('/kaggle/working/submission.csv', index=False)