In [1]:
import sys
sys.path.append('../input/bird-tools')
import noisereduce as nr
import torch

# Root directory of the competition data; other cells join file names onto it.
PATH_DATA = "../input/birdclef-2022"


class config:
    """Notebook-wide constants, read by other cells as `config.<name>`."""

    seed = 2022
    num_fold = 5

    # Audio windowing.
    sample_rate = 32_000
    duration = 5  # presumably seconds per analysis window -- confirm against the training notebook
    # Derived from the two values above (still 160_000) so it cannot drift
    # out of sync if either of them is edited.
    sampleNum = sample_rate * duration

    # Spectrogram settings. NOTE(review): none of the cells visible here read
    # these four values -- confirm whether they are still needed.
    n_fft = 1024
    win_length = 1024
    hop_length = 512
    n_mels = 64

    # Size of the classifier output layer.
    num_classes = 152
    learning_rate = 1e-3

    # Use the GPU when torch reports one, otherwise fall back to the CPU.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'


In [2]:
import torchvision,torch
# AlexNet backbone, created without pretrained weights; the trained weights
# are loaded from a checkpoint in a later cell.
model = torchvision.models.alexnet()
# Swap the first convolution for one that takes a single input channel
# (same kernel/stride/padding as the layer it replaces).
model.features[0] = torch.nn.Conv2d(1, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
# Size the final layer from the shared config instead of repeating the literal 152,
# so the head and the decision threshold cannot disagree on the class count.
model.classifier[6] = torch.nn.Linear(in_features=4096, out_features=config.num_classes, bias=True)


In [3]:
import os

# Dataset directory that holds the trained checkpoint.
PATH_Model = "../input/alex-nex"
model_path = os.path.join(PATH_Model, "model_v2.pt")

# Deserialize onto the CPU first, then move the whole module to the target device.
# NOTE(review): torch.load unpickles the file -- only load checkpoints you trust.
model.load_state_dict(
    torch.load(model_path, map_location='cpu')
)

# Module.to() and Module.eval() both return the module, so they can be chained.
# eval() switches layers such as Dropout to inference behaviour.
model = model.to(config.device).eval()
print(model)

AlexNet(
  (features): Sequential(
    (0): Conv2d(1, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
 

In [4]:
import noisereduce as nr
import torchaudio
# MFCC feature extractor applied to every audio window (the features are
# MFCCs, not a raw STFT).
# NOTE(review): these parameters must match the ones used to train the
# checkpoint -- do not retune them here without retraining. This cell's output
# shows torchaudio's "At least one mel filterbank has all zero values" warning;
# presumably it comes from the default mel settings -- confirm.
device = "cpu"
transform = torchaudio.transforms.MFCC(
    sample_rate=config.sample_rate,  # same value (32000) as before, now read from config
    n_mfcc=128,
    dct_type=2,
    norm='ortho',
    log_mels=False,
).to(device)

@torch.no_grad()
def create_spectrogram(fname, reduce_noise=False, channel=0):
    """Load an audio file and return MFCC features for its consecutive windows.

    The audio is down-mixed to mono, resampled to ``config.sample_rate`` when
    the file uses a different rate, optionally denoised, and cut into
    non-overlapping windows of ``config.duration`` seconds. A trailing partial
    window is zero-padded and kept only when it is more than 60 % full;
    otherwise it is dropped.

    Args:
        fname: Path of the audio file, passed to ``torchaudio.load``.
        reduce_noise: When true, run ``noisereduce.reduce_noise`` on the
            waveform before windowing.
        channel: Unused; kept so existing callers keep working.

    Returns:
        The output of the module-level ``transform`` applied to a batch of
        shape ``(num_windows, 1, samples_per_window)``.

    Raises:
        ValueError: If the audio is too short to yield a single window.
    """
    waveform, sample_rate = torchaudio.load(fname)

    # Change signal to mono.
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # `transform` is configured for config.sample_rate; feeding it audio at any
    # other rate would silently produce mis-scaled features.
    if sample_rate != config.sample_rate:
        waveform = torchaudio.functional.resample(waveform, sample_rate, config.sample_rate)
        sample_rate = config.sample_rate

    # Reduce noise.
    if reduce_noise:
        denoised = nr.reduce_noise(
            y=waveform.numpy(),  # hand noisereduce a NumPy array rather than a tensor
            sr=sample_rate,
            use_tqdm=True,
            n_jobs=3,
        )
        # Cast back explicitly so the features keep the waveform's dtype.
        waveform = torch.tensor(denoised, dtype=waveform.dtype)

    step = int(config.duration * sample_rate)
    remainder = waveform.size(-1) % step
    # Keep the trailing partial window only when it is more than 60 % full.
    if remainder > step * 0.6:
        waveform = torch.nn.functional.pad(waveform, (0, step - remainder), mode='constant', value=0.0)

    num_frames = waveform.size(-1) // step
    if num_frames == 0:
        # Without this guard, stacking an empty list fails with an opaque error.
        raise ValueError(
            f"{fname}: audio is too short ({waveform.size(-1)} samples) "
            f"to form a single {config.duration}-second window"
        )

    # Drop any unpadded tail, then cut into equal windows: (num_frames, 1, step).
    frames = torch.stack(torch.split(waveform[:, :num_frames * step], step, dim=-1))
    return transform(frames)

  "At least one mel filterbank has all zero values. "


In [5]:
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
# Training metadata, read from the shared dataset root instead of a second
# hard-coded copy of the path (the resulting path is unchanged).
# NOTE(review): `df` is not referenced by any other cell visible here -- confirm it is still needed.
df = pd.read_csv(os.path.join(PATH_DATA, 'train_metadata.csv'))

In [6]:
# Class labels saved next to the checkpoint.
# NOTE(review): allow_pickle=True unpickles the file -- only load trusted data.
encoder_list_path = PATH_Model+'/encoder_list.npy'
encoderList = np.load(encoder_list_path, allow_pickle=True)

# Rebuild the label <-> index mapping from the saved labels
# (inspect `encoder.classes_` to see the resulting order).
encoder = LabelEncoder()
# Left as the cell's final expression so the notebook still echoes the fitted encoder.
encoder.fit(encoderList)

LabelEncoder()

In [7]:
import json

# Species listed in scored_birds.json; only these get rows in the submission.
scored_birds_path = os.path.join(PATH_DATA, "scored_birds.json")
with open(scored_birds_path) as fp:
    scored_birds = json.load(fp)

# Show the species codes, then the class index the encoder assigns to each.
for shown in (scored_birds, encoder.transform(scored_birds)):
    print(shown)

['akiapo', 'aniani', 'apapan', 'barpet', 'crehon', 'elepai', 'ercfra', 'hawama', 'hawcre', 'hawgoo', 'hawhaw', 'hawpet1', 'houfin', 'iiwi', 'jabwar', 'maupar', 'omao', 'puaioh', 'skylar', 'warwhe1', 'yefcan']
[  3   6   7   9  44  46  47  60  62  63  64  65  67  70  72  90 101 111
 131 141 150]


In [8]:
import glob
import math

# Test soundscapes, located via the shared dataset root and sorted so the
# submission rows come out in the same order on every run.
TestPathAudio = sorted(glob.glob(os.path.join(PATH_DATA, "test_soundscapes", "*.ogg")))

# A bird is reported as present when its softmax probability exceeds the
# uniform-chance level 1 / num_classes.
threshold = 1/config.num_classes

# Class index of every scored species, in the order of `scored_birds`.
scored_indices = encoder.transform(scored_birds)
outputBirds = list(zip(scored_birds, scored_indices))

submission = []
for path_audio in TestPathAudio:
    # Strip only the extension (str.replace would remove every '.ogg' occurrence).
    filename = os.path.splitext(os.path.basename(path_audio))[0]

    spec = create_spectrogram(path_audio, reduce_noise=True).to(config.device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(spec)
        outputs = torch.nn.Softmax(dim=1)(outputs)

    # Move the scored columns to the CPU once per file instead of calling
    # .item() (one device sync) for every window/bird pair.
    scored_probs = outputs[:, scored_indices.tolist()].cpu().tolist()

    for i, window_probs in enumerate(scored_probs):
        # The row id ends with the window's end time; the window length must
        # match the one create_spectrogram uses to cut the audio.
        end_second = (i+1)*config.duration
        for bird, prob in zip(scored_birds, window_probs):
            submission.append({
                "row_id": filename + '_'+bird +'_' + str(end_second),
                "target": prob > threshold,
            })

In [9]:
# Fix the columns explicitly so an empty `submission` list (no test files found)
# still produces a valid, header-only CSV instead of a KeyError in set_index.
df_submission = pd.DataFrame(submission, columns=["row_id", "target"]).set_index("row_id")
df_submission.to_csv("submission.csv")

In [10]:
# Display the submission table as a sanity check of the row ids and targets.
df_submission

Unnamed: 0_level_0,target
row_id,Unnamed: 1_level_1
soundscape_453028782_akiapo_5,False
soundscape_453028782_aniani_5,False
soundscape_453028782_apapan_5,False
soundscape_453028782_barpet_5,False
soundscape_453028782_crehon_5,False
...,...
soundscape_453028782_omao_60,False
soundscape_453028782_puaioh_60,False
soundscape_453028782_skylar_60,False
soundscape_453028782_warwhe1_60,False
