In [1]:
# !pip install transformers
import sys
sys.path.append('../input/bird-filter-data')
import noisereduce as nr

In [2]:
import torch
import numpy as np
import random
import os
class config:
    num_fold = 10
    sample_rate= 32_000
    sampleNum = 32_000*5
    n_fft=1024
    win_length = 1024
    hop_length=512
    n_mels=64
    duration=5
    num_classes = 152
    train_batch_size = 128
    valid_batch_size = 128
    epochs = 5
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    learning_rate = 1e-3
    
config.device

'cuda'

In [3]:
import pandas as pd
import os
PATH_TRAIN_DATASET = "../input/bird-filter-data/Slice_data"
path_csv = os.path.join(PATH_TRAIN_DATASET, "Filter_Clip_Data.csv")
train_meta = pd.read_csv(path_csv)
train_meta.head()

Unnamed: 0,primary_label,secondary_labels,type,rating,filename,duration,segmentNum,seg_index
0,afrsil1,[],"['call', 'flight call']",2.5,afrsil1/XC125458.ogg,11.102031,2.0,0.0
1,afrsil1,[],"['call', 'flight call']",2.5,afrsil1/XC125458.ogg,11.102031,2.0,1.0
2,afrsil1,"['houspa', 'redava', 'zebdov']",['call'],3.5,afrsil1/XC175522.ogg,47.020406,9.0,0.0
3,afrsil1,"['houspa', 'redava', 'zebdov']",['call'],3.5,afrsil1/XC175522.ogg,47.020406,9.0,1.0
4,afrsil1,"['houspa', 'redava', 'zebdov']",['call'],3.5,afrsil1/XC175522.ogg,47.020406,9.0,2.0


In [4]:
train_meta = train_meta.dropna().reset_index(drop=True)

train_meta['new_filename'] = train_meta['filename'].str.replace('.ogg', '_') + train_meta['seg_index'].values.astype(int).astype(str) +'.ogg' 

  This is separate from the ipykernel package so we can avoid doing imports until


In [5]:
train_meta['new_filename'].str.len().max()

24

In [6]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
train_meta['primary_label_encoded'] = encoder.fit_transform(train_meta['primary_label'])

# save encoder

In [7]:
np.save('encoder_list.npy',encoder.classes_)


In [8]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=config.num_fold)
for k, (_, val_ind) in enumerate(skf.split(X=train_meta, y=train_meta['primary_label_encoded'])):
    train_meta.loc[val_ind, 'fold'] = k



In [9]:
def get_data(df,fold):
    train_df = df[~df['fold'].isin(fold)].reset_index(drop=True)
    valid_df = df[df['fold'].isin(fold)].reset_index(drop=True)
    return train_df,valid_df  

train_df,valid_df = get_data(train_meta,[7,8,9])

In [10]:
valid_df[['primary_label','filename']].groupby('primary_label').count().describe()


Unnamed: 0,filename
count,152.0
mean,215.309211
std,276.315642
min,1.0
25%,31.75
50%,88.0
75%,300.0
max,1191.0


In [11]:
train_df[['primary_label','filename']].groupby('primary_label').count().describe()


Unnamed: 0,filename
count,152.0
mean,502.407895
std,644.303067
min,1.0
25%,74.0
50%,205.5
75%,701.75
max,2779.0


In [12]:
# filename = os.path.join(PATH_TRAIN_DATASET, 'Slice_data',train_df.iloc[0].new_filename)
# waveform,_ = torchaudio.load(filename)

In [13]:
import torchaudio
# STFT
n_fft = 1024
win_length = 1024
hop_length = 512
transform = torchaudio.transforms.Spectrogram(
    n_fft = n_fft,           # freqGroup = n_fft//2 + 1
    win_length = win_length, # freq gap for each group
    hop_length = hop_length, # length = samples / hop_length
    center = True,
    pad_mode = 'reflect',
    power=2.0
).to('cpu')
transform

Spectrogram()

In [14]:
transform = torchaudio.transforms.MFCC(
    sample_rate = 32000, 
    n_mfcc = 128, 
    dct_type = 2, 
    norm = 'ortho', 
    log_mels = False, 
)
transform

  "At least one mel filterbank has all zero values. "


MFCC(
  (amplitude_to_DB): AmplitudeToDB()
  (MelSpectrogram): MelSpectrogram(
    (spectrogram): Spectrogram()
    (mel_scale): MelScale()
  )
)

In [15]:
# waveform,_ = torchaudio.load('../input/bird-filter-data/Slice_data/Slice_data/akekee/XC174953_0.ogg')
# waveform.shape

In [16]:
# spec = transform(waveform).unsqueeze(0)
# spec

In [17]:
from torch.utils.data import Dataset, DataLoader
import torchaudio
import random
class BirdClefDataset(Dataset):
    def __init__(self, df):
        self.audio_paths = df['new_filename'].values
        self.labels = df['primary_label_encoded'].values
        self.stretch = torchaudio.transforms.TimeStretch()
        self.sr = 32000
    def __len__(self):
        return len(self.audio_paths)
    
    def __getitem__(self, index):
        filename = os.path.join(PATH_TRAIN_DATASET, 'Slice_data',self.audio_paths[index])
        waveform,_ = torchaudio.load(filename)
        splitPoint = random.randint(self.sr,self.sr*4)
        newWaveform=torch.cat([waveform[:,splitPoint:],waveform[:,:splitPoint]],dim=1)
        label = torch.tensor(self.labels[index])
        
        return transform(newWaveform), label

In [18]:
import torch.nn as nn
import torch.nn.functional as F
# from transformers import Wav2Vec2ForSequenceClassification


In [19]:
def loss_fn(outputs, labels):
    return nn.CrossEntropyLoss()(outputs, labels)

# Train

In [20]:
from sklearn.metrics import f1_score

def train(model, data_loader, optimizer, scheduler, device, epoch):
    model.train()
    pred = []
    label = []
    
    running_loss = 0
    acc = 0
    loop = tqdm(data_loader, position=0)
    for i, (spec, labels) in enumerate(loop):
        spec = spec.to(device)
        labels = labels.to(device)
        
        outputs = model(spec)
        _, preds = torch.max(outputs, 1)
        acc += (preds==labels).sum()
        
        loss = loss_fn(outputs, labels)
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        if scheduler is not None:
            scheduler.step()
            
        running_loss += loss.item()
        pred.extend(preds.view(-1).cpu().detach().numpy())
        label.extend(labels.view(-1).cpu().detach().numpy())
        
        loop.set_description(f"Epoch [{epoch+1}/{config.epochs}]")
        loop.set_postfix(loss=loss.item())

    return running_loss/len(data_loader),acc/(len(data_loader)*config.train_batch_size)

In [21]:
def valid(model, data_loader, device, epoch):
    model.eval()
    
    acc = 0
    running_loss = 0
    pred = []
    label = []

    loop = tqdm(data_loader, position=0)
    for spec, labels in loop:
        spec = spec.to(device)
        labels = labels.to(device)
        
        outputs = model(spec)
        _, preds = torch.max(outputs, 1)
        acc += (preds==labels).sum()
        
        loss = loss_fn(outputs, labels)
            
        running_loss += loss.item()
        
        pred.extend(preds.view(-1).cpu().detach().numpy())
        label.extend(labels.view(-1).cpu().detach().numpy())
        
        loop.set_description(f"Epoch [{epoch+1}/{config.epochs}]")
        loop.set_postfix(loss=loss.item())
        
    valid_f1 = f1_score(label, pred, average='macro')
    
    return running_loss/len(data_loader), valid_f1,acc/(len(data_loader)*config.valid_batch_size)

In [22]:
train_dataset = BirdClefDataset(train_df)
valid_dataset = BirdClefDataset(valid_df)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=config.train_batch_size, shuffle=True,num_workers=os.cpu_count(),pin_memory=(torch.cuda.is_available()))
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=config.valid_batch_size, shuffle=True,num_workers=os.cpu_count(),pin_memory=(torch.cuda.is_available()))

In [23]:
import torchvision,torch
model = torchvision.models.alexnet()
model.features[0] = torch.nn.Conv2d(1, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
model.classifier[6] = torch.nn.Linear(in_features=4096, out_features=152, bias=True)

# import torchvision.models as models

# # You will need the number of filters in the `fc` for future use.
# # Here the size of each output sample is set to 2.
# # Alternatively, it can be generalized to nn.Linear(num_ftrs, len(class_names)).
# model = models.resnet18(progress=True)
# model.fc = nn.Linear(model.fc.in_features, config.num_classes) 
# model.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)

# model.load_state_dict(torch.load('../input/alex-nex/model.pt',map_location='cpu'))
model = model.to(config.device)

In [31]:
from tqdm import tqdm
 
# from transformers import Wav2Vec2Model, Wav2Vec2Config

# configuration = Wav2Vec2Config(num_labels=config.num_classes)
# model = Wav2Vec2ForSequenceClassification(configuration)


# PATH_Model = "../input/model-02f1"
# model_path = os.path.join(PATH_Model, "model.pt")
# model = BirdClefModel()
# model.load_state_dict(torch.load(model_path,map_location='cpu'))


optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, eta_min=1e-5, T_max=5)

for epoch in range(5):
    train_loss,train_acc = train(model, train_loader, optimizer, scheduler, config.device, epoch)
    valid_loss, valid_f1,valid_acc = valid(model, valid_loader, config.device, epoch)
    print('train Loss: ',train_loss,'train_acc: ',train_acc.item(),'valid Loss: ',valid_loss,'valid_acc: ',valid_acc.item()," vaild_F1",valid_f1)

Epoch [1/5]: 100%|██████████| 597/597 [09:41<00:00,  1.03it/s, loss=1.39] 
Epoch [1/5]: 100%|██████████| 256/256 [03:50<00:00,  1.11it/s, loss=2.47]


train Loss:  1.3411532507669586 train_acc:  0.658841073513031 valid Loss:  2.5094944811426103 valid_acc:  0.466827392578125  vaild_F1 0.29151476260665066


Epoch [2/5]: 100%|██████████| 597/597 [09:33<00:00,  1.04it/s, loss=1.09] 
Epoch [2/5]: 100%|██████████| 256/256 [03:54<00:00,  1.09it/s, loss=2.56]


train Loss:  1.238934044362712 train_acc:  0.6797136664390564 valid Loss:  2.547313285525888 valid_acc:  0.474090576171875  vaild_F1 0.30047273404102187


Epoch [3/5]: 100%|██████████| 597/597 [09:27<00:00,  1.05it/s, loss=1.16] 
Epoch [3/5]: 100%|██████████| 256/256 [03:50<00:00,  1.11it/s, loss=2.42]


train Loss:  1.1685747469290217 train_acc:  0.6983485221862793 valid Loss:  2.5634861811995506 valid_acc:  0.4747314453125  vaild_F1 0.30477544946836643


Epoch [4/5]: 100%|██████████| 597/597 [09:35<00:00,  1.04it/s, loss=0.884]
Epoch [4/5]: 100%|██████████| 256/256 [03:54<00:00,  1.09it/s, loss=2.27]


train Loss:  1.1154910587585551 train_acc:  0.7075219750404358 valid Loss:  2.556160689331591 valid_acc:  0.47625732421875  vaild_F1 0.3118335439931455


Epoch [5/5]: 100%|██████████| 597/597 [09:42<00:00,  1.02it/s, loss=1.31] 
Epoch [5/5]: 100%|██████████| 256/256 [03:57<00:00,  1.08it/s, loss=2.54]

train Loss:  1.0358476760599082 train_acc:  0.7259736061096191 valid Loss:  2.5509622539393604 valid_acc:  0.478912353515625  vaild_F1 0.31593017781733423





In [25]:
model

AlexNet(
  (features): Sequential(
    (0): Conv2d(1, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
 

In [32]:
torch.save(model.state_dict(),"model_v3.pt")
