Copyright (c) Microsoft Corporation. All rights reserved.
Licensed under the MIT License.

In [None]:
# Optional environment setup: uncomment the lines below to install the pinned
# dependencies (the torch wheels target the CUDA 11.3 build, "cu113").
# !pip install matplotlib
# !pip install opencv-python==4.5.5.64
# !pip install torch==1.10.2+cu113 torchvision==0.11.3+cu113 torchaudio==0.10.2+cu113 --extra-index-url https://download.pytorch.org/whl/cu113

In [1]:
import torch
import torch.optim as optim
import torch.utils.data as data_utils
import os
import numpy as np
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
import csv
import pandas as pd
import matplotlib.pyplot as plt
import glob
import gc
import h5py
import pickle as pk

from utils import log_results, SaveBestModel, train_seq, test_seq
from utils import normalize_mel_sp_slides

from models import cnn_rnn

In [2]:
# Report whether PyTorch can see a CUDA device in this environment.
torch.cuda.is_available()

True

In [3]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [4]:
# Display the device selected above.
device

device(type='cuda', index=0)

# Set directories

In [5]:
# Locations used throughout the notebook:
#   dataDir    - where the input data files are read from
#   resultsDir - where per-experiment result logs are written
#   tempDir    - where the best-model checkpoints are stored during training
dataDir = './data'
resultsDir = 'Results'
tempDir = 'temp'

# exist_ok=True makes the creation idempotent and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(resultsDir, exist_ok=True)
os.makedirs(tempDir, exist_ok=True)

# Load data

In [6]:
# !conda install numpy pytables

In [7]:
# One-time conversion, kept for reference (uncomment to regenerate the cache):
# read the 'mel_spectr' dataset and the 'info' table from the HDF5 file, then
# save them as ./data/mel_sp.npy and ./data/metadata_total.pkl.
# fname = 'birds_xeno_spectr_slide_105_species_sr_32000_len_7_sec_500_250_New.h5'
# fileLoc = os.path.join(dataDir,fname)
# hf = h5py.File(fileLoc, 'r')
# mel_sp = hf.get('mel_spectr')[()]
# metadata_total = pd.read_hdf(fileLoc, 'info')
# hf.close()

# ## Saving files

# np.save('./data/mel_sp.npy', mel_sp)
# metadata_total.to_pickle('./data/metadata_total.pkl', protocol=4)

In [7]:
# Load the cached spectrogram array and its metadata table from dataDir.
# (numpy, pandas and os are already imported in the notebook's import cell.)
# NOTE: unpickling can execute arbitrary code - only load pickle files that
# were produced by a trusted source.
mel_sp = np.load(os.path.join(dataDir, 'mel_sp.npy'))
metadata_total = pd.read_pickle(os.path.join(dataDir, 'metadata_total.pkl'))


In [8]:
# Species names, one per sample, taken from the metadata table.
original_label = list(metadata_total['species'])

# Two encodings of the same labels: one-hot rows and integer class ids.
lb_bin, lb_enc = LabelBinarizer(), LabelEncoder()
labels_one_hot = lb_bin.fit_transform(original_label)
labels_multi_lbl = lb_enc.fit_transform(original_label)

number_of_sample_classes = len(lb_enc.classes_)
print("Number of Species: ", number_of_sample_classes)

# Map each species name to its position in the binarizer's class list.
species_id_class_dict_tp = {
    species_id: class_label
    for class_label, species_id in enumerate(lb_bin.classes_)
}

Number of Species:  105


In [9]:
# Normalize each sample, cast it to float32 and insert a singleton axis at
# position -3, then combine all samples into a single NumPy array.
mel_sp_normalized = np.array([
    np.expand_dims(normalize_mel_sp_slides(sample).astype('float32'), axis=-3)
    for sample in mel_sp
])

In [10]:
# Training hyperparameters
batch_size = 32          # mini-batch size for the training loader
shuffleBatches = True    # reshuffle the training set for every epoch
num_epoch = 50           # number of training epochs per fold

## CNN configs

In [11]:
# Each list is consumed by the cnn_rnn model; entries are integers or the
# marker 'M'. One row per group of entries ending in 'M'.

# CNN1
cfg_cnn = [
    32, 'M',
    64, 64, 'M',
    128, 128, 128, 'M',
    128, 128, 128, 'M',
]
# n_units = 128*2

# CNN2
cfg_cnn2 = [
    32, 64, 'M',
    64, 64, 64, 'M',
    128, 128, 128, 'M',
    128, 128, 128, 'M',
    256, 256, 256, 'M',
]
# n_units = 256*2

# CNN3
cfg_cnn3 = [
    64, 64, 'M',
    128, 128, 'M',
    256, 256, 256, 'M',
    512, 512, 512, 'M',
    512, 512, 512, 'M',
]
n_units = 2 * 512

## RNN configs

For the RNN, a list of configs can be provided to test multiple configurations.

Each configuration element is a dictionary whose keys are the 'ordered' names of the required RNN cells. For example, to stack 2 layers of GRUs, use 'GRU_0', 'GRU_1'; similarly, for 1 GRU followed by 1 LMU, use 'GRU_0', 'LMU_1'; conversely, to use an LMU and then a GRU, use 'LMU_0', 'GRU_1'. Currently supported RNN cells are LSTM, GRU, and LMU.

Each key maps to another dictionary with the following entries:
- `input_size`: input dimension of this RNN cell.
- `h_states_ctr`: number of inner states in the RNN cell (2 for LSTM, 1 for GRU, 2 for LMU).

In [12]:
hidden_size = 512

# One entry per experiment. Keys are the ordered '<CELL>_<position>' names;
# the first cell takes n_units inputs, the second takes hidden_size inputs.
rnnConfigs = [
    # 2 layers of LSTM cell
    dict(
        LSTM_0=dict(input_size=n_units, h_states_ctr=2),
        LSTM_1=dict(input_size=hidden_size, h_states_ctr=2),
    ),
    # 2 layers of LMU cell
    dict(
        LMU_0=dict(input_size=n_units, h_states_ctr=2),
        LMU_1=dict(input_size=hidden_size, h_states_ctr=2),
    ),
    # 2 layers of GRU cell
    dict(
        GRU_0=dict(input_size=n_units, h_states_ctr=1),
        GRU_1=dict(input_size=hidden_size, h_states_ctr=1),
    ),
    # 1 GRU cell and then 1 LMU cell
    dict(
        GRU_0=dict(input_size=n_units, h_states_ctr=1),
        LMU_1=dict(input_size=hidden_size, h_states_ctr=2),
    ),
]

Make sure to assign a different `exp_no` to each experiment.

In [13]:
# device = torch.device('cpu')

In [None]:
# Cross-validated training of the CNN+RNN model, one experiment per entry of
# rnnConfigs. For every experiment:
#   * a results log is opened under resultsDir,
#   * the data is split with stratified 5-fold CV; each held-out fold is split
#     50/50 (stratified) into a validation and a test set,
#   * a fresh model is trained for num_epoch epochs, checkpointing the weights
#     with the best validation accuracy under tempDir,
#   * the best checkpoint is reloaded and scored on the test set,
#   * per-fold metrics are appended to the results log.
#
# NOTE(review): the output saved with this cell shows roughly one third of the
# samples held out per fold, which does not match n_splits=5 - it looks like it
# was produced by an earlier run with a different setting; re-run to refresh.
# NOTE(review): the saved output also ends with
# "AttributeError: 'tuple' object has no attribute 'A'" right after a fold's
# shapes are printed and before any epoch output - presumably raised while the
# model was being built; check the `models` module.


def _build_cnn_rnn(rnn_cfg):
    """Create a fresh cnn_rnn model for `rnn_cfg` and move it to `device`.

    Uses the CNN3 layout (cfg_cnn3), the notebook-level hidden_size and one
    output per species class found in the data.
    """
    net = cnn_rnn(cnnConfig=cfg_cnn3,
                  rnnConfig=rnn_cfg,
                  hidden_size=hidden_size,
                  # order=order,
                  # theta=theta,
                  num_classes=number_of_sample_classes)
    net.to(device)
    return net


exp_no_base = 0
eval_batch_size = 32  # batch size of the validation/test loaders (evaluation only)

for ii, cfg in enumerate(rnnConfigs):
    # Experiment numbers are 1-based and offset by exp_no_base.
    exp_ctr = ii + 1
    exp_no = exp_no_base + exp_ctr

    log_file_name = f'100_species_spectr_cnn_rnn_7sec_h_{hidden_size}_nl_{ii+1}_{exp_no}.p'
    store_ = log_results(file_name=log_file_name, results_dir = resultsDir)

    skf = StratifiedKFold(n_splits=5)
    for exp_ind, (train_ind, test_ind) in enumerate(skf.split(mel_sp_normalized, labels_multi_lbl)):

        # One checkpoint file per (experiment, fold).
        PATH_curr = os.path.join(tempDir, f'currentModel_cnn_rnn_{exp_no}_{exp_ind}.pt')
        saveModel = SaveBestModel(PATH=PATH_curr, monitor=-np.inf, verbose=True)

        X_train, X_test_p_valid = mel_sp_normalized[train_ind,:], mel_sp_normalized[test_ind,:]
        y_train, y_test_p_valid = labels_one_hot[train_ind], labels_one_hot[test_ind]

        # Integer labels of the held-out fold drive the stratified val/test split.
        y_test_p_valid_mlbl = labels_multi_lbl[test_ind]
        X_valid, X_test, y_valid, y_test = train_test_split(X_test_p_valid, y_test_p_valid,
                                                            test_size=0.5,
                                                            stratify=y_test_p_valid_mlbl,
                                                            random_state=42)

        print('X_train shape: ', X_train.shape)
        print('X_valid shape: ', X_valid.shape)
        print('X_test shape: ', X_test.shape)

        X_train, X_valid = torch.from_numpy(X_train).float(), torch.from_numpy(X_valid).float()
        y_train, y_valid = torch.from_numpy(y_train).float(), torch.from_numpy(y_valid).float()

        train_use = data_utils.TensorDataset(X_train, y_train)
        train_loader = data_utils.DataLoader(train_use, batch_size=batch_size, shuffle=shuffleBatches)

        val_use = data_utils.TensorDataset(X_valid, y_valid)
        val_loader = data_utils.DataLoader(val_use, batch_size=eval_batch_size, shuffle=False)

        model = _build_cnn_rnn(cfg)
        optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001, weight_decay=1e-7)

        val_acc_epochs = []
        val_loss_epochs = []
        for epoch in range(1, num_epoch+1):
            train_loss = train_seq(model, train_loader, optimizer, epoch,
                                   device,
                                   verbose=1, loss_fn = 'bceLogit')
            val_loss, val_acc = test_seq(model, val_loader,
                                         device,
                                         loss_fn = 'bceLogit')
            val_acc_epochs.append(val_acc)
            val_loss_epochs.append(val_loss)
            print('val loss = %f, val acc = %f'%(val_loss, val_acc))
            # Checkpoint whenever the validation accuracy improves.
            saveModel.check(model, val_acc, comp='max')

        # loading best validated model
        model = _build_cnn_rnn(cfg)
        # map_location keeps the reload working if the checkpoint was written
        # from a different device than the one currently selected.
        model.load_state_dict(torch.load(PATH_curr, map_location=device))

        X_test, y_test = torch.from_numpy(X_test).float(), torch.from_numpy(y_test).float()

        test_use = data_utils.TensorDataset(X_test, y_test)
        test_loader = data_utils.DataLoader(test_use, batch_size=eval_batch_size, shuffle=False)
        test_loss, test_acc = test_seq(model, test_loader,
                                       device,
                                       loss_fn = 'bceLogit')
        print('test loss = %f, test acc = %f'%(test_loss, test_acc))

        log_ = dict(
                exp_ind = exp_ind,
                epochs = num_epoch,
                validation_accuracy = val_acc_epochs,
                validation_loss = val_loss_epochs,
                test_loss = test_loss,
                test_accuracy = test_acc,
                X_train_shape = X_train.shape,
                X_valid_shape = X_valid.shape,
                batch_size =batch_size,
        )
        store_.update(log_)

X_train shape:  (34965, 26, 1, 128, 32)
X_valid shape:  (8767, 26, 1, 128, 32)
X_test shape:  (8768, 26, 1, 128, 32)


val loss = 0.037381, val acc = 0.279571
saving best model...


val loss = 0.031803, val acc = 0.400023
saving best model...


val loss = 0.021990, val acc = 0.605566
saving best model...


val loss = 0.019096, val acc = 0.659519
saving best model...


val loss = 0.016868, val acc = 0.709137
saving best model...


val loss = 0.016561, val acc = 0.713471
saving best model...


val loss = 0.014012, val acc = 0.763317
saving best model...


val loss = 0.013640, val acc = 0.772442
saving best model...


val loss = 0.011773, val acc = 0.797422
saving best model...


val loss = 0.014602, val acc = 0.775636


val loss = 0.012395, val acc = 0.806775
saving best model...


val loss = 0.011853, val acc = 0.813049
saving best model...


val loss = 0.012557, val acc = 0.803582


val loss = 0.013883, val acc = 0.798563


val loss = 0.014509, val acc = 0.792175


val loss = 0.010847, val acc = 0.840082
saving best model...


val loss = 0.013126, val acc = 0.810197


val loss = 0.011294, val acc = 0.831870


val loss = 0.011893, val acc = 0.846812
saving best model...


val loss = 0.012572, val acc = 0.839056


val loss = 0.018295, val acc = 0.784191


val loss = 0.012405, val acc = 0.835748


val loss = 0.011953, val acc = 0.850120
saving best model...


val loss = 0.011633, val acc = 0.842934


val loss = 0.013319, val acc = 0.833580


val loss = 0.012642, val acc = 0.840196


val loss = 0.012911, val acc = 0.833809


val loss = 0.011898, val acc = 0.861298
saving best model...


val loss = 0.014198, val acc = 0.814304


val loss = 0.013409, val acc = 0.842820


val loss = 0.011920, val acc = 0.860728


val loss = 0.013960, val acc = 0.850576


val loss = 0.012401, val acc = 0.862439
saving best model...


val loss = 0.013005, val acc = 0.855253


val loss = 0.011947, val acc = 0.855025


val loss = 0.013329, val acc = 0.846812


val loss = 0.012399, val acc = 0.851260


val loss = 0.014445, val acc = 0.850234


val loss = 0.014085, val acc = 0.851717


val loss = 0.013356, val acc = 0.848523


val loss = 0.013246, val acc = 0.858675


val loss = 0.016273, val acc = 0.832782


val loss = 0.013088, val acc = 0.860386


val loss = 0.013881, val acc = 0.847382


val loss = 0.012836, val acc = 0.863693
saving best model...


val loss = 0.012769, val acc = 0.856507


val loss = 0.014792, val acc = 0.842135


val loss = 0.012469, val acc = 0.862439


val loss = 0.013711, val acc = 0.844302


val loss = 0.015269, val acc = 0.851717
test loss = 0.012296, test acc = 0.860972
X_train shape:  (34965, 26, 1, 128, 32)
X_valid shape:  (8767, 26, 1, 128, 32)
X_test shape:  (8768, 26, 1, 128, 32)


val loss = 0.038006, val acc = 0.260294
saving best model...


val loss = 0.026580, val acc = 0.517281
saving best model...


val loss = 0.026914, val acc = 0.511692


val loss = 0.016760, val acc = 0.703091
saving best model...


val loss = 0.016801, val acc = 0.698186


val loss = 0.013709, val acc = 0.753393
saving best model...


val loss = 0.013758, val acc = 0.764686
saving best model...


val loss = 0.011525, val acc = 0.799019
saving best model...


val loss = 0.012882, val acc = 0.780313


val loss = 0.010992, val acc = 0.817155
saving best model...


val loss = 0.011064, val acc = 0.821376
saving best model...


val loss = 0.012095, val acc = 0.816699


val loss = 0.013269, val acc = 0.797308


val loss = 0.011989, val acc = 0.819094


val loss = 0.010935, val acc = 0.831413
saving best model...


val loss = 0.010673, val acc = 0.837345
saving best model...


val loss = 0.011172, val acc = 0.838143
saving best model...


val loss = 0.011793, val acc = 0.829474


val loss = 0.011135, val acc = 0.841565
saving best model...


val loss = 0.010650, val acc = 0.855823
saving best model...


val loss = 0.011098, val acc = 0.850006


val loss = 0.011296, val acc = 0.845443


val loss = 0.009999, val acc = 0.863123
saving best model...


val loss = 0.011594, val acc = 0.835862


val loss = 0.010675, val acc = 0.854910


val loss = 0.009965, val acc = 0.872362
saving best model...


val loss = 0.013069, val acc = 0.846242


val loss = 0.011686, val acc = 0.851945


val loss = 0.011861, val acc = 0.854910


val loss = 0.011344, val acc = 0.853428


val loss = 0.010635, val acc = 0.865975


val loss = 0.012861, val acc = 0.850576


val loss = 0.014329, val acc = 0.836774


val loss = 0.012955, val acc = 0.845671


val loss = 0.012315, val acc = 0.853199


val loss = 0.011517, val acc = 0.863922


val loss = 0.012888, val acc = 0.841109


val loss = 0.011720, val acc = 0.855709


val loss = 0.010732, val acc = 0.872020


val loss = 0.011879, val acc = 0.865404


val loss = 0.011252, val acc = 0.857648


val loss = 0.011876, val acc = 0.872020


val loss = 0.012523, val acc = 0.848637


val loss = 0.012369, val acc = 0.863009


val loss = 0.012326, val acc = 0.864264


val loss = 0.012214, val acc = 0.865861


val loss = 0.012278, val acc = 0.867914


val loss = 0.013499, val acc = 0.850348


val loss = 0.015589, val acc = 0.846128


val loss = 0.012614, val acc = 0.843276
test loss = 0.010999, test acc = 0.858234
X_train shape:  (35070, 26, 1, 128, 32)
X_valid shape:  (8715, 26, 1, 128, 32)
X_test shape:  (8715, 26, 1, 128, 32)


val loss = 0.038189, val acc = 0.247963
saving best model...


val loss = 0.029717, val acc = 0.439816
saving best model...


val loss = 0.022141, val acc = 0.588296
saving best model...


val loss = 0.017670, val acc = 0.688583
saving best model...


val loss = 0.016367, val acc = 0.710155
saving best model...


val loss = 0.013670, val acc = 0.763052
saving best model...


val loss = 0.012766, val acc = 0.781411
saving best model...


val loss = 0.012097, val acc = 0.790476
saving best model...


val loss = 0.012132, val acc = 0.791050
saving best model...


val loss = 0.013299, val acc = 0.773953


val loss = 0.012960, val acc = 0.790361


val loss = 0.010260, val acc = 0.832243
saving best model...


val loss = 0.010759, val acc = 0.830866


val loss = 0.011666, val acc = 0.829489


val loss = 0.011979, val acc = 0.821687


val loss = 0.009823, val acc = 0.850373
saving best model...


val loss = 0.011246, val acc = 0.841767


val loss = 0.011606, val acc = 0.838898


val loss = 0.011099, val acc = 0.837636


val loss = 0.010690, val acc = 0.849111


val loss = 0.012384, val acc = 0.837407


val loss = 0.013414, val acc = 0.834653


val loss = 0.012526, val acc = 0.834309


val loss = 0.012106, val acc = 0.838554


val loss = 0.013205, val acc = 0.818589


val loss = 0.017740, val acc = 0.792542


val loss = 0.012046, val acc = 0.852897
saving best model...


val loss = 0.012110, val acc = 0.852324


val loss = 0.013112, val acc = 0.840734


val loss = 0.012489, val acc = 0.857258
saving best model...


val loss = 0.011470, val acc = 0.860356
saving best model...


val loss = 0.012616, val acc = 0.839702


val loss = 0.011699, val acc = 0.855995


val loss = 0.012917, val acc = 0.848193


val loss = 0.014193, val acc = 0.839702


val loss = 0.012349, val acc = 0.864601
saving best model...


val loss = 0.011635, val acc = 0.872748
saving best model...


val loss = 0.011493, val acc = 0.856110


val loss = 0.011577, val acc = 0.862880


val loss = 0.014674, val acc = 0.843718


val loss = 0.013004, val acc = 0.855766


val loss = 0.012710, val acc = 0.868847


val loss = 0.012483, val acc = 0.851979


val loss = 0.013537, val acc = 0.865290


val loss = 0.012316, val acc = 0.863569


val loss = 0.013592, val acc = 0.839472


val loss = 0.012572, val acc = 0.867585


val loss = 0.012287, val acc = 0.868847


val loss = 0.012968, val acc = 0.863454


val loss = 0.011906, val acc = 0.872060
test loss = 0.011691, test acc = 0.867126
X_train shape:  (34965, 26, 1, 128, 32)
X_valid shape:  (8767, 26, 1, 128, 32)
X_test shape:  (8768, 26, 1, 128, 32)


AttributeError: 'tuple' object has no attribute 'A'

In [2]:
# Report the PyTorch build in use (torch is imported in the notebook's import cell).
print(torch.__version__)
print(torch.version.cuda)
# get_device_name() raises when no CUDA device is usable, so only query it when
# one is available; this keeps the cell runnable with the CPU fallback.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
else:
    print('No CUDA device available')

1.10.2+cu113
11.3
NVIDIA RTX A6000


In [17]:
# Shell escape: run the nvidia-smi command-line tool and show its output.
!nvidia-smi

Wed Nov 26 14:10:52 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX A6000               Off |   00000000:17:00.0 Off |                  Off |
| 39%   68C    P0             93W /  300W |    1551MiB /  49140MiB |      1%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA RTX A6000               Off |   00