# Project Description : 
https://www.notion.so/RFCX-9259c2261dda4a29a830e588e26e1b7e



In [None]:
import os,re,random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt  
from pathlib import Path
from IPython.display import Audio
import librosa
import librosa.display
from kaggle_datasets import KaggleDatasets
from sklearn.model_selection import StratifiedKFold
import io
import soundfile as sf
from tqdm import tqdm
import cv2
import random
from skimage.transform import resize
import tensorflow as tf
from PIL import Image

In [None]:
# Detect and initialise the TPU. When no TPU can be resolved, fall back to the
# default distribution strategy so the rest of the notebook still runs.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
except ValueError:
    # NOTE(review): ValueError is what the usual Kaggle TPU template catches
    # for the "no TPU attached" case - confirm for the installed TF version.
    tpu = None

if tpu is not None:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy()

# Shared tf.data tuning constant and the replica count used to scale
# BATCH_SIZE / LEARNING_RATE further down.
AUTO = tf.data.experimental.AUTOTUNE
REPLICAS = strategy.num_replicas_in_sync
print("All devices: ", tf.config.list_logical_devices('TPU'))

### Load Data

In [None]:
# Matches the "-<count>." part of a TFRecord file name such as "00-120.tfrec".
_RECORD_COUNT_RE = re.compile(r'-([0-9]+)\.')


def count_data_items(filenames):
    """Return the total number of records encoded in TFRecord file names.

    Each file name is expected to contain ``-<count>.`` (for example
    ``00-120.tfrec``); the counts of all files are summed.

    Args:
        filenames: Iterable of file names / paths.

    Returns:
        The summed count as a plain ``int`` (``0`` for an empty iterable).

    Raises:
        ValueError: If a file name carries no ``-<count>.`` component.
    """
    total = 0
    for filename in filenames:
        match = _RECORD_COUNT_RE.search(filename)
        if match is None:
            raise ValueError(
                'Cannot read the record count from file name: {!r}'.format(filename))
        total += int(match.group(1))
    return total

# Annotation tables shipped with the competition: true-positive and
# false-positive labels.
with open('/kaggle/input/rfcx-species-audio-detection/train_tp.csv') as f:
    TRAIN_TP = pd.read_csv(f)
with open('/kaggle/input/rfcx-species-audio-detection/train_fp.csv') as f:
    TRAIN_FP = pd.read_csv(f)

# TFRecord shards are globbed from the dataset's GCS bucket.
GCS_PATH = KaggleDatasets().get_gcs_path()
TRAINING_FILENAMES = tf.io.gfile.glob(GCS_PATH + '/tfrecords/train/*.tfrec')
TEST_FILENAMES = tf.io.gfile.glob(GCS_PATH + '/tfrecords/test/*.tfrec')


# Quick sanity report: rows/columns of each table and total record counts.
print(f'Train_TP size {len(TRAIN_TP)} {len(TRAIN_TP.columns)}')
print(f'Train_FP size {len(TRAIN_FP)} {len(TRAIN_FP.columns)}')
n_train_records = count_data_items(TRAINING_FILENAMES)
print(f'TRAINIG_Filenames size {n_train_records}')
n_test_records = count_data_items(TEST_FILENAMES)
print(f'TEST_Filenames size {n_test_records}')

### Model Parameters
- Starter code from [**Rainforest-Audio classification Tensorflow starter**](https://www.kaggle.com/dimitreoliveira/rainforest-audio-classification-tensorflow-starter#Spectrogram)

In [None]:
# Model input geometry (spectrogram image fed to the network) and class count.
HEIGHT, WIDTH, CHANNELS = 224, 512, 3
N_CLASSES = 24

# Optimisation settings; batch size and learning rate scale with the number
# of replicas in the distribution strategy.
BATCH_SIZE = REPLICAS * 16
LEARNING_RATE = REPLICAS * 1e-3
EPOCHS = 15
ES_PATIENCE = 3

# Test-time augmentation rounds; TTA is performed only when this is > 0.
TTA_STEPS = 6

### Helper Functions


In [None]:
def decode_audio(audio_binary):
    """Decode WAV bytes into a float waveform tensor.

    The trailing (channel) axis is squeezed away, so this assumes
    single-channel audio - TODO confirm for all inputs.
    """
    decoded = tf.audio.decode_wav(audio_binary)
    waveform = decoded[0]  # decode_wav yields (audio, sample_rate)
    return tf.squeeze(waveform, axis=-1)

def string_split_semicolon(column):
    """Split a string tensor on ';' and return the pieces."""
    return tf.strings.split(column, sep=';')

def string_split_comma(column):
    """Split a string tensor on ',' and return the pieces."""
    return tf.strings.split(column, sep=',')

def get_label_info(label_info):
    """Break a raw ``label_info`` string into per-label field lists.

    The string is split on ';' (one entry per label), doubled quote
    characters are stripped, and each entry is then split on ','.
    """
    entries = string_split_semicolon(label_info)
    unquoted = tf.strings.regex_replace(entries, '""', "")
    return string_split_comma(unquoted)

# all wave's sample rate = 48000 
def get_spectrogram(waveform, padding=False, min_padding=48000):
    """Return the magnitude STFT spectrogram of a 1-D waveform.

    Args:
        waveform: 1-D tensor of audio samples (cast to float32 here).
        padding: When true, zero-pad the waveform at the end up to
            ``min_padding`` samples before the transform.
        min_padding: Target minimum length in samples used when ``padding``
            is true. Waveforms already at least this long are left as is.

    Returns:
        ``tf.abs`` of ``tf.signal.stft`` computed with ``frame_length=2048``,
        ``frame_step=512`` and ``fft_length=2048``.
    """
    waveform = tf.cast(waveform, tf.float32)
    if padding:
        # Clamp at zero so an input longer than min_padding does not request
        # a negative-sized padding tensor.
        missing = tf.maximum(min_padding - tf.shape(waveform)[0], 0)
        zero_padding = tf.zeros([missing], dtype=tf.float32)
        waveform = tf.concat([waveform, zero_padding], 0)
    # Short-time Fourier transform of the signal.
    spectrogram = tf.signal.stft(waveform, frame_length=2048, frame_step=512, fft_length=2048)
    return tf.abs(spectrogram)


# The function was originally defined under this misspelled name while its
# callers in this file use `get_spectrogram`; keep both names available.
get_spectogram = get_spectrogram

def get_spectrogram_tf(example):
    """Replace ``example['audio_wav']`` with its spectrogram.

    The spectrogram comes from ``get_spectrogram`` and gets a trailing
    axis of size 1 appended. The passed dictionary is updated in place and
    returned.
    """
    magnitudes = get_spectrogram(example['audio_wav'])
    example['audio_wav'] = tf.expand_dims(magnitudes, -1)
    return example

def prepare_sample(example):
    """Resize ``example['audio_wav']`` to the model input size and make it RGB.

    The image is resized to ``[HEIGHT, WIDTH]`` and its single channel is
    replicated with ``tf.image.grayscale_to_rgb``. The passed dictionary is
    updated in place and returned.
    """
    resized = tf.image.resize(example['audio_wav'], [HEIGHT, WIDTH])
    example['audio_wav'] = tf.image.grayscale_to_rgb(resized)
    return example

def crop_audio(audio, tmin, tmax, crop_size=10, sample_rate=48000, max_size=60):
    """Cut a fixed-length window around the labelled event out of ``audio``.

    The window starts at ``tmin`` whenever ``crop_size`` seconds still fit
    before ``max_size``; otherwise it is anchored so that it ends at ``tmax``.

    Args:
        audio: 1-D waveform tensor.
        tmin: Start of the labelled event, in seconds.
        tmax: End of the labelled event, in seconds.
        crop_size: Length of the returned window, in seconds.
        sample_rate: Samples per second of ``audio``.
        max_size: Assumed total length of the recording, in seconds.

    Returns:
        A 1-D tensor reshaped to exactly ``crop_size * sample_rate`` samples
        (the reshape fails if the recording is too short to provide them).
    """
    tmin = tf.cast(tmin, tf.float32)
    tmax = tf.cast(tmax, tf.float32)
    window = tf.cast(crop_size, tf.float32)
    latest_start = tf.cast(max_size, tf.float32) - window

    # Late events: end the window at tmax. The previous code started the slice
    # at (tmin - crop_size) and then trimmed it to crop_size samples, which
    # produced [tmin - crop_size, tmin] and cut the labelled event out.
    start = tf.where(tmin <= latest_start, tmin, tmax - window)
    # Keep the window inside [0, max_size].
    start = tf.clip_by_value(start, 0.0, latest_start)

    cut_size = tf.cast((crop_size * sample_rate), tf.int32)
    cut_min = tf.cast(start * tf.cast(sample_rate, tf.float32), tf.int32)
    # Derive the end from the start so float truncation can never make the
    # slice one sample short or long.
    audio = audio[cut_min:cut_min + cut_size]
    # Pin the expected length.
    audio = tf.reshape(audio, [cut_size])
    return audio

def random_crop_audio(audio, crop_size=10, sample_rate=48000, max_size=60):
    """Return a randomly positioned ``crop_size``-second window of ``audio``.

    The start second is an int32 drawn by ``tf.random.uniform`` with
    ``minval=0`` and ``maxval=max_size - crop_size``, so windows always begin
    on a whole second. The result is reshaped to exactly
    ``crop_size * sample_rate`` samples.
    """
    n_samples = tf.cast((crop_size * sample_rate), tf.int32)
    first_second = tf.random.uniform(
        [], minval=0, maxval=(max_size - crop_size), dtype=tf.int32)
    first_sample = tf.cast(first_second * sample_rate, tf.int32)
    last_sample = tf.cast((first_second + crop_size) * sample_rate, tf.int32)
    # Slice the window, cap it at the expected length, then pin the shape.
    window = audio[first_sample:last_sample][:n_samples]
    return tf.reshape(window, [n_samples])

"""
1. Parse data based on the 'TFREC_FORMAT' map.
2. Decode PCM WAV file.
3. Break down the information from 'label_info' into other features.
4. Crop the 'audio' waveform if needed.
5. Returns the features as a dictionary.
"""
def read_tfrecord(example, labeled=True, inference=False):
    """Parse one serialized TFRecord example into a feature dictionary.

    Steps: parse with ``TFREC_FORMAT``, decode the WAV bytes, split
    ``label_info`` into species id / tmin / tmax / is_tp (first label entry
    only), then optionally crop the waveform.

    Args:
        example: Serialized example.
        labeled: Crop around the labelled event with ``crop_audio``.
        inference: Take a random window with ``random_crop_audio``. Ignored
            when ``labeled`` is true: the two crops are mutually exclusive,
            because random-cropping an already cropped clip would request
            samples past its end.

    Returns:
        Dict with keys ``audio_wav``, ``recording_id``, ``species_id`` and
        ``is_tp``.
    """
    TFREC_FORMAT = {
        'audio_wav': tf.io.FixedLenFeature([], tf.string),
        'recording_id': tf.io.FixedLenFeature([], tf.string),
        # Records without label info fall back to this placeholder entry.
        'label_info': tf.io.FixedLenFeature([], tf.string, default_value='-1,-1,0,0,0,0,1'),
    }
    example = tf.io.parse_single_example(example, TFREC_FORMAT)
    audio = decode_audio(example['audio_wav'])
    # Break down 'label_info' into the data columns (fields 0, 2, 4 and 6 of
    # the first label entry).
    label_info = get_label_info(example['label_info'])
    species_id = tf.strings.to_number(tf.gather_nd(label_info, [0, 0]), tf.int32)
    tmin = tf.strings.to_number(tf.gather_nd(label_info, [0, 2]))
    tmax = tf.strings.to_number(tf.gather_nd(label_info, [0, 4]))
    is_tp = tf.strings.to_number(tf.gather_nd(label_info, [0, 6]), tf.int32)

    if labeled:
        audio = crop_audio(audio, tmin, tmax)
    elif inference:
        audio = random_crop_audio(audio)

    features = {'audio_wav': audio,
                'recording_id': example['recording_id'],
                'species_id': species_id,
                'is_tp': is_tp
               }
    return features

#Load and parse the TFRecords
def load_dataset(filenames, labeled=True, ordered=False, inference=False):
    """Read TFRecord files and parse every record with ``read_tfrecord``.

    When ``ordered`` is false the files are listed with
    ``Dataset.list_files``, interleaved in parallel, and deterministic
    ordering is switched off; otherwise a single ``TFRecordDataset`` reads
    the given files.
    """
    options = tf.data.Options()
    if ordered:
        records = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)
    else:
        options.experimental_deterministic = False
        file_ds = tf.data.Dataset.list_files(filenames)
        records = file_ds.interleave(tf.data.TFRecordDataset, num_parallel_calls=AUTO)
    records = records.with_options(options)

    def _parse(serialized):
        return read_tfrecord(serialized, labeled=labeled, inference=inference)

    return records.map(_parse, num_parallel_calls=AUTO)

#Configure the output of the dataset
def conf_output(sample, labeled=True):
    """Arrange a parsed sample as an ``(inputs, target)`` pair.

    ``inputs`` is ``{'input_audio': sample['audio_wav']}`` and ``target`` is
    ``sample['species_id']``. ``labeled`` is accepted but not used.
    """
    inputs = {'input_audio': sample['audio_wav']}
    target = sample['species_id']
    return (inputs, target)

"""
1. Load TFRecord files, parse and generate features (waveform and meta-data).
2. Filter the dataset to contain only true positive samples.
3. Create 'spectrogram' from the 'waveform'.
4. Prepare image for the model.
5. Configure data to have the expected output format.
6. Apply Tensorflow data functions to optimize training.
Returns a Tensorflow dataset ready for training or inference.
"""
def get_dataset(filenames, labeled=True, ordered=False, repeated=False, inference=False):
    """Build the batched ``tf.data`` pipeline used for training or inference.

    Args:
        filenames: TFRecord files to read.
        labeled: Crop around the labelled event and keep only records that
            pass ``_filtterTP``.
        ordered: Read the files in order and skip shuffling. This flag is
            forwarded to ``load_dataset`` so the read itself is ordered too.
        repeated: Repeat the dataset indefinitely.
        inference: Forwarded to ``load_dataset``.

    Returns:
        A dataset of ``conf_output`` pairs, batched by ``BATCH_SIZE`` and
        prefetched.
    """
    dataset = load_dataset(filenames, labeled=labeled, ordered=ordered, inference=inference)
    if labeled:
        dataset = dataset.filter(_filtterTP)

    # Waveform -> spectrogram -> resized RGB image -> (inputs, target).
    dataset = dataset.map(get_spectrogram_tf, num_parallel_calls=AUTO)
    dataset = dataset.map(prepare_sample, num_parallel_calls=AUTO)
    dataset = dataset.map(lambda x: conf_output(x, labeled=labeled), num_parallel_calls=AUTO)

    if not ordered:
        dataset = dataset.shuffle(256)
    if repeated:
        dataset = dataset.repeat()

    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.prefetch(AUTO)
    return dataset

def _filtterTP(x):
    """Predicate for ``Dataset.filter``: true when ``x['is_tp']`` equals 1."""
    is_true_positive = x['is_tp'] == 1
    return is_true_positive

In [None]:
#Visualization Helper Functions
def plot_spectrogram(spectrogram, ax):
    """Draw a log-scaled spectrogram on ``ax`` with ``pcolormesh``.

    Args:
        spectrogram: 2-D array; its first axis ends up on the x-axis because
            the array is transposed before plotting.
        ax: Object providing ``pcolormesh`` (a matplotlib Axes).
    """
    # Take the log of the transposed magnitudes. A tiny epsilon is added first
    # so that zero-valued bins do not become -inf.
    log_spec = np.log(spectrogram.T + np.finfo(float).eps)
    height = log_spec.shape[0]
    X = np.arange(spectrogram.shape[0])
    Y = range(height)
    ax.pcolormesh(X, Y, log_spec)
    
def display_waveforms(ds, n_rows=3, n_cols=3, figsize=(20, 16)):
    """Plot the first ``n_rows * n_cols`` waveforms of ``ds`` on a grid.

    Each element of ``ds`` must provide ``audio_wav``, ``species_id`` and
    ``recording_id``; the title of every panel is "<recording_id> - <label>".
    """
    n = n_rows*n_cols
    # squeeze=False keeps `axes` two-dimensional even for a single row or
    # column, so 1xN / Nx1 grids work as well.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False)
    # axes.flat walks the grid row by row, matching the sample order.
    for ax, sample in zip(axes.flat, ds.take(n)):
        ax.plot(sample['audio_wav'].numpy())
        ax.set_yticks(np.arange(-1.2, 1.2, 0.2))
        label = sample['species_id'].numpy()
        recording_id = sample['recording_id'].numpy().decode()
        ax.set_title(f'{recording_id} - {label}')
    plt.show()
    
def display_spectrograms(ds, n_rows=3, n_cols=3, figsize=(20, 16)):
    """Plot the first ``n_rows * n_cols`` spectrograms of ``ds`` on a grid.

    Each element of ``ds`` must provide ``audio_wav`` (the spectrogram),
    ``species_id`` and ``recording_id``; singleton axes of the spectrogram are
    squeezed away before plotting.
    """
    n = n_rows*n_cols
    # squeeze=False keeps `axes` two-dimensional even for a single row or
    # column, so 1xN / Nx1 grids work as well.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False)
    # axes.flat walks the grid row by row, matching the sample order.
    for ax, sample in zip(axes.flat, ds.take(n)):
        plot_spectrogram(np.squeeze(sample['audio_wav'].numpy()), ax)
        label = sample['species_id'].numpy()
        recording_id = sample['recording_id'].numpy().decode()
        ax.set_title(f'{recording_id} - {label}')
    plt.show()
    
# NOTE(review): a second `inspect_preds` with the same signature is defined
# further down in this file and replaces this one once that definition runs,
# so this variant (which plots the feature without an RGB->grayscale step)
# is effectively dead code - confirm whether it can be removed.
def inspect_preds(features, labels, preds, n_rows=3, n_cols=2, figsize=(20, 16)):
    """Plot features on a grid, titling each with its prediction.

    Correct predictions get a black "<pred> [True]" title, wrong ones a red
    "<pred> [False, should be <label>]" title.
    """
    n = n_rows*n_cols
    fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize)
    for i, (feature, label, pred) in enumerate(zip(features, labels, preds)):
        # Row-major position of sample i on the grid.
        r = i // n_cols
        c = i % n_cols
        ax = axes[r][c]
        plot_spectrogram(np.squeeze(feature), ax)
        if pred == label:
            color = 'black'
            title = f'{pred} [True]'
        else:
            color = 'red'
            title = f'{pred} [False, should be {label}]'
        ax.set_title(title, fontsize=14, color=color)
    plt.show()
        
def display_waveforms_audio_spectrogram(ds, n_samples=1, sample_rate=48000):
    """Show details, an audio player and plots for the first samples of ``ds``.

    For each of the first ``n_samples`` elements this prints the recording
    id, label and tensor shapes, renders an audio player, and plots the
    waveform above its spectrogram.

    Args:
        ds: Dataset whose elements provide ``audio_wav``, ``species_id`` and
            ``recording_id``.
        n_samples: Number of elements to show.
        sample_rate: Playback rate passed to the audio widget.
    """
    # Imported here because this file only imports `Audio` from
    # IPython.display at module level; the previously referenced `Idisplay`
    # name was never defined.
    from IPython.display import display

    for sample in ds.take(n_samples):
        waveform = sample['audio_wav']
        label = sample['species_id'].numpy()
        recording_id = sample['recording_id'].numpy().decode()
        spectrogram = get_spectrogram(waveform)

        print(f'Name: {recording_id}')
        print(f'Label: {label}')
        print(f'Waveform shape: {waveform.shape}')
        print(f'Spectrogram shape: {spectrogram.shape}')
        print(f'Audio playback')
        display(Audio(waveform, rate=sample_rate))

        fig, axes = plt.subplots(2, figsize=(12, 8))
        timescale = np.arange(waveform.shape[0])
        axes[0].plot(timescale, waveform.numpy())
        axes[0].set_title('Waveform')
        axes[0].set_xlim([0, waveform.shape[0]])
        plot_spectrogram(spectrogram.numpy(), axes[1])
        axes[1].set_title('Spectrogram')
        plt.show()
        
def inspect_preds(features, labels, preds, n_rows=3, n_cols=2, figsize=(20, 16)):
    """Plot RGB spectrogram features on a grid, titled with their predictions.

    At most ``n_rows * n_cols`` samples are drawn; any further samples are
    ignored instead of indexing past the grid. Correct predictions get a
    black "<pred> [True]" title, wrong ones a red
    "<pred> [False, should be <label>]" title.

    Args:
        features: Iterable of RGB images (converted to grayscale for display).
        labels: Iterable of true labels, aligned with ``features``.
        preds: Iterable of predicted labels, aligned with ``features``.
        n_rows: Grid rows.
        n_cols: Grid columns.
        figsize: Figure size passed to ``plt.subplots``.
    """
    # squeeze=False keeps `axes` two-dimensional even for a single row or
    # column.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False)
    samples = zip(features, labels, preds)
    # Zipping with axes.flat (row-major order) stops after the last panel.
    for ax, (feature, label, pred) in zip(axes.flat, samples):
        feature = tf.image.rgb_to_grayscale(feature).numpy()
        plot_spectrogram(np.squeeze(feature), ax)
        if pred == label:
            color = 'black'
            title = f'{pred} [True]'
        else:
            color = 'red'
            title = f'{pred} [False, should be {label}]'
        ax.set_title(title, fontsize=14, color=color)
    plt.show()
    
#model evaluation
def plot_metrics(history):
    """Plot training/validation loss and accuracy curves.

    Args:
        history: Mapping with the keys ``loss``, ``val_loss``,
            ``sparse_categorical_accuracy`` and
            ``val_sparse_categorical_accuracy``, each holding one value per
            epoch.

    Dashed vertical lines mark the epoch with the lowest loss / highest
    accuracy (orange for the validation series).
    """
    fig, axes = plt.subplots(2, 1, sharex='col', figsize=(20, 8))
    axes = axes.flatten()

    axes[0].plot(history['loss'], label='Train loss')
    axes[0].plot(history['val_loss'], label='Validation loss')
    axes[0].legend(loc='best', fontsize=16)
    axes[0].set_title('Loss')
    axes[0].axvline(np.argmin(history['loss']), linestyle='dashed')
    axes[0].axvline(np.argmin(history['val_loss']), linestyle='dashed', color='orange')

    axes[1].plot(history['sparse_categorical_accuracy'], label='Train accuracy')
    axes[1].plot(history['val_sparse_categorical_accuracy'], label='Validation accuracy')
    axes[1].legend(loc='best', fontsize=16)
    axes[1].set_title('Accuracy')
    axes[1].axvline(np.argmax(history['sparse_categorical_accuracy']), linestyle='dashed')
    axes[1].axvline(np.argmax(history['val_sparse_categorical_accuracy']), linestyle='dashed', color='orange')

    plt.xlabel('Epochs', fontsize=16)
    # Hide the top and right spines with plain matplotlib; the previous
    # `sns.despine()` call relied on seaborn, which this file never imports.
    for ax in axes:
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
    plt.show()

In [None]:
# Parse the training TFRecords with load_dataset's defaults (labeled=True,
# unordered), which yields cropped waveforms plus their metadata.
train_waveform_ds = load_dataset(TRAINING_FILENAMES)

# Preview the first waveforms on display_waveforms' default 3x3 grid.
display_waveforms(train_waveform_ds)


### 2. Data Preprocessing - produce Mel Spectrogram

In [None]:
# NOTE(review): same value as `AUTO` from the TPU setup cell; no use of
# `AUTOTUNE` is visible in this file - confirm whether it is still needed.
AUTOTUNE = tf.data.experimental.AUTOTUNE
# NOTE(review): this rebinds the module-level BATCH_SIZE that was set to
# 16 * REPLICAS earlier and that get_dataset reads when batching; any dataset
# built after this cell uses 16 regardless of the replica count - confirm
# this is intended.
BATCH_SIZE = 16

In [None]:
import csv  # csv.reader is used below; csv is not among the notebook's top-level imports

# VAR
# STFT window / hop passed to librosa as n_fft / hop_length.
fft = 2048
hop = 512
sr = 48000 # all wave's sampling rate may be 48k
# Length of every extracted clip, in samples (10 seconds).
length = 10 * sr
# Starting bounds for the frequency search in get_min_max_frequency: fmin
# starts high and fmax low so the first annotation row replaces both.
fmin = 24000
fmax = 0
# Seed every random source used by this part of the notebook.
rng_seed = 1234
random.seed(rng_seed)
np.random.seed(rng_seed)
os.environ['PYTHONHASHSEED'] = str(rng_seed)
kfold = 5
num_birds = 24
lr = 0.04
epochs = 24
# Store data: raw CSV rows as lists of strings, header row included at index 0.
# newline='' is the file mode the csv module asks for.
with open('/kaggle/input/rfcx-species-audio-detection/train_tp.csv', newline='') as f:
    reader = csv.reader(f)
    TRAIN_DATA = list(reader)
TRAIN_DATA

In [None]:
def get_min_max_frequency(data, fmin, fmax):
    """Find the frequency band covered by the annotations, with a margin.

    Args:
        data: Rows of strings; row 0 is skipped as the header, column 4 is
            read as the lower and column 6 as the upper frequency.
        fmin: Starting lower bound; only lowered by the data.
        fmax: Starting upper bound; only raised by the data.

    Returns:
        ``(fmin, fmax)`` as ints after widening the band by 10 % on each
        side (``fmin * 0.9``, ``fmax * 1.1``, truncated). The pair is also
        printed.
    """
    rows = data[1:]
    lowest = min([fmin] + [float(row[4]) for row in rows])
    highest = max([fmax] + [float(row[6]) for row in rows])
    # Safety margin.
    fmin = int(lowest * 0.9)
    fmax = int(highest * 1.1)
    print(f'Minimum frequency: {fmin}, maximum frequency: {fmax}')
    return (fmin, fmax)

def position_sound_slice(t_min, t_max, wav, slice_length=None):
    """Place a fixed-length slice around the centre of an annotated event.

    Args:
        t_min: Event start, in samples.
        t_max: Event end, in samples.
        wav: Sequence of audio samples; only its length is used.
        slice_length: Slice length in samples. Defaults to the module-level
            ``length``.

    Returns:
        ``(beginning, ending, center)`` in samples. The slice is shifted to
        stay inside ``[0, len(wav)]``; when the recording is shorter than the
        slice, the whole recording ``(0, len(wav))`` is returned rather than
        a negative start index.
    """
    if slice_length is None:
        slice_length = length

    center = np.round((t_min + t_max) / 2)
    beginning = center - slice_length / 2
    if beginning < 0:
        beginning = 0

    ending = beginning + slice_length
    if ending > len(wav):
        ending = len(wav)
        # Clamp at zero: a negative start would be read as an offset from the
        # end of the array when slicing.
        beginning = max(ending - slice_length, 0)
    return (beginning, ending, center)

def generate_mel_spectogram(data, fmin, fmax):
    """Write one 8-bit mel-spectrogram bitmap per annotation row.

    Args:
        data: Rows of strings; row 0 is skipped as the header. Column 0 is
            the recording id, column 1 the species id, columns 3 and 5 the
            event start/end in seconds.
        fmin: Lowest mel filter frequency.
        fmax: Highest mel filter frequency.

    Each bitmap is saved to ``/kaggle/working/`` as
    ``<recording_id>_<species_id>_<center>.bmp``, where ``center`` is the
    event centre in samples. Progress is printed every 100 rows.
    """
    total = len(data)
    for i in range(1, total):
        row = data[i]
        recording_id, species_id = row[0], row[1]
        # sr=None keeps the file's native sampling rate.
        wav, sr = librosa.load(
            '/kaggle/input/rfcx-species-audio-detection/train/' + recording_id + '.flac',
            sr=None)
        t_min = float(row[3]) * sr
        t_max = float(row[5]) * sr
        beginning, ending, center = position_sound_slice(t_min, t_max, wav)
        clip = wav[int(beginning):int(ending)]

        # Mel spectrogram generation. The signal is passed by keyword, which
        # works whether or not the installed librosa accepts it positionally.
        mel_spec = librosa.feature.melspectrogram(
            y=clip, n_fft=fft, hop_length=hop, sr=sr, fmin=fmin, fmax=fmax, power=1.5)
        mel_spec = resize(mel_spec, (224, 400))

        # Normalize to [0, 1]; a constant (e.g. silent) clip is left at zero
        # instead of dividing by zero.
        mel_spec = mel_spec - np.min(mel_spec)
        peak = np.max(mel_spec)
        if peak > 0:
            mel_spec = mel_spec / peak

        # Quantize to 8-bit grayscale.
        mel_spec = np.round(mel_spec * 255).astype('uint8')

        bmp = Image.fromarray(mel_spec, 'L')
        # Saved as <recording_id>_<species_id>_<center> (center in samples).
        bmp.save('/kaggle/working/' + recording_id + '_' + species_id + '_' + str(center) + '.bmp')

        if i % 100 == 0:
            print('Processed ' + str(i) + ' train examples from ' + str(total))
            

In [None]:
# Replace the starting bounds with the band observed in the annotations
# (including the margin applied by get_min_max_frequency) ...
fmin, fmax = get_min_max_frequency(TRAIN_DATA, fmin, fmax)
# ... and write one mel-spectrogram bitmap per annotation row.
generate_mel_spectogram(TRAIN_DATA, fmin, fmax)

In [None]:
def get_file_label(directory='/kaggle/working/'):
    """List the spectrogram bitmaps in ``directory`` with their labels.

    Args:
        directory: Folder to scan. Defaults to the notebook's working folder.

    Returns:
        ``(file_list, label_list)``: the names of all files ending in
        ``.bmp``, sorted by name, and for each the text between the first
        and second underscore of the name (the species id, as a string).

    The listing is sorted because ``os.listdir`` returns names in arbitrary
    order, which would make the seeded fold split differ between runs.
    """
    file_list = [name for name in sorted(os.listdir(directory)) if name.endswith('.bmp')]
    label_list = [name.split('_')[1] for name in file_list]
    return (file_list, label_list)

def apply_stratified_kfold(file_list, label_list, kfold):
    """Split the files into train/validation sets using the first fold only.

    A shuffled ``StratifiedKFold`` with ``kfold`` splits (seeded with
    ``rng_seed``) is built, and only its first fold is used, so one fold's
    worth of the data is held out for validation. Both set sizes are printed.

    Returns:
        ``(train_files, val_files)``.
    """
    splitter = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=rng_seed)
    train_files, val_files = [], []

    # Picking only first fold to train/val on (loss of 20% training data)
    first_fold = next(iter(splitter.split(file_list, label_list)), None)
    if first_fold is not None:
        train_index, val_index = first_fold
        train_files = np.take(file_list, train_index)
        val_files = np.take(file_list, val_index)

    print(f'Training on {len(train_files)} examples')
    print(f'Validating on {len(val_files)} examples')
    return (train_files, val_files)
    

In [None]:
# Collect the generated bitmaps and their species labels, then split them
# into training and validation files using the first stratified fold.
file_list, label_list = get_file_label()
train_files, val_files = apply_stratified_kfold(file_list, label_list, kfold)
print(train_files, val_files)

### 3. Modeling - CNN (ResNeSt-50)

In [None]:
import torch.utils.data as torchdata

class AudioForest():
    """In-memory dataset of spectrogram bitmaps and one-hot species labels.

    Every file is loaded eagerly in ``__init__``. Items are
    ``(spec, label)`` pairs where ``spec`` is the bitmap scaled to [0, 1] and
    stacked three times along a new leading axis, and ``label`` is a float32
    one-hot vector of length ``num_birds``.
    """

    def __init__(self, filelist, root='/kaggle/working/'):
        """Load all bitmaps named in ``filelist`` from ``root``.

        Args:
            filelist: File names of the form ``<id>_<species>_<...>.bmp``;
                the species id is read from between the first two
                underscores.
            root: Folder containing the files.
        """
        self.specs = []
        self.labels = []
        for f in filelist:
            label = int(str.split(f, '_')[1])
            label_array = np.zeros(num_birds, dtype=np.single)
            label_array[label] = 1.
            self.labels.append(label_array)

            # The context manager closes the file even if decoding fails.
            with Image.open(os.path.join(root, f)) as img:
                # float32 halves the memory of the default float64 result;
                # the training loop converts batches to float32 anyway.
                mel_spec = np.array(img, dtype=np.single)

            mel_spec = mel_spec / 255
            # Replicate the single channel three times (channels first).
            mel_spec = np.stack((mel_spec, mel_spec, mel_spec))
            self.specs.append(mel_spec)

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.specs)

    def __getitem__(self, item):
        """Return the ``(spec, label)`` pair at index ``item``."""
        return self.specs[item], self.labels[item]

In [None]:
file_list = []
label_list = []

def get_files():
    """Append every ``.bmp`` file in /kaggle/working/ to the module lists.

    File names go to ``file_list`` and the text between the first two
    underscores of each name (the species id) to ``label_list``. The listing
    is sorted because ``os.listdir`` returns names in arbitrary order, which
    would make the seeded fold split differ between runs.
    """
    for f in sorted(os.listdir('/kaggle/working/')):
        # Match on the extension rather than on any occurrence of '.bmp'.
        if f.endswith('.bmp'):
            file_list.append(f)
            label = str.split(f, '_')[1]
            label_list.append(label)

get_files()

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=rng_seed)


In [None]:
train_files = []
val_files = []

# Walk all folds but keep only the first one as the train/validation split.
for fold_id, (train_index, val_index) in enumerate(skf.split(file_list, label_list)):
    if fold_id != 0:
        continue
    train_files = np.take(file_list, train_index)
    val_files = np.take(file_list, val_index)

print(f'Training on {len(train_files)} examples')
print(f'Validating on {len(val_files)} examples')



In [None]:
batch_size = 16

# Both splits are loaded fully into memory by AudioForest.
train_dataset = AudioForest(train_files)
val_dataset = AudioForest(val_files)

# Each loader draws its samples in random order.
train_sampler = torchdata.RandomSampler(train_dataset)
train_loader = torchdata.DataLoader(
    train_dataset, batch_size=batch_size, sampler=train_sampler)
val_sampler = torchdata.RandomSampler(val_dataset)
val_loader = torchdata.DataLoader(
    val_dataset, batch_size=batch_size, sampler=val_sampler)


def create_model(num_classes=24):
    """Build a pretrained ResNeSt-50 with a new three-layer classifier head.

    Args:
        num_classes: Size of the output layer.

    Returns:
        The model with ``model.fc`` replaced by
        Linear -> ReLU -> Dropout(0.2) -> Linear -> ReLU -> Dropout(0.2)
        -> Linear(num_classes).
    """
    # Imported here because `nn` is not bound anywhere at module level.
    import torch.nn as nn

    # NOTE(review): `resnest50` is not imported in the visible source; it
    # must be brought into scope (presumably from the `resnest` package)
    # before this function is called.
    model = resnest50(pretrained=True)

    # Read the backbone width from the existing head instead of hard-coding
    # it. NOTE(review): expected to be 2048 for this backbone - confirm.
    in_features = model.fc.in_features
    model.fc = nn.Sequential(
        nn.Linear(in_features, 1024),
        nn.ReLU(),
        nn.Dropout(p=0.2),
        nn.Linear(1024, 1024),
        nn.ReLU(),
        nn.Dropout(p=0.2),
        nn.Linear(1024, num_classes),
    )

    return model



In [None]:
import torch  # `torch` itself is referenced below but was never imported
from torch.nn import BCEWithLogitsLoss

model = create_model()
# Other loss candidates noted in the original comment: PANNsLoss(),
# MaskedBCEWithLogitsLoss().
criterion = BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# NOTE(review): the two step counts below are not used anywhere in the
# visible source.
num_train_steps = int(len(train_loader) * epochs)
num_warmup_steps = int(0.1 * epochs * len(train_loader))
# Multiply the learning rate by 0.4 every 7 scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.4)


if torch.cuda.is_available():
    model = model.cuda()
    # The loss object is named `criterion`; the previously referenced
    # `loss_function` was never defined.
    criterion = criterion.cuda()

In [None]:
# NOTE(review): this training loop is incomplete. Each batch is converted and
# moved to the GPU, but there is no forward pass, loss computation, backward
# pass, optimizer step or scheduler step, and `train_loss` / `train_corr` are
# never filled. It also runs 32 epochs although `epochs` is set to 24
# elsewhere in this file - confirm the intended count.
for e in range(0, 32):
    # Stats
    train_loss = []
    train_corr = []
    
    # Switch the model to training mode.
    model.train()
    for batch, (data, target) in enumerate(train_loader):
        data = data.float()
        if torch.cuda.is_available():
            data, target = data.cuda(), target.cuda()
    


### 5. Data Augmentation for waveforms

In [None]:
#Base class for audio data transformation
class AudioTransform:
    """Base class for waveform augmentations that fire with probability ``p``.

    Subclasses implement ``apply``. Calling an instance applies the
    transform always (``always_apply=True``) or when a fresh
    ``np.random.rand()`` draw falls below ``p``; otherwise the input is
    returned untouched.
    """

    def __init__(self, always_apply=False, p=0.5):
        self.p = p
        self.always_apply = always_apply

    def __call__(self, y: np.ndarray):
        # Short-circuit keeps the random draw from happening when the
        # transform is forced on.
        fire = self.always_apply or np.random.rand() < self.p
        return self.apply(y) if fire else y

    def apply(self, y: np.ndarray):
        """Transform ``y``; must be provided by subclasses."""
        raise NotImplementedError

In [None]:
#Gaussian Noise

class AddGaussianNoise(AudioTransform):
    """Add standard-normal noise scaled by a random amplitude.

    The amplitude is drawn uniformly between 0 and ``max_noise_amplitude``
    on every application; the result is cast back to the dtype of the
    input. Extra keyword arguments are accepted and ignored.
    """

    def __init__(self, always_apply=False, p=0.5, max_noise_amplitude=0.5, **kwargs):
        super().__init__(always_apply, p)

        # (low, high) bounds of the uniform amplitude draw.
        self.noise_amplitude = (0.0, max_noise_amplitude)

    def apply(self, y: np.ndarray, **params):
        low, high = self.noise_amplitude
        amplitude = np.random.uniform(low, high)
        scaled_noise = np.random.randn(len(y)) * amplitude
        return (y + scaled_noise).astype(y.dtype)

In [None]:
class GaussianNoiseSNR(AudioTransform):
    """Add white noise at a random signal-to-noise ratio (in dB).

    The ratio is drawn uniformly from ``[min_snr, max_snr]``. Both signal and
    noise levels are measured by their peak absolute amplitude, and the noise
    is rescaled so that its peak sits ``snr`` dB below the signal peak. The
    result is cast back to the dtype of the input. Extra keyword arguments
    are accepted and ignored.
    """

    def __init__(self, always_apply=False, p=0.5, min_snr=5.0, max_snr=20.0, **kwargs):
        super().__init__(always_apply, p)

        self.max_snr = max_snr
        self.min_snr = min_snr

    def apply(self, y: np.ndarray, **params):
        snr_db = np.random.uniform(self.min_snr, self.max_snr)
        signal_peak = np.sqrt(y ** 2).max()
        target_noise_peak = signal_peak / (10 ** (snr_db / 20))

        noise = np.random.randn(len(y))
        noise_peak = np.sqrt(noise ** 2).max()
        # Normalise the noise to unit peak, then scale to the target peak.
        scaled_noise = noise * 1 / noise_peak * target_noise_peak
        return (y + scaled_noise).astype(y.dtype)

In [None]:
# Demo: always add Gaussian noise with an amplitude of at most 0.05 and play
# the result.
# NOTE(review): `y` (the waveform) is not defined anywhere in the visible
# source - it must be loaded before this cell runs.
transform = AddGaussianNoise(always_apply=True, max_noise_amplitude=0.05)
y_gaussian_added = transform(y)
Audio(y_gaussian_added, rate=sr)

In [None]:
# Demo: always add white noise at an SNR between 5 and 20 dB, then plot the
# noisy waveform.
# NOTE(review): `y` is not defined anywhere in the visible source.
transform = GaussianNoiseSNR(always_apply=True, min_snr=5, max_snr=20)
y_gaussian_snr = transform(y)
# NOTE(review): this Audio object is neither displayed nor assigned; in a
# notebook only a cell's last expression is rendered - confirm intent.
Audio(y_gaussian_snr, rate=sr)
# NOTE(review): `waveplot` was replaced by `waveshow` in newer librosa
# releases - confirm against the installed version.
librosa.display.waveplot(y_gaussian_snr, sr=sr);

In [None]:
# Mel spectrogram (128 bands) of `y` in decibels. The signal is passed by
# keyword, which works whether or not the installed librosa accepts it
# positionally.
# NOTE(review): `y` is not defined anywhere in the visible source.
melspec = librosa.power_to_db(librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128))
librosa.display.specshow(melspec, sr=sr, x_axis="time", y_axis="mel")
plt.colorbar();

### 6. Metric

In [None]:
# Toy example of label ranking average precision (LRAP) on two samples with
# three labels each.
from sklearn.metrics import label_ranking_average_precision_score
# Binary indicator matrix of the true labels (one row per sample).
y_true = np.array([[1, 0, 0], [0, 0, 1]])
# Predicted scores for the same samples/labels.
y_score = np.array([[0.75, 0.5, 1], [1, 0.2, 0.1]])


# The true label ranks 2nd in the first sample and 3rd in the second, so
# this should evaluate to (1/2 + 1/3) / 2, about 0.417.
label_ranking_average_precision_score(y_true, y_score)



In [None]:
# Load the sample submission table.
# NOTE(review): this uses a path relative to the working directory, while the
# other cells read from the absolute '/kaggle/input/...' location - confirm
# the notebook is run from a folder where '../input' resolves.
ss = pd.read_csv("../input/rfcx-species-audio-detection/sample_submission.csv")

# Display the table as the cell output.
ss