In [1]:
import numpy as np
import pandas as pd
import wave
from scipy.io import wavfile
import os
import librosa
from librosa.feature import melspectrogram
import warnings
from sklearn.utils import shuffle
from sklearn.utils import class_weight
from PIL import Image
from uuid import uuid4
import sklearn
from tqdm import tqdm

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras import Input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Flatten, Dropout, Activation
from tensorflow.keras.layers import BatchNormalization, GlobalAveragePooling2D
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense, Flatten, Dropout, Activation, LSTM, SimpleRNN, Conv1D, Input, BatchNormalization, GlobalAveragePooling2D
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import EfficientNetB0
#from keras_efficientnets import EfficientNetB0


import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [None]:
print(tf.__version__)

# Data preparation

In this section, we first create two dictionaries to allow to translate each bird into an ID code and vice versa. As we won't be able to use the entirety of the data in this notebook for processing time reasons, we will shuffle the `DataFrame` with the training data before preparing the data. We will then create a new `DataFrame` where we will store all the samples, of 5 seconds each with a sampling rate of 10 data points per second.

In [2]:
train_df = pd.read_csv('nachtall_train.csv')

It looks like each ebird is associated with a "reasonable" amount of samples. We only keep the bird species with 100 samples of the highest quality (4 or 5) to make the problem easier to start with.

In [3]:
#birds_to_recognise = sorted(shuffle(most_represented_birds)[:20])
birds_to_recognise = ['House Sparrow', 'Common Blackbird', 'Great Tit', 'Common Starling', 'Eurasian Blue Tit', 'Eurasian Tree Sparrow', 'Eurasian Magpie', 'Common Wood Pigeon', 'European Robin', 'Common House Martin', 'Common Swift', 'Carrion Crow', 'Common Chaffinch', 'Eurasian Collared Dove', 'European Goldfinch', 'Great Spotted Woodpecker', 'Barn Swallow', 'Eurasian Jay', 'Rock Dove', 'Eurasian Bullfinch']
print(birds_to_recognise)

['House Sparrow', 'Common Blackbird', 'Great Tit', 'Common Starling', 'Eurasian Blue Tit', 'Eurasian Tree Sparrow', 'Eurasian Magpie', 'Common Wood Pigeon', 'European Robin', 'Common House Martin', 'Common Swift', 'Carrion Crow', 'Common Chaffinch', 'Eurasian Collared Dove', 'European Goldfinch', 'Great Spotted Woodpecker', 'Barn Swallow', 'Eurasian Jay', 'Rock Dove', 'Eurasian Bullfinch']


In [4]:
def get_sample(filename, bird, output_folder):
    wave_data, wave_rate = librosa.load(filename)
    wave_data, _ = librosa.effects.trim(wave_data)
    #only take 5s samples and add them to the dataframe
    song_sample = []
    sample_length = 2*wave_rate
    samples_from_file = []
    #The variable below is chosen mainly to create a 216x216 image
    N_mels=216
    for idx in range(0,len(wave_data),sample_length): 
        song_sample = wave_data[idx:idx+sample_length]
        if len(song_sample)>=sample_length:
            mel = melspectrogram(song_sample, n_mels=N_mels)
            db = librosa.power_to_db(mel)
            normalised_db = sklearn.preprocessing.minmax_scale(db)
            filename = str(uuid4())+".tif"
            db_array = (np.asarray(normalised_db)*255).astype(np.uint8)
            db_image =  Image.fromarray(np.array([db_array, db_array, db_array]).T)
            db_image.save("{}{}".format(output_folder,filename))
            
            samples_from_file.append({"song_sample":"{}{}".format(output_folder,filename),
                                            "bird":bird})
    return samples_from_file

The following cell will set all the samples with non-selected birds to the "nocall" ID code. This allows to focus on the classification of the 5 selected bird species while all of bird species will be categorised as "nocall".

In [5]:
%%time
warnings.filterwarnings("ignore")
samples_df = pd.DataFrame(columns=["song_sample","bird"])

#We limit the number of audio files being sampled to 1000 in this notebook to save time
#on top of having limited the number of bird species previously
#SETTING LIMIT FROM 1000 to 100 for test
sample_limit = 1000
sample_list = []

output_folder = "/home/dvm/Schreibtisch/nachtall_Code/Data_training/melspectrogram_dataset"
#os.mkdir(output_folder)
with tqdm(total=sample_limit) as pbar:
    for idx, row in train_df[:sample_limit].iterrows():
        pbar.update(1)
        try:
            audio_file_path = "/home/dvm/Schreibtisch/nachtall_Code/Data_training/songs"
            #audio_file_path += row.ebird_code
            
            if row.ebird_code in birds_to_recognise:
                sample_list += get_sample('{}/{}'.format(audio_file_path, row.filename), row.ebird_code, output_folder)
            else:
                sample_list += get_sample('{}/{}'.format(audio_file_path, row.filename), "nocall", output_folder)
        except:
            raise
            print("{} is corrupted".format(audio_file_path))
            
samples_df = pd.DataFrame(sample_list)

8%|▊         | 75/1000 [02:28<30:35,  1.98s/it]CPU times: user 3min 50s, sys: 5min 36s, total: 9min 27s
Wall time: 2min 28s



In [6]:
samples_df = pd.DataFrame(sample_list)

In [None]:
samples_df.to_hdf('data.h5', key='df', mode='w')

#store = pd.HDFStore('data.h5')
#store['samples_df'] = samples_df  # save it




In [None]:
samples_df = pd.read_hdf('data.h5')


In [None]:
store['samples_df']  # load it


In [None]:
demo_img = Image.open(samples_df.iloc[0].song_sample)
plt.imshow(demo_img)
plt.show()

In [7]:
samples_df = shuffle(samples_df)
samples_df[:10]

Unnamed: 0,song_sample,bird
986,melspectrogram_dataset0edad534-2ecf-48fc-b937-...,European Robin
1386,melspectrogram_datasetdcc1bd49-d201-4268-9138-...,Common Chaffinch
2332,melspectrogram_dataset6d04a807-f598-4e2e-8f8c-...,nocall
1885,melspectrogram_dataset77975aee-24ff-464d-b132-...,Eurasian Jay
211,melspectrogram_dataset9d7d0eb2-3d0a-4ee8-8bb5-...,Common Blackbird
79,melspectrogram_datasetfd01c292-3383-4c2e-84ee-...,Common Blackbird
1700,melspectrogram_datasetade59988-6b63-47ba-b903-...,Barn Swallow
100,melspectrogram_dataset260dba3f-7918-4772-aa91-...,Common Blackbird
2505,melspectrogram_dataset4d29daa5-c9f4-455d-a37e-...,nocall
2094,melspectrogram_dataset90391441-3e89-461b-bc45-...,nocall


# Model creation

In [8]:
training_percentage = 0.9
training_item_count = int(len(samples_df)*training_percentage)
validation_item_count = len(samples_df)-int(len(samples_df)*training_percentage)
training_df = samples_df[:training_item_count]
validation_df = samples_df[training_item_count:]

In [None]:
##ON!

As shown in this [post](https://www.kaggle.com/c/birdsong-recognition/discussion/158943) by [Nanashi](https://www.kaggle.com/jesucristo), CNN-based models seem to outperform LSTM-based models for this type of tasks. Therefore, we will use the freshly added EfficientNet models from Tensorflow 2.3.0.

Also, I have realised that we can have several birds singing at the same time in our samples, which means that we will have to change the output layer and loss to have several possible outputs and not just one.

In [9]:
classes_to_predict = sorted(samples_df.bird.unique())
#classes_to_predict = birds_to_recognise
input_shape = (216,216, 3)
effnet_layers = EfficientNetB0(weights=None, include_top=False, input_shape=input_shape)

for layer in effnet_layers.layers:
    layer.trainable = True

dropout_dense_layer = 0.3

model = Sequential()
model.add(effnet_layers)
    
model.add(GlobalAveragePooling2D())
model.add(Dense(256, use_bias=False))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(dropout_dense_layer))

model.add(Dense(len(classes_to_predict), activation="softmax"))
    
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
efficientnetb0 (Functional)  (None, 7, 7, 1280)        4049571   
_________________________________________________________________
global_average_pooling2d (Gl (None, 1280)              0         
_________________________________________________________________
dense (Dense)                (None, 256)               327680    
_________________________________________________________________
batch_normalization (BatchNo (None, 256)               1024      
_________________________________________________________________
activation (Activation)      (None, 256)               0         
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 21)                5

In [15]:
callbacks = [ReduceLROnPlateau(monitor='val_loss', patience=2, verbose=1, factor=0.7),
             EarlyStopping(monitor='val_loss', patience=5),
             ModelCheckpoint(filepath='model/best_model.h5', monitor='val_loss', save_best_only=True)]
model.compile(loss="categorical_crossentropy", optimizer='adam')

In [None]:
##OFF

In [16]:
class_weights = class_weight.compute_class_weight("balanced", classes_to_predict, samples_df.bird.values)
class_weights_dict = {i : class_weights[i] for i,label in enumerate(classes_to_predict)}

In [17]:
#orig. 32
#training_batch_size = 32
#validation_batch_size = 32
training_batch_size = 32
validation_batch_size = 32
target_size = (216,216)

train_datagen = ImageDataGenerator(
    rescale=1. / 255
)

train_generator = train_datagen.flow_from_dataframe(
    dataframe = training_df,
    x_col='song_sample',
    y_col='bird',
    directory='/',
    target_size=target_size,
    batch_size=training_batch_size,
    shuffle=True,
    class_mode='categorical')


validation_datagen = ImageDataGenerator(rescale=1. / 255)
validation_generator = validation_datagen.flow_from_dataframe(
    dataframe = validation_df,
    x_col='song_sample',
    y_col='bird',
    directory='/',
    target_size=target_size,
    shuffle=False,
    batch_size=validation_batch_size,
    class_mode='categorical')

Found 0 validated image filenames belonging to 0 classes.
Found 0 validated image filenames belonging to 0 classes.


# Model training

For now, I will comment out the class weights as I don't mind training a model that will be biased to "nocall".

In [13]:
history = model.fit(train_generator,
          epochs = 200, 
          validation_data=validation_generator,
           class_weight=class_weights_dict,
          callbacks=callbacks)

ValueError: Asked to retrieve element 0, but the Sequence has length 0

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Loss over epochs')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='best')
plt.show()

Running predictions on a single batch from our validation set just to check if our model displays any anomalies.

In [None]:
preds = model.predict_generator(validation_generator)
validation_df = pd.DataFrame(columns=["prediction", "groundtruth", "correct_prediction"])

for pred, groundtruth in zip(preds[:16], validation_generator.__getitem__(0)[1]):
    validation_df = validation_df.append({"prediction":classes_to_predict[np.argmax(pred)], 
                                       "groundtruth":classes_to_predict[np.argmax(groundtruth)], 
                                       "correct_prediction":np.argmax(pred)==np.argmax(groundtruth)}, ignore_index=True)
validation_df

In [None]:
!rm -rf /kaggle/working/melspectrogram_dataset

# Predictions

We load the weights for the best-performing model on our validation set.

In [None]:
##ON!

In [None]:
model.load_weights("model/best_model.h5")

As for the training samples, we will only load each audio file, generate a melspectrogram for each 5-second sequence and predict on it. This prediction function ensures that we do not reload the .mp3 audio file for every sample as it would significantly increase the processing time. Then, it adds all the predictions to the `test_df` DataFrame before generating the submission file.

In [None]:
def predict_on_melspectrogram(song_sample, sample_length):
    N_mels=216

    if len(song_sample)>=sample_length:
        mel = melspectrogram(song_sample, n_mels=N_mels)
        db = librosa.power_to_db(mel)
        normalised_db = sklearn.preprocessing.minmax_scale(db)
        db_array = (np.asarray(normalised_db)*255).astype(np.uint8)

        prediction = model.predict(np.array([np.array([db_array, db_array, db_array]).T]))
        predicted_bird = classes_to_predict[np.argmax(prediction)]
        return predicted_bird
    else:
        return "nocall"

In [None]:

wave_data, wave_rate = librosa.load('songs/xc116226.flac')
sample_length = 2*wave_rate

song_sample = np.array(wave_data[0:sample_length])

predicted_bird = predict_on_melspectrogram(song_sample, sample_length)
print(predicted_bird)

In [None]:
def predict_submission(df, audio_file_path):
        
    loaded_audio_sample = []
    previous_filename = ""
    wave_data = []
    wave_rate = None
    sample_length = None
    
    for idx,row in df.iterrows():
        #I added this exception as I've heard that some files may be corrupted.
        try:
            if previous_filename == "" or previous_filename!=row.audio_id:
                filename = '{}/{}.mp3'.format(audio_file_path, row.audio_id)
                wave_data, wave_rate = librosa.load(filename)
                sample_length = 5*wave_rate
            previous_filename = row.audio_id

            #basically allows to check if we are running the examples or the test set.
            if "site" in df.columns:
                if row.site=="site_1" or row.site=="site_2":
                    song_sample = np.array(wave_data[int(row.seconds-5)*wave_rate:int(row.seconds)*wave_rate])
                elif row.site=="site_3":
                    #for now, I only take the first 5s of the samples from site_3 as they are groundtruthed at file level
                    song_sample = np.array(wave_data[0:sample_length])
            else:
                #same as the first condition but I isolated it for later and it is for the example file
                song_sample = np.array(wave_data[int(row.seconds-5)*wave_rate:int(row.seconds)*wave_rate])
            
            predicted_bird = predict_on_melspectrogram(song_sample, sample_length)
            df.at[idx,"birds"] = predicted_bird
        except:
            df.at[idx,"birds"] = "nocall"
    return df

Below, We can test our prediction function using the examples provided.

In [None]:
audio_file_path = "/example_test_audio"
example_df = pd.read_csv("example_test_audio_summary.csv")
#Ajusting the example filenames and creating the audio_id column to match with the test file.
example_df["audio_id"] = [ "BLKFR-10-CPL_20190611_093000.pt540" if filename=="BLKFR-10-CPL" else "ORANGE-7-CAP_20190606_093000.pt623" for filename in example_df["filename"]]

if os.path.exists(audio_file_path):
    example_df = predict_submission(example_df, audio_file_path)
example_df

In [None]:
test_file_path = "/test_audio"
test_df = pd.read_csv("test.csv")
submission_df = pd.read_csv("sample_submission.csv")

if os.path.exists(test_file_path):
    submission_df = predict_submission(test_df, test_file_path)

submission_df[["row_id","birds"]].to_csv('submission.csv', index=False)
submission_df.head()

### Thanks for reading this notebook! If you found this notebook helpful, please give it an upvote. It is always greatly appreciated!