In [0]:
import os
import numpy as np
import pandas as pd
import librosa
from tensorflow.keras import layers, Input, Model
import fnmatch

In [0]:
# Directory holding the training recordings (.mp3); relative to the notebook's working directory.
path = "dataset"

In [0]:
class Preprocessing:
    """Turn a directory of ``.mp3`` recordings into model inputs and targets.

    File names are split on ``-``: the first field (lower-cased) is used as the
    species label and the second field as the bird label.

    Attributes:
        path: directory that is scanned for ``*.mp3`` files.
        mp3_name: sorted list of the matching file names.
        target: DataFrame with one ``species`` and one ``bird`` column.
        target_species / target_bird: one-hot encodings of those columns.
        audio: float32 array of shape ``(n_files, SAMPLE_RATE * DURATION)``.
    """

    # Passed explicitly to librosa so the clip length below does not silently
    # depend on a library default (22050 Hz is librosa's default rate).
    SAMPLE_RATE = 22050
    OFFSET = 10.0    # seconds skipped at the start of every recording
    DURATION = 20.0  # seconds of audio kept from every recording

    def __init__(self, data_path):
        """Scan ``data_path`` for ``.mp3`` files and allocate the audio buffer."""
        self.path = data_path
        # os.listdir order is arbitrary; sorting keeps row order reproducible.
        self.mp3_name = sorted(fnmatch.filter(os.listdir(self.path), '*.mp3'))
        # A flat list gives two ordinary columns (a nested list would create
        # MultiIndex columns).
        self.target = pd.DataFrame(columns=['species', 'bird'])
        self.target_species = pd.DataFrame()
        self.target_bird = pd.DataFrame()

        n_samples = int(self.SAMPLE_RATE * self.DURATION)
        self.audio = np.zeros((len(self.mp3_name), n_samples), dtype=np.float32)

    @staticmethod
    def _fit_length(samples, length):
        """Return ``samples`` truncated or right-padded with zeros to ``length``."""
        fitted = np.zeros(length, dtype=np.float32)
        usable = min(len(samples), length)
        fitted[:usable] = samples[:usable]
        return fitted

    def create_input_data(self):
        """Decode every ``.mp3`` into one float32 row of ``self.audio``.

        Recordings that yield fewer samples than the buffer width (for example
        because the file ends before ``OFFSET + DURATION``) are zero-padded
        instead of raising a broadcast error.

        Returns:
            ``self.audio``, filled in place.
        """
        clip_length = self.audio.shape[1]

        for i, name in enumerate(self.mp3_name):
            audio_file = os.path.join(self.path, name)
            #librosa requires ffmpeg to load .mp3 files
            audio_data, _ = librosa.load(
                audio_file,
                sr=self.SAMPLE_RATE,
                offset=self.OFFSET,
                duration=self.DURATION,
            )
            self.audio[i] = self._fit_length(audio_data, clip_length)

        return self.audio

    def create_target_data(self):
        """Build one-hot species and bird targets from the file names.

        Returns:
            ``(target_species, target_bird)`` - two one-hot DataFrames whose
            rows follow the order of ``self.mp3_name``.

        Raises:
            IndexError: if a file name contains no ``-`` separator.
        """
        species = []
        bird = []
        for name in self.mp3_name:
            fields = name.split('-')
            species.append(fields[0].lower())
            bird.append(fields[1])

        self.target = pd.DataFrame(
            {'species': species, 'bird': bird},
            columns=['species', 'bird'],
        )

        # Double brackets keep a DataFrame so get_dummies prefixes the new
        # columns with the source column name.
        self.target_species = pd.get_dummies(self.target[['species']])
        self.target_bird = pd.get_dummies(self.target[['bird']])

        return (self.target_species, self.target_bird)

In [0]:
class BirdSongRecognizer(Preprocessing):
    """Two-headed Keras classifier predicting species and bird from raw audio.

    The constructor loads the audio and the one-hot targets from ``data_path``.
    ``train`` builds and fits the network and stores it on ``self.model`` so
    that ``predict`` and ``test`` can reuse it.
    """

    def __init__(self, data_path):
        """Load inputs and targets for the recordings found in ``data_path``."""
        # Run the parent constructor so the inherited loaders are usable on
        # this instance (it sets path, mp3_name and the audio buffer).
        super().__init__(data_path)
        # Kept for backward compatibility with code that reaches the loader
        # through ``.pre``.
        self.pre = self
        self.input_data = self.create_input_data()
        self.output_species, self.output_bird = self.create_target_data()
        self.input_shape = self.input_data.shape[1]
        self.model = None    # set by train()
        self.history = None  # Keras History of the last train() call

    @staticmethod
    def _with_channel_axis(audio):
        """Return ``audio`` as float32 with a trailing axis of size 1 appended."""
        return np.asarray(audio, dtype=np.float32)[..., np.newaxis]

    def _require_model(self):
        """Return the trained model, or raise if ``train`` has not been called."""
        if self.model is None:
            raise RuntimeError('call train() before predict() or test()')
        return self.model

    def train(self, optimizer='rmsprop', loss='categorical_crossentropy', epochs=3):
        """Build, compile and fit the model on the data loaded in ``__init__``.

        Args:
            optimizer: Keras optimizer name or instance.
            loss: loss applied to both output heads.
            epochs: number of passes over the training data.

        The fitted model is stored on ``self.model`` and the fit history on
        ``self.history``.
        """
        # Input length and head sizes follow the loaded data instead of being
        # hard-coded, so other clip lengths and class counts work too.
        input_tensor = Input(shape=(self.input_shape, 1))

        l0 = layers.BatchNormalization()(input_tensor)
        l1 = layers.Bidirectional(layers.GRU(128, return_sequences=False))(l0)
        # With return_sequences=False the GRU output is already 2-D, so this
        # Flatten leaves the shape unchanged; kept to preserve the architecture.
        l2 = layers.Flatten()(l1)

        out_species = layers.Dense(self.output_species.shape[1], activation='softmax')(l2)
        out_bird = layers.Dense(self.output_bird.shape[1], activation='softmax')(l2)

        model = Model(inputs=input_tensor, outputs=[out_species, out_bird])
        model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])
        model.summary()

        targets = [
            np.asarray(self.output_species, dtype=np.float32),
            np.asarray(self.output_bird, dtype=np.float32),
        ]
        self.history = model.fit(self._with_channel_axis(self.input_data), targets, epochs=epochs)
        self.model = model

    def predict(self):
        """Return the model's predictions for the training inputs.

        Raises:
            RuntimeError: if ``train`` has not been called yet.
        """
        return self._require_model().predict(self._with_channel_axis(self.input_data))

    def test(self, data_path_test):
        """Evaluate the trained model on the recordings in ``data_path_test``.

        Test targets are re-indexed to the training one-hot columns so their
        width matches the model heads; labels that did not occur in training
        have no column and end up as all-zero rows.

        Raises:
            RuntimeError: if ``train`` has not been called yet.
        """
        model = self._require_model()

        pre = Preprocessing(data_path_test)
        input_data_test = pre.create_input_data()
        species_test, bird_test = pre.create_target_data()

        species_test = species_test.reindex(columns=self.output_species.columns, fill_value=0)
        bird_test = bird_test.reindex(columns=self.output_bird.columns, fill_value=0)
        output_data_test = [
            np.asarray(species_test, dtype=np.float32),
            np.asarray(bird_test, dtype=np.float32),
        ]

        return model.evaluate(self._with_channel_axis(input_data_test), output_data_test)
        

In [9]:
# Keep a handle on the recognizer so later cells can reuse the loaded data
# instead of decoding every recording again.
recognizer = BirdSongRecognizer(path)
recognizer.train()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 441000, 1)]  0                                            
__________________________________________________________________________________________________
batch_normalization (BatchNorma (None, 441000, 1)    4           input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 256)          100608      batch_normalization[0][0]        
__________________________________________________________________________________________________
flatten (Flatten)               (None, 256)          0           bidirectional[0][0]              
______________________________________________________________________________________________