# Preprocess Audio Files
This notebook contains the code for preprocessing the audio files before training.

Ten-second audio clips in WAV format with a sample rate of 48 kHz were chosen because they match the audio used to train the L3 model, which I plan to use for feature embeddings.

The sample dataset only includes the training data, so we will split that into training and testing data.  If the model proves viable, we will download more data from xeno-canto.org.

In [19]:
import sox # must install sox locally if you want mp3 support
import os
from glob import glob
import shutil
import multiprocessing
from joblib import Parallel, delayed
from tqdm import tqdm
import numpy as np
import pandas as pd
import soundfile as sf

In [7]:
# Input locations: the original dataset and the extra xeno-canto downloads.
ORIGINAL_DATA_DIR = 'data/train_audio'
EXTRA_DATA_DIR = 'data/xeno-canto/'
# Output directory (created below if it is missing).
OUTPUT_DIR = 'data/train_10sec'
# NOTE(review): the notebook prose describes 48 kHz audio, but this constant
# is 22050 - confirm which sample rate is intended.
SAMPLE_RATE = 22050

# create output dir if it does not exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [8]:
# Collect the path of every mp3 file.
# The original dataset is nested one directory deep (e.g. a folder per
# species code), while the additional xeno-canto.org downloads sit in a
# single flat directory.
original_pattern = os.path.join(ORIGINAL_DATA_DIR, '*/*.mp3')
extra_pattern = os.path.join(EXTRA_DATA_DIR, '*.mp3')

audio_files = glob(original_pattern) + glob(extra_pattern)

# preview the first and last five paths
audio_files[:5], audio_files[-5:]

(['data/train_audio/olsfly/XC386256.mp3',
  'data/train_audio/olsfly/XC484154.mp3',
  'data/train_audio/olsfly/XC239498.mp3',
  'data/train_audio/olsfly/XC368006.mp3',
  'data/train_audio/olsfly/XC156193.mp3'],
 ['data/xeno-canto/XC77538.mp3',
  'data/xeno-canto/XC441588.mp3',
  'data/xeno-canto/XC368433.mp3',
  'data/xeno-canto/XC146762.mp3',
  'data/xeno-canto/XC627879.mp3'])

In [10]:
# Load the training metadata and collect the distinct bird class codes,
# sorted alphabetically.
df = pd.read_csv('data/train.csv')
unique_codes = df['ebird_code'].unique()
classes = sorted(unique_codes)
classes

['aldfly',
 'ameavo',
 'amebit',
 'amecro',
 'amegfi',
 'amekes',
 'amepip',
 'amered',
 'amerob',
 'amewig',
 'amewoo',
 'amtspa',
 'annhum',
 'astfly',
 'baisan',
 'baleag',
 'balori',
 'banswa',
 'barswa',
 'bawwar',
 'belkin1',
 'belspa2',
 'bewwre',
 'bkbcuc',
 'bkbmag1',
 'bkbwar',
 'bkcchi',
 'bkchum',
 'bkhgro',
 'bkpwar',
 'bktspa',
 'blkpho',
 'blugrb1',
 'blujay',
 'bnhcow',
 'boboli',
 'bongul',
 'brdowl',
 'brebla',
 'brespa',
 'brncre',
 'brnthr',
 'brthum',
 'brwhaw',
 'btbwar',
 'btnwar',
 'btywar',
 'buffle',
 'buggna',
 'buhvir',
 'bulori',
 'bushti',
 'buwtea',
 'buwwar',
 'cacwre',
 'calgul',
 'calqua',
 'camwar',
 'cangoo',
 'canwar',
 'canwre',
 'carwre',
 'casfin',
 'caster1',
 'casvir',
 'cedwax',
 'chispa',
 'chiswi',
 'chswar',
 'chukar',
 'clanut',
 'cliswa',
 'comgol',
 'comgra',
 'comloo',
 'commer',
 'comnig',
 'comrav',
 'comred',
 'comter',
 'comyel',
 'coohaw',
 'coshum',
 'cowscj1',
 'daejun',
 'doccor',
 'dowwoo',
 'dusfly',
 'eargre',
 'easblu',
 'ea

In [11]:
# number of distinct ebird_code values found in train.csv
len(classes)

264

# Convert files to 10 second WAV files at 48khz

The audio files have to be processed in two steps because adding padding to an mp3 and then saving it as wav results in imprecise final durations.  We need files to be exactly 10 seconds long with a sample rate of 48 kHz, for a total of 48000 * 10 samples.

The first pass resamples the audio to 48k, cuts the clips to 11 seconds, and saves as wav files.

The second pass pads or crops the files to exactly 10 seconds long.

In [12]:
# get the number of cpu cores available; this value is passed as n_jobs to
# the joblib Parallel calls in this notebook
num_cores = multiprocessing.cpu_count()
print(num_cores)

4


In [15]:
def convert_to_wav(af, output_dir=None, sample_rate=None, max_duration=11.0):
    """Convert one audio file to a mono, 16-bit WAV clip.

    The clip is trimmed to the first ``max_duration`` seconds, resampled to
    ``sample_rate`` and written to ``<output_dir>/<input stem>.wav``.  Files
    whose output already exists are skipped, so the step can be re-run.

    Parameters
    ----------
    af : str
        Path of the input audio file (e.g. an mp3).
    output_dir : str, optional
        Directory that receives the WAV file.  Defaults to ``OUTPUT_DIR``.
    sample_rate : int, optional
        Target sample rate in Hz.  Defaults to ``SAMPLE_RATE``.
    max_duration : float, optional
        End time, in seconds, passed to the sox ``trim`` effect.

    Returns
    -------
    None
    """
    # Fall back to the notebook-level configuration so the function works on
    # a fresh kernel without relying on leftover lowercase globals.
    if output_dir is None:
        output_dir = OUTPUT_DIR
    if sample_rate is None:
        sample_rate = SAMPLE_RATE

    stem = os.path.splitext(os.path.basename(af))[0]
    outfile = os.path.join(output_dir, stem + '.wav')

    # already converted on a previous run
    if os.path.exists(outfile):
        return

    # Build into a hidden temporary file first and only move it into place
    # once sox has finished.  An interrupted run therefore never leaves a
    # truncated WAV behind that the existence check above would skip forever.
    # The name keeps the .wav extension because sox infers the output format
    # from it; the leading dot keeps it out of '*.wav' glob results.
    partial = os.path.join(output_dir, '.' + stem + '.partial.wav')

    tfm = sox.Transformer()
    tfm.trim(start_time=0.0, end_time=max_duration)
    tfm.convert(samplerate=sample_rate,
                n_channels=1,
                bitdepth=16)
    try:
        tfm.build(input_filepath=af, output_filepath=partial)
        os.replace(partial, outfile)
    finally:
        # clean up if the build failed or was interrupted
        if os.path.exists(partial):
            os.remove(partial)

In [16]:
# Process the conversion using all cores in parallel to save time.
# The results are discarded (assigned to _); convert_to_wav is run for the
# files it writes.
_ = Parallel(n_jobs=num_cores, verbose=1)(delayed(convert_to_wav)(i) for i in audio_files)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.8s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   10.7s


KeyboardInterrupt: 

In [23]:
def crop_pad_audio(af, output_dir=None, target_duration=10.0):
    """Pad or trim one WAV file so it lasts exactly ``target_duration`` seconds.

    Shorter files get silence appended, longer files are cut at
    ``target_duration``.  The result is written to
    ``<output_dir>/<basename of af>``; if that is a different path than
    ``af``, the source file is removed afterwards.  Files that already have
    the target duration are left untouched.

    Parameters
    ----------
    af : str
        Path of the WAV file to fix.
    output_dir : str, optional
        Directory that receives the result.  Defaults to ``OUTPUT_DIR``.
    target_duration : float, optional
        Desired clip length in seconds.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If sox cannot report a duration for ``af``.
    """
    # Fall back to the notebook-level configuration so the function works on
    # a fresh kernel without relying on a leftover lowercase global.
    if output_dir is None:
        output_dir = OUTPUT_DIR

    duration = sox.file_info.duration(af)  # duration in seconds
    if duration is None:
        raise ValueError(f'Could not determine the duration of {af}')

    # nothing to do for files that already have the right length
    if duration == target_duration:
        return

    basename = os.path.basename(af)
    stem, ext = os.path.splitext(basename)
    outfile = os.path.join(output_dir, basename)

    # sox cannot write onto its own input, so build into a hidden temporary
    # file next to the destination and swap it in atomically.  If sox fails,
    # the source file is still intact and still in its original location.
    partial = os.path.join(output_dir, '.' + stem + '.partial' + ext)

    tfm = sox.Transformer()
    if duration < target_duration:
        tfm.pad(start_duration=0.0, end_duration=(target_duration - duration))
    else:
        tfm.trim(start_time=0.0, end_time=target_duration)

    try:
        tfm.build(input_filepath=af, output_filepath=partial)
        os.replace(partial, outfile)
    finally:
        # clean up if the build failed or was interrupted
        if os.path.exists(partial):
            os.remove(partial)

    # The source is consumed by this step; drop it when the result was
    # written somewhere else.
    if os.path.realpath(af) != os.path.realpath(outfile):
        os.remove(af)
      

In [24]:
# Collect the WAV files produced by the first pass.  Defined here so the cell
# runs on a fresh kernel; sorted so the processing order is reproducible.
wav_audio_files = sorted(glob(os.path.join(OUTPUT_DIR, '*.wav')))

print(f'Starting to process {len(wav_audio_files)} files')
_ = Parallel(n_jobs=num_cores, verbose=1)(delayed(crop_pad_audio)(i) for i in wav_audio_files)

Starting to process 271 files


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    8.2s
[Parallel(n_jobs=4)]: Done 271 out of 271 | elapsed:   11.5s finished


In [26]:
# Sanity check
# Make sure all the files are exactly 10 seconds long

def check_length(pf, sample_rate=None, expected_duration=10.0):
    """Verify the duration, sample rate and channel count of one audio file.

    Parameters
    ----------
    pf : str
        Path of the processed audio file.
    sample_rate : int, optional
        Expected sample rate in Hz.  Defaults to ``SAMPLE_RATE``.
    expected_duration : float, optional
        Expected clip length in seconds.

    Returns
    -------
    bool or tuple
        ``False`` when the file passes every check.  Otherwise a tuple
        describing the problem: ``(duration, sample_rate, bitrate, path)``
        for a duration or sample-rate mismatch, or ``(channels, path)`` when
        the file is not mono.
    """
    # Fall back to the notebook-level configuration so the function works on
    # a fresh kernel without relying on a leftover lowercase global.
    if sample_rate is None:
        sample_rate = SAMPLE_RATE

    duration = sox.file_info.duration(pf)
    sr = sox.file_info.sample_rate(pf)
    if duration != expected_duration or sr != sample_rate:
        # reuse the sample rate already read instead of probing the file again
        return (duration,
                sr,
                sox.file_info.bitrate(pf),
                pf)
    channels = sox.file_info.channels(pf)
    if channels != 1:
        return (channels, pf)
    return False

# Run the check on every WAV file in parallel.  Each entry of `errors` is
# False for a file that passed, or a tuple describing the problem.
print(f'Starting to process {len(wav_audio_files)} files')
errors = Parallel(n_jobs=num_cores, verbose=1)(delayed(check_length)(i) for i in wav_audio_files)

Starting to process 271 files


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  76 tasks      | elapsed:    2.0s
[Parallel(n_jobs=4)]: Done 271 out of 271 | elapsed:    6.3s finished


In [27]:
# keep only the truthy results, i.e. the files that failed a check
filtered_errors = list(filter(None, errors))

if filtered_errors:
    print('There were errors')
    print(filtered_errors)
else:
    print('All files were 10.0 seconds long.  Ready to move on.')

All files were 10.0 seconds long.  Ready to move on.
