# Create train and test sets
To create the validation and test sets, we hold out a fixed proportion of the files from each label (class); the remaining files of each label form the training set.

In [36]:
import os
import shutil

from glob import glob

from tqdm import tqdm

import numpy as np

In [8]:
# Source folder; the glob below looks for .wav files exactly one directory level beneath it.
featurefolder = 'data/train_10sec_16khz/'
# Root folder that the split copies are written under.
OUTPATH = 'data/audio_split'

# Every .wav file found one sub-directory below the source folder.
ffiles = glob(
    os.path.join(featurefolder, '*/*.wav')
)

In [9]:
# Show the first five matched paths as a quick check of what the glob picked up.
ffiles[:5]

['data/train_10sec_16khz/olsfly/XC216606.wav',
 'data/train_10sec_16khz/olsfly/XC416924.wav',
 'data/train_10sec_16khz/olsfly/XC216607.wav',
 'data/train_10sec_16khz/olsfly/XC467716.wav',
 'data/train_10sec_16khz/olsfly/XC184678.wav']

In [10]:
# Total number of .wav files matched by the glob.
len(ffiles)

21375

In [16]:
# The class labels are the names of the directories directly under featurefolder;
# plain files sitting in that folder are skipped.
classes = [
    entry
    for entry in os.listdir(featurefolder)
    if os.path.isdir(os.path.join(featurefolder, entry))
]
classes

['olsfly',
 'houspa',
 'semplo',
 'evegro',
 'brnthr',
 'truswa',
 'brespa',
 'gryfly',
 'canwre',
 'belkin1',
 'bkhgro',
 'sagthr',
 'comrav',
 'bnhcow',
 'rufhum',
 'blujay',
 'ribgul',
 'cliswa',
 'carwre',
 'comgol',
 'daejun',
 'pygnut',
 'btywar',
 'gadwal',
 'calgul',
 'bongul',
 'rebwoo',
 'caster1',
 'mouchi',
 'horgre',
 'hoomer',
 'pinsis',
 'banswa',
 'whtswi',
 'annhum',
 'rebnut',
 'comgra',
 'amerob',
 'brebla',
 'blugrb1',
 'magwar',
 'merlin',
 'rocpig',
 'reevir1',
 'westan',
 'amered',
 'norcar',
 'nrwswa',
 'grycat',
 'reshaw',
 'semsan',
 'gnwtea',
 'rufgro',
 'bewwre',
 'greegr',
 'whcspa',
 'barswa',
 'bkbwar',
 'sora',
 'macwar',
 'chiswi',
 'easblu',
 'norpin',
 'scoori',
 'sonspa',
 'greroa',
 'y00475',
 'amepip',
 'bulori',
 'gockin',
 'buwwar',
 'spotow',
 'lotduc',
 'gcrfin',
 'boboli',
 'rthhum',
 'amekes',
 'wooscj2',
 'chukar',
 'yehbla',
 'chispa',
 'bktspa',
 'balori',
 'juntit1',
 'canwar',
 'labwoo',
 'bawwar',
 'yelwar',
 'rudduc',
 'linspa',
 'whts

In [30]:
# Fraction of each class held out for the test and validation splits;
# whatever is left over goes to the training split.
test_proportion = 0.15
val_proportion = 0.15
train_files = []
test_files = []
val_files = []

# Fixed seed so that re-running the notebook reproduces the same split.
split_rng = np.random.RandomState(42)

for c in tqdm(classes):
    # glob returns paths in arbitrary order, so sort first: the seeded shuffle
    # only gives the same result when it starts from a stable ordering.
    files = sorted(glob(os.path.join(featurefolder, c, '*.wav')))
    split_rng.shuffle(files)

    # Split per class so each label contributes to every split. The counts are
    # floored, so a class with very few files may contribute none to test/validation.
    test_bound = int(np.floor(len(files) * test_proportion))
    # The validation size must come from val_proportion (the original used
    # test_proportion here, which silently ignored val_proportion).
    val_bound = test_bound + int(np.floor(len(files) * val_proportion))

    test_files += files[:test_bound]
    val_files += files[test_bound:val_bound]
    train_files += files[val_bound:]

100%|██████████| 264/264 [00:00<00:00, 1762.70it/s]


In [31]:
# Report how many files ended up in each in-memory split list.
for split_label, split_list in (
    ('Test files:', test_files),
    ('Validation files:', val_files),
    ('Train files:', train_files),
):
    print(split_label, len(split_list))

Test files: 3144
Validation files: 3144
Train files: 15087


# Copy files to new directory

In [40]:
def copy_split(files, name):
    """Copy files into ``OUTPATH/<name>/<class folder>/<file name>``.

    Parameters
    ----------
    files : iterable of str
        Paths of the files to copy. The immediate parent directory of each
        file is reused as the class folder in the destination.
    name : str
        Name of the split sub-folder created under ``OUTPATH``
        (for example ``'test'``).
    """
    for file in tqdm(files):
        # Take the class folder and file name from the END of the path rather
        # than from a fixed index after splitting on '/', so the result does
        # not depend on how deep the source folder is or on the path separator.
        class_dir = os.path.basename(os.path.dirname(file))
        outpath = os.path.join(OUTPATH, name, class_dir)

        # make the directory if it doesn't exist
        os.makedirs(outpath, exist_ok=True)

        shutil.copyfile(file, os.path.join(outpath, os.path.basename(file)))

In [42]:
# Copy each split into its own sub-folder, in the order test, validation, train.
for split_list, split_name in (
    (test_files, 'test'),
    (val_files, 'validation'),
    (train_files, 'train'),
):
    copy_split(split_list, split_name)


100%|██████████| 3144/3144 [00:05<00:00, 581.24it/s]
100%|██████████| 3144/3144 [00:06<00:00, 499.77it/s]
100%|██████████| 15087/15087 [00:30<00:00, 495.89it/s]


# Sanity check

In [44]:
# Count the .wav files actually present on disk under each split folder, so the
# numbers can be compared with the sizes of the in-memory lists printed earlier.
files = glob(os.path.join(OUTPATH, 'test/*/*.wav'))
print('Test files:',len(files))

files = glob(os.path.join(OUTPATH, 'validation/*/*.wav'))
print('Validation files:',len(files))

files = glob(os.path.join(OUTPATH, 'train/*/*.wav'))
# Report the on-disk count (the original printed len(train_files), the
# in-memory list, so the train copy was never actually checked).
print('Train files:',len(files))

Test files: 3144
Validation files: 3144
Train files: 15087
