# Create train and test sets
To create a test set we will pull proportion from each of the labels we have.

In [1]:
import os
import shutil
from glob import glob
from tqdm import tqdm
import pandas as pd
import numpy as np
import json
from sklearn.preprocessing import LabelEncoder

In [2]:
DATA_PATH = 'data/audio_10sec/'

# Combine datasets

In [3]:
df_kag = pd.read_csv('data/train.csv')
df_kag['fileid'] = df_kag['filename'].str.replace('.mp3', '', regex=False)
df_kag = df_kag[['fileid', 'species']]
df_kag['source'] = 'kaggle'
df_kag.head(3)

Unnamed: 0,fileid,species,source
0,XC134874,Alder Flycatcher,kaggle
1,XC135454,Alder Flycatcher,kaggle
2,XC135455,Alder Flycatcher,kaggle


In [4]:
with open('data/xc.json', 'r') as f:
    xc = json.load(f)
df_xc = pd.DataFrame(xc['results'])
df_xc['fileid'] = 'XC' + df_xc['id']

df_xc = df_xc[['fileid', 'en']]
df_xc.columns = ['fileid', 'species']
df_xc['source'] = 'xc'

df_xc.head(3)

Unnamed: 0,fileid,species,source
0,XC554809,Alder Flycatcher,xc
1,XC552408,Alder Flycatcher,xc
2,XC544552,Alder Flycatcher,xc


In [5]:
def remove_missing_files(df):
    # get all wav files
    files = glob(os.path.join(DATA_PATH, '*.wav'))
    
    # get file id
    files = [os.path.splitext(os.path.basename(x))[0] for x in files]
    
    return df[df['fileid'].isin(files)]

In [6]:
df = pd.concat([df_kag, df_xc])
df = remove_missing_files(df)

df['species'].nunique()

264

In [7]:
df

Unnamed: 0,fileid,species,source
0,XC134874,Alder Flycatcher,kaggle
1,XC135454,Alder Flycatcher,kaggle
2,XC135455,Alder Flycatcher,kaggle
3,XC135456,Alder Flycatcher,kaggle
4,XC135457,Alder Flycatcher,kaggle
...,...,...,...
62676,XC375955,Yellow-throated Vireo,xc
62677,XC292664,Yellow-throated Vireo,xc
62678,XC134182,Yellow-throated Vireo,xc
62679,XC93683,Yellow-throated Vireo,xc


In [8]:
# check for exact duplicates and make sure they are all from xc
df[df.duplicated(subset=['fileid', 'species'])]['source'].unique()

array(['xc'], dtype=object)

In [9]:
#drop them
df = df.drop_duplicates(subset=['fileid', 'species'])

In [10]:
# inspect remaining duplicated files
df[df.duplicated(subset=['fileid'], keep=False)].sort_values('fileid').head(10)

Unnamed: 0,fileid,species,source
12825,XC321750,Canyon Wren,xc
16657,XC321750,Rock Wren,kaggle
42417,XC367360,Peregrine Falcon,xc
18387,XC367360,Swainson's Hawk,kaggle
23513,XC376774,Field Sparrow,xc
15333,XC376774,Prairie Warbler,kaggle
715,XC381535,American Robin,kaggle
6949,XC381535,Black-headed Grosbeak,xc
716,XC381579,American Robin,kaggle
6948,XC381579,Black-headed Grosbeak,xc


The kaggle dataset has been revised and corrected so we will take any labels from that set where labels differ.

In [11]:
# get index of xc records to drop
dup = df[df.duplicated(subset=['fileid'], keep=False)]
dup_idx = dup[dup['source'] == 'xc'].index
df = df.drop(dup_idx)

In [12]:
# recheck to make sure there are no duplicates
df[df.duplicated(subset=['fileid'], keep=False)].sort_values('fileid')

Unnamed: 0,fileid,species,source


In [13]:
# get all classes in alphabetical order
classes = sorted(df['species'].unique())
classes[:5]

['Alder Flycatcher',
 'American Avocet',
 'American Bittern',
 'American Bushtit',
 'American Cliff Swallow']

In [14]:
# number of classes
len(classes)

264

## Generate Train, Validation, Test Split

In [15]:
def split_from_df(df, class_col, test_prop, val_prop):

    train = {'files': [], 'labels':[]}
    val = {'files': [], 'labels':[]}
    test = {'files': [], 'labels':[]}
    
    grouped = df.groupby(class_col)
    for name, group in grouped:
        
        # randomly select test rows
        test_rows = group.sample(frac=test_prop, replace=False)
        test['files'] += test_rows['fileid'].tolist()
        test['labels'] += [name] * len(test_rows)
        
        # remove the test rows
        group = group.drop(test_rows.index)
        
        #randomly select validation rows
        val_rows = group.sample(frac=val_prop, replace=False)
        val['files'] += val_rows['fileid'].tolist()
        val['labels'] += [name] * len(val_rows)
        
        # remove the validation rows
        group = group.drop(val_rows.index)
        
        # train is everything left over
        train_rows = group
        train['files'] += train_rows['fileid'].tolist()
        train['labels'] += [name] * len(train_rows)
        
    return train, val, test


In [16]:
train, val, test = split_from_df(df, 'species', 0.15, 0.15)
train_kag, val_kag, test_kag = split_from_df(df_kag, 'species', 0.15, 0.15)

In [17]:
def verfiy_split(train, val, test):
    print('num files == num labels:', len(train['files']) == len(train['labels']))

    # make sure num labels == num files
    print('Train files and labels equal length: ', len(train['files']) == len(train['labels']))
    print('Validation files and labels equal length: ', len(val['files']) == len(val['labels']))
    print('Test files and labels equal length: ', len(test['files']) == len(test['labels']))

    # make sure there is no overlap between the sets
    nunique = len(set(train['files'] + val['files'] + test['files']))
    total = len(train['files'] + val['files'] + test['files'])
    print('All filenames are unique:', nunique == total)
    print(f'Unique: {nunique}, Total: {total}')

In [18]:
verfiy_split(train, val, test)

num files == num labels: True
Train files and labels equal length:  True
Validation files and labels equal length:  True
Test files and labels equal length:  True
All filenames are unique: True
Unique: 61249, Total: 61249


In [19]:
verfiy_split(train_kag, val_kag, test_kag)

num files == num labels: True
Train files and labels equal length:  True
Validation files and labels equal length:  True
Test files and labels equal length:  True
All filenames are unique: True
Unique: 21375, Total: 21375


## Encode labels

Sparse encode the lables because we are only using one identifier per class

In [20]:
# use label encoder
le =LabelEncoder()
le.fit(classes)

# transform labels
train['encoded_labels'] = le.transform(train['labels']).tolist()
val['encoded_labels'] = le.transform(val['labels']).tolist()
test['encoded_labels'] = le.transform(test['labels']).tolist()

train_kag['encoded_labels'] = le.transform(train_kag['labels']).tolist()
val_kag['encoded_labels'] = le.transform(val_kag['labels']).tolist()
test_kag['encoded_labels'] = le.transform(test_kag['labels']).tolist()

In [21]:
# check to make sure encoding worked
reconstructed_labels = [classes[x] for x in test['encoded_labels']]

# check to make sure they match
reconstructed_labels == test['labels']

True

The labels match when reconstructed

## save as json

In [22]:
data = {'mapping': classes,
        'train': train,
        'val': val,
        'test': test}

data_kag = {'mapping': classes,
            'train': train_kag,
            'val': val_kag,
            'test': test_kag}

In [23]:
def save_json(data, fname):
    
    os.makedirs(os.path.split(fname)[0], exist_ok=True)
    
    with open(fname, 'w') as fb:
        json.dump(data, fb, indent=2)

In [29]:
save_json(data, os.path.join('data', 'resources', 'data_split_single_label.json'))
save_json(data_kag, os.path.join('data', 'resources', 'data_kag_split_single_label.json'))