# Augmentation for 600birds xeno-canto files

We have downloaded the xeno-canto recordings from 600+ species of common North American birds, split them into ~6s files (see `../1_splitting`), and then detected which files are silent and which are not (see `../2_silence_detection/`).

We will now:
* Identify which species to create a training set from (based on how many non-silent training files are available)
* Separate the data into train & test sets (where the train & test set do not contain split files that originated from the same long file)
* Perform augmentations on each training file:
    - Directly create one spectrogram image of the training file
    - Create two randomly "augmented" spectrograms of the training file
* Create spectrograms of each test file

## Identify species of interest

We want to exclude any species for which we do not have enough training data.

Load the silence-detection results, set a threshold for the number of non-silent 6s files we need for each species, then create a list of species that have enough files to be included in the model.

In [5]:
import pandas as pd
def count_non_silences(silence_csv, silence_col, count_num):
    '''
    Tally the rows of a silence csv that hold a given value

    Only the `silence_col` column is read from the csv. The
    value that is tallied is whatever `count_num` the caller
    passes in.

    Args:
        silence_csv (string): path to csv containing
            two columns, one of filenames and one of silences
        silence_col (string): name of the csv column that
            holds the silence information
        count_num (int): the value in `silence_col` that
            should be counted as non-silent

    Returns:
        int: number of rows whose `silence_col` equals `count_num`
    '''

    silence_values = pd.read_csv(silence_csv, usecols=[silence_col])[silence_col]
    is_match = silence_values == count_num
    return int(is_match.sum())

In [94]:
import glob
import os

# The directory containing split files: split_top_dir/<species_name>/<split_files>
split_top_dir = 'xeno-canto-splits'

# All silence csvs, expected one level down: split_top_dir/<species_name>/<name>.csv
# Joining the components supplies the path separator between the top
# directory and the wildcard; plain string concatenation left it out, so the
# pattern never descended into the species directories.
csvs = glob.glob(os.path.join(split_top_dir, '*', '*.csv'))

# Count number of non-silent files for each species csv
# NOTE: if a species directory holds more than one csv, the csv seen last
# overwrites the count of the earlier ones.
num_silences_dict = {}
for csv_path in csvs:
    # The species name is the directory that directly contains the csv
    species = os.path.basename(os.path.dirname(csv_path))
    num_silences_dict[species] = count_non_silences(
        silence_csv = csv_path, silence_col = 'silent', count_num = 0)

In [98]:
# Cut-off for keeping a species in the model: a species is kept only when
# its number of non-silent files is strictly greater than this value
num_files_threshold = 1

# Species whose non-silent file count exceeds the cut-off
keep_spp = [
    species
    for species, num_non_silent in num_silences_dict.items()
    if num_non_silent > num_files_threshold
]
keep_spp

['acanthis-flammea', 'acanthis-hornemanii']

## Train/test split

The train set and the test set should not overlap in source files. That is, they should not contain split files that originated from the same long file.

TODO: this currently splits based on the number of UNSPLIT files, not the number of split files, so the train/test ratio of the resulting split files may differ from `train_size`.

In [87]:
import math
import random

# The directory containing unsplit files: unsplit_top_dir/<species_name>/mp3s/<unsplit_files>
unsplit_top_dir = 'xeno-canto'
extension = '.txt' #For testing; change to '.mp3' or '.wav' depending on filetypes

# Fraction (between 0 and 1) of unsplit files to use as sources for the training set
train_size = 0.8
seed = 22
random.seed(seed)

# Create dictionary pairing species names with lists of files to use for train/test set
train_set_dict = {}
test_set_dict = {}
for sp in keep_spp:
    # glob returns paths in arbitrary (filesystem-dependent) order; sort them
    # first so that the seeded shuffle yields the same split on every machine
    all_files = sorted(glob.glob(f'{unsplit_top_dir}/{sp}/mp3s/*{extension}'))
    # Rounds down, so a species with a single unsplit file gets no training file
    num_train_files = math.floor(len(all_files)*train_size)
    random.shuffle(all_files)
    train_set_dict[sp] = all_files[:num_train_files]
    test_set_dict[sp] = all_files[num_train_files:]

In [88]:
# Display the unsplit source files selected for the training set, per species
train_set_dict

{'acanthis-flammea': ['xeno-canto/acanthis-flammea/mp3s/395211.txt'],
 'acanthis-hornemanii': ['xeno-canto/acanthis-hornemanii/mp3s/93659.txt']}

In [89]:
# Display the unsplit source files selected for the test set, per species
test_set_dict

{'acanthis-flammea': ['xeno-canto/acanthis-flammea/mp3s/243939.txt'],
 'acanthis-hornemanii': ['xeno-canto/acanthis-hornemanii/mp3s/92045.txt']}

In [103]:
def make_split_files_dict(split_top_dir, unsplit_dict, extension):
    '''
    From dict of unsplit files, generate dict of split files

    The ID of each unsplit file is its basename with `extension`
    removed. Every file matching
    <split_top_dir>/<species>/<id_num>*<extension>
    is collected as a split file of that source.

    Args:
        split_top_dir (str): top directory containing
            species directories and split files
        unsplit_dict (dict): dictionary with keys = species
            and values = lists of paths of unsplit files.
            Unsplit filenames must be of the format
            <any_directory>/<id_num><extension>
        extension (str): filename extension, including
            the leading dot (e.g. '.mp3')

    Returns:
        dict: keys = species, values = list of paths of the
        split files found for that species' unsplit files
        (sorted within each source file)
    '''

    # Imported locally because this notebook only imports `os` in a later cell
    import os

    split_files_dict = {}
    for sp, files in unsplit_dict.items():
        split_files = []

        for file in files:
            basename = os.path.basename(file)

            # Remove the extension as an exact suffix. str.strip() treats its
            # argument as a set of characters, so stripping '.mp3' would also
            # eat a trailing '3' of the ID ('395213.mp3' -> '39521').
            if extension and basename.endswith(extension):
                id_num = basename[:-len(extension)]
            else:
                id_num = basename

            # NOTE(review): this is a prefix match, so ID '123' also matches
            # split files of source '1234'. Tighten the pattern once the
            # delimiter between ID and split index is confirmed.
            pattern = f'{split_top_dir}/{sp}/{id_num}*{extension}'
            split_files.extend(sorted(glob.glob(pattern)))

        split_files_dict[sp] = split_files

    return split_files_dict

    
# Create list of split files based on source file ID numbers.
# The results get their own names so that the dictionaries of unsplit files
# are not overwritten; re-running this cell then still starts from the
# unsplit files instead of from its own previous output.
train_files_dict = make_split_files_dict(split_top_dir, train_set_dict, extension)
test_files_dict = make_split_files_dict(split_top_dir, test_set_dict, extension)

## Perform augmentations on each training file

Three training images should be created for each training file. One should be a true copy of the training file. Two others should be randomly augmented spectrograms of the training file.

## Create spectrograms of each test file

In [None]:
# Setup to import from other dirs
import sys
import os
# Directory three levels above the current working directory (presumably the
# repository root -- the relative path is resolved against the working
# directory at the time this cell runs)
REPO_PATH = os.path.abspath('../../..')
# Put that directory on the module search path so the imports below resolve
sys.path.append(REPO_PATH)

# Augmentation modules
import create_training_data.code.audio_aug as aa
import create_training_data.code.spectrogram_aug as sa

In [None]:
def save_spect(spect, spect_path, label_dict=None):
    '''
    Save spectrogram image and return its metadata
    
    Args:
        spect (Spectrogram object): spectrogram
            object to save--must be in PIL format
        spect_path (string): path at which to save
            spectrogram image
        label_dict (dict, optional): its keys are the labels
            that each get a 0/1 entry in the metadata. When
            omitted, the notebook-level `label_dict` variable
            is used.
            
    Returns:
        dict: 'path', 'manipulations' and 'sources' as read
        from `spect`, plus one entry per label holding 1 if
        that label is in `spect.labels` and 0 otherwise
    
    Raises:
        NameError: if `label_dict` is omitted and no
            notebook-level `label_dict` is defined
    '''
    
    # Resolve the labels before touching the disk so that a missing
    # `label_dict` does not leave an image without metadata behind
    if label_dict is None:
        try:
            label_dict = globals()['label_dict']
        except KeyError:
            raise NameError(
                "name 'label_dict' is not defined; "
                "pass label_dict to save_spect") from None
    
    # Save image
    spect.save_image(path = spect_path)
    
    # Create a dictionary for metadata
    # (`spect.save_path` is presumably set by `save_image` -- TODO confirm)
    metadata_dict = {
        'path':spect.save_path,
        'manipulations':spect.manipulations,
        'sources':spect.sources,
    }
        
    # Add information about what audio labels were in file 
    for label in label_dict:
        metadata_dict[label] = int(label in spect.labels)
    
    return metadata_dict

In [None]:
def augment_file(path, label, label_dict):
    '''
    Build one augmented spectrogram from an audio file
    
    The audio is loaded and run through the audio steps listed
    below, in order. The result is wrapped in a spectrogram and
    run through the spectrogram steps listed below, in order.
    
    Args:
        path (str): path to the audio file
        label (str): label for this audio file
        label_dict: forwarded to each `aa.sum_chunks` step
    
    Returns:
        the result of the final spectrogram `apply` call
    '''
    
    # (function, extra keyword arguments) for each audio manipulation
    audio_steps = [
        (aa.get_chunk, {}),
        # NOTE(review): get_chunk is applied twice in a row -- confirm intentional
        (aa.get_chunk, {}),
        (aa.cyclic_shift, {}),
        (aa.time_stretch_divisions, {}),
        (aa.pitch_shift_divisions, {}),
        (aa.random_filter, {}),
        # sum_chunks runs twice (presumably overlaying two other chunks -- TODO confirm)
        (aa.sum_chunks, {'label_dict': label_dict}),
        (aa.sum_chunks, {'label_dict': label_dict}),
    ]
    
    audio = aa.Audio(path = path, label = label)
    for step, step_kwargs in audio_steps:
        audio = audio.apply(func = step, **step_kwargs)
    
    # (function, extra keyword arguments) for each spectrogram manipulation
    spect_steps = [
        (sa.make_linear_spectrogram, {}),
        (sa.remove_random_hi_lo_bands, {}),
        (sa.resize_random_bands, {'rows_or_cols': 'rows'}),
        (sa.resize_random_bands, {'rows_or_cols': 'cols'}),
        (sa.resize_spect_random_interpolation, {'width': 299, 'height': 299}),
        (sa.color_jitter, {'hue': None}),
    ]
    
    spect = sa.Spectrogram(audio = audio)
    for step, step_kwargs in spect_steps:
        spect = spect.apply(func = step, **step_kwargs)
    
    return spect

In [None]:
def three_spects(path, label, label_dict, save_dir, num_manips = 2):
    '''
    Create one pristine and multiple manipulated spectrograms from an audio file
    
    Creates an unmanipulated spectrogram and a chosen number of
    manipulated spectrograms from an audio file, saves each one
    as a png in `save_dir`, and returns their metadata.
    
    Args:
        path (str): path to 6s-long audio file
        label (str): label for this audio file
        label_dict (dict): forwarded to `augment_file`; its keys
            are also used as the label columns of the result
        save_dir (str): directory in which to save results
        num_manips (int): number of manipulated 
            spectrograms to create (default 2)
    
    Returns:
        pandas.DataFrame: one row per saved image (the
        unmanipulated one first), with columns 'path',
        'manipulations', 'sources' and one column per key
        of `label_dict`
    '''
    
    # Create unmanipulated spectrogram
    unmanipulated_audio = aa.Audio(path = path, label = label).apply(func = aa.get_chunk)
    unmanipulated_spect = (
        sa.Spectrogram(audio = unmanipulated_audio)
            .apply(func = sa.make_linear_spectrogram)
            .apply(func = sa.resize_spect_random_interpolation,
                   width=299, height=299,
                   chance_random_interpolation = 0)
    )

    # Create manipulated spectrograms
    manipulated_spects = [
        augment_file(path = path, label = label, label_dict = label_dict)
        for _ in range(num_manips)
    ]
    
    # Name the images after the audio file (basename without extension), so
    # that different split files of one recording do not overwrite each other
    id_num = os.path.splitext(os.path.basename(path))[0]
    
    # Save each spectrogram and collect its metadata as one row
    rows = [
        save_spect(
            unmanipulated_spect,
            spect_path = os.path.join(save_dir, f'{id_num}_unmanipulated.png'))
    ]
    for idx, spect in enumerate(manipulated_spects):
        rows.append(
            save_spect(
                spect,
                spect_path = os.path.join(save_dir, f'{id_num}_manipulated_{idx}.png')))
    
    # Build the table in one go (DataFrame.append no longer exists in pandas 2.x)
    columns = ['path', 'manipulations', 'sources', *label_dict]
    return pd.DataFrame(rows, columns = columns)
