# ML004.02: Dataset Builder

This script extracts small wavefile clips that can be used to build a dataset. 

The process is:
- Extract positive (POS) class samples e.g. corresponding to label 'C':
- Extract random (NEG) negative samples
- Extract confounding (CON) samples e.g. corresponding to label 'O' or 'M'

In [1]:
import sys
import numpy as np
import pylab
from pathlib import Path
import os
import sys
import pickle
# datetimes
import datetime
# timezones
from zoneinfo import ZoneInfo
# pickles
import pickle
# to load a wavfile:
from scipy.io import wavfile
import pandas as pd

import os

In [2]:
clip_length = 3.0 # Fixed clip length in seconds
POSITIVE_CLASS = ['C'] # Can be single e.g. ['c'] or multiples e.g. ['c','d']
CONFOUNDING_CLASS = ['O','M']
POSITIVE_PREFIX = 'POS' # How to label output clips for positive samples e.g. POS_000001.wav
NEGATIVE_PREFIX = 'NEG' # How to label output clips for negative samples e.g. NEG_000001.wav
CONFOUNDING_PREFIX = 'CON' # How to label output clips for confounding samples e.g. CON_0000001.wav
MAX_LIMIT = None # What to limit the positive class dataset size to. Set to "None" for no limit.
NEG_POS_BALANCE_RATIO = 1.0 # Ratio of negative to positive samples. 1.0 keeps the set balanced.

In [3]:
annotation_file = 'AcousticAnnotations001.pb'
#data_directory="C:\\CloudData\\2024\\Nepal\\N001\\Audio" 
data_directory = "C:\\Users\\Amogh\\OneDrive - University of Cambridge\\Programming-New\\CaracalChitalDetector\\cnn\\data"
#output_directory = "C:\\CloudData\\2024\\Nepal\\N001\\Dataset"
output_directory = "C:\\Users\\Amogh\\OneDrive - University of Cambridge\\Programming-New\\CaracalChitalDetector\\cnn\\output"

# Step 1: Get the dataset

In [4]:
# load up the annotations
with open(annotation_file,'rb') as handle:
    annotation_df = pickle.load(handle)
print("Retrieved a total of ",len(annotation_df),"annotations")
display(annotation_df)

Retrieved a total of  1742 annotations


Unnamed: 0,LocationName,SourceFile,AnnotationType,RelativeStartTime,RelativeEndTime,StartTime,EndTime,FileStartTime,LowFreq,HighFreq,Annotation
0,CAR101,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:42:15.499436,0 days 00:42:45.632668,2023-12-16 10:42:15.499436+00:00,2023-12-16 10:42:45.632668+00:00,2023-12-16 10:00:00,1365.9,3531.5,L
1,CAR101,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:24:31.149951,0 days 00:25:28.165774,2023-12-16 11:24:31.149951+00:00,2023-12-16 11:25:28.165774+00:00,2023-12-16 11:00:00,1232.7,3065.0,M
2,CAR101,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:25:56.754382,0 days 00:26:34.382789,2023-12-16 11:25:56.754382+00:00,2023-12-16 11:26:34.382789+00:00,2023-12-16 11:00:00,1299.3,3065.0,M
3,CAR101,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:30:24.621957,0 days 00:31:06.248661,2023-12-16 11:30:24.621957+00:00,2023-12-16 11:31:06.248661+00:00,2023-12-16 11:00:00,1266.0,2915.1,M
4,CAR101,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:25:40.272799,0 days 00:26:21.766226,2023-12-18 10:25:40.272799+00:00,2023-12-18 10:26:21.766226+00:00,2023-12-18 10:00:00,366.5,8000.0,L?
...,...,...,...,...,...,...,...,...,...,...,...
1737,CAR7,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:14:41.340044,0 days 00:18:09.809745,2023-12-13 12:14:41.340044+00:00,2023-12-13 12:18:09.809745+00:00,2023-12-13 12:00:00,412.6,3044.4,C
1738,CAR7,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:23:08.861535,0 days 00:23:20.634295,2023-12-13 12:23:08.861535+00:00,2023-12-13 12:23:20.634295+00:00,2023-12-13 12:00:00,0.0,8000.0,L
1739,CAR7,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:39:34.914375,0 days 00:39:37.046799,2023-12-15 18:39:34.914375+00:00,2023-12-15 18:39:37.046799+00:00,2023-12-15 18:00:00,301.0,8000.0,C
1740,CAR7,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:39:58.574604,0 days 00:40:00.351625,2023-12-15 18:39:58.574604+00:00,2023-12-15 18:40:00.351625+00:00,2023-12-15 18:00:00,195.6,8000.0,C


# Step 2: Positive annotations

Filter out the positive annotations

In [5]:
df_pos_filter = annotation_df[annotation_df['Annotation'].isin(POSITIVE_CLASS)]
display(df_pos_filter)

Unnamed: 0,LocationName,SourceFile,AnnotationType,RelativeStartTime,RelativeEndTime,StartTime,EndTime,FileStartTime,LowFreq,HighFreq,Annotation
5,CAR204,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:26:33.889389,0 days 00:26:35.170793,2024-03-23 14:25:33.889389+00:00,2024-03-23 14:25:35.170793+00:00,2024-03-23 13:59:00,830.3,1284.1,C
6,CAR204,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:26:35.547677,0 days 00:26:36.716016,2024-03-23 14:25:35.547677+00:00,2024-03-23 14:25:36.716016+00:00,2024-03-23 13:59:00,714.0,1190.0,C
7,CAR204,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:26:39.504955,0 days 00:26:40.673295,2024-03-23 14:25:39.504955+00:00,2024-03-23 14:25:40.673295+00:00,2024-03-23 13:59:00,714.0,1223.2,C
8,CAR204,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:26:44.140624,0 days 00:26:45.685847,2024-03-23 14:25:44.140624+00:00,2024-03-23 14:25:45.685847+00:00,2024-03-23 13:59:00,708.5,1217.7,C
9,CAR204,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:26:48.324032,0 days 00:26:49.680814,2024-03-23 14:25:48.324032+00:00,2024-03-23 14:25:49.680814+00:00,2024-03-23 13:59:00,669.7,1289.7,C
...,...,...,...,...,...,...,...,...,...,...,...
1736,CAR7,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:14:07.526428,0 days 00:14:34.359436,2023-12-13 12:14:07.526428+00:00,2023-12-13 12:14:34.359436+00:00,2023-12-13 12:00:00,569.0,2731.4,C
1737,CAR7,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:14:41.340044,0 days 00:18:09.809745,2023-12-13 12:14:41.340044+00:00,2023-12-13 12:18:09.809745+00:00,2023-12-13 12:00:00,412.6,3044.4,C
1739,CAR7,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:39:34.914375,0 days 00:39:37.046799,2023-12-15 18:39:34.914375+00:00,2023-12-15 18:39:37.046799+00:00,2023-12-15 18:00:00,301.0,8000.0,C
1740,CAR7,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:39:58.574604,0 days 00:40:00.351625,2023-12-15 18:39:58.574604+00:00,2023-12-15 18:40:00.351625+00:00,2023-12-15 18:00:00,195.6,8000.0,C


In [6]:
def load_clip(filename:str,start_offset,end_offset,cliplen):
    """
    Given a filename, return a wavefile and sampling rate
    
    - filename: the wavefile to load
    - start_offset: the offset time in seconds according to the annotation
    - end_offset: the file offset time in seconds according to the annotation
    - cliplen: the desired length of the returned wavefile.
    * If the length between start_offset and end_offset is less than clip_len, we start at start_offset and pad to clip_len
    * If the length between start_offset and end_offset is greater than clip_len, we start at start_offset and bound to clip_len
    
    Returns:
    - wavefile samples
    - sampling rate
    - cutstatus: 
    * True: the annotation has been cut
    * False: The annotation is complete, with padding
    """
    rate,data =wavfile.read(filename)
    annotationLen = end_offset-start_offset
    sampleStart = int(rate*start_offset)
    sampleEnd = sampleStart+int(cliplen*rate)
    dataclipped = data[sampleStart:sampleEnd]
    return dataclipped,rate,(annotationLen>cliplen)



In [7]:
def annotation_file_to_wavfile(annotation_file,basepath):
    """Convert an annotation file to the corresponding wavefile"""
    af = os.path.split(annotation_file)[1]
    wavefile = af.split('.')[0]+'.wav'
    full_filename = os.path.join(basepath,wavefile)
    return full_filename


In [8]:
if MAX_LIMIT is None:
    PositiveTargetLimit = len(df_pos_filter)
else:
    PositiveTargetLimit = MAX_LIMIT
cut_count = 0
for k in range(PositiveTargetLimit):
    # get the annotation
    annotation = df_pos_filter.iloc[k]
    # map to audio filename
    ff = annotation_file_to_wavfile(annotation.SourceFile,data_directory)
    # get the start/end times
    dtStart = annotation.RelativeStartTime.total_seconds()
    dtEnd = annotation.RelativeEndTime.total_seconds()
    # pull out the clip
    d,r,c = load_clip(ff,dtStart,dtEnd,clip_length)
    # save
    strk = f"{POSITIVE_PREFIX}_{k:06d}.wav"
    outfilename = os.path.join(output_directory,strk)
    wavfile.write(outfilename,r,d)
    # update number of annotations that have been cut/truncated
    if c: cut_count+=1
print(cut_count)

93


# Step 3: Confounder annotations

Pull out samples that have been marked as being similar enough to ask as confounders/confuse the classifier.

In [9]:
df_con_filter = annotation_df[annotation_df['Annotation'].isin(CONFOUNDING_CLASS)]
display(df_con_filter)
if MAX_LIMIT is None:
    PositiveTargetLimit = len(df_con_filter)
else:
    PositiveTargetLimit = MAX_LIMIT
cut_count = 0
for k in range(PositiveTargetLimit):
    # get the annotation
    annotation = df_con_filter.iloc[k]
    # map to audio filename
    ff = annotation_file_to_wavfile(annotation.SourceFile,data_directory)
    # get the start/end times
    dtStart = annotation.RelativeStartTime.total_seconds()
    dtEnd = annotation.RelativeEndTime.total_seconds()
    # pull out the clip
    d,r,c = load_clip(ff,dtStart,dtEnd,clip_length)
    # save
    strk = f"{CONFOUNDING_PREFIX}_{k:06d}.wav"
    outfilename = os.path.join(output_directory,strk)
    wavfile.write(outfilename,r,d)
    # update number of annotations that have been cut/truncated
    if c: cut_count+=1
print(cut_count)

Unnamed: 0,LocationName,SourceFile,AnnotationType,RelativeStartTime,RelativeEndTime,StartTime,EndTime,FileStartTime,LowFreq,HighFreq,Annotation
1,CAR101,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:24:31.149951,0 days 00:25:28.165774,2023-12-16 11:24:31.149951+00:00,2023-12-16 11:25:28.165774+00:00,2023-12-16 11:00:00,1232.7,3065.0,M
2,CAR101,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:25:56.754382,0 days 00:26:34.382789,2023-12-16 11:25:56.754382+00:00,2023-12-16 11:26:34.382789+00:00,2023-12-16 11:00:00,1299.3,3065.0,M
3,CAR101,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:30:24.621957,0 days 00:31:06.248661,2023-12-16 11:30:24.621957+00:00,2023-12-16 11:31:06.248661+00:00,2023-12-16 11:00:00,1266.0,2915.1,M
28,CAR204,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:07:40.680845,0 days 00:07:42.791393,2024-03-25 08:05:40.680845+00:00,2024-03-25 08:05:42.791393+00:00,2024-03-25 07:58:00,1015.4,1406.5,O
29,CAR204,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:27:01.633504,0 days 00:27:03.581144,2024-03-25 17:12:01.633504+00:00,2024-03-25 17:12:03.581144+00:00,2024-03-25 16:45:00,1014.8,3096.3,O
...,...,...,...,...,...,...,...,...,...,...,...
1731,CAR6,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:24:13.905185,0 days 00:24:16.881694,2023-12-15 02:24:13.905185+00:00,2023-12-15 02:24:16.881694+00:00,2023-12-15 02:00:00,1332.6,3298.2,M
1732,CAR6,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:24:20.302458,0 days 00:24:23.900925,2023-12-15 02:24:20.302458+00:00,2023-12-15 02:24:23.900925+00:00,2023-12-15 02:00:00,1266.0,3398.2,M
1733,CAR7,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:01:43.094771,0 days 00:03:25.581171,2023-12-13 09:01:43.094771+00:00,2023-12-13 09:03:25.581171+00:00,2023-12-13 09:00:00,428.3,4169.3,M
1734,CAR7,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:00:59.508763,0 days 00:02:15.418375,2023-12-13 10:00:59.508763+00:00,2023-12-13 10:02:15.418375+00:00,2023-12-13 10:00:00,728.2,4473.6,M


149


# Step 4: Random negative annotations

These are clips that are taken at random points in the files where no annotations exist. This is useful for sampling arbitrary background noise.

In [46]:
# random seed
np.random.seed(100)
neg_count = 0
neg_target = int(len(df_pos_filter)*NEG_POS_BALANCE_RATIO)
neg_array = []

while (neg_count <= neg_target):
    # Step 1 - pick a random annotation (can be anything)
    sampleIdx = np.random.randint(len(annotation_df))
    positive_annotation = annotation_df.iloc[sampleIdx]
    
    # Step 2 - find all other annotations within the same file
    posSourceFile = positive_annotation.SourceFile
    similar_events_df = annotation_df.query("SourceFile == @posSourceFile" )
    print("Annotations in file",len(similar_events_df))
    
    # Step 3: Load the file (so we can see how long it is)
    # map to audio filename
    ff = annotation_file_to_wavfile(annotation.SourceFile,data_directory)
    r,data = wavfile.read(ff)
    wavLen = len(data)/r
    
    # Step 4: Pick a random starting time
    rStart = datetime.timedelta(seconds = np.random.uniform(low=0,high=wavLen))
    rEnd = rStart +  datetime.timedelta(seconds=clip_length)
    print(rStart,rEnd)
    
    # Step 5: Check for overlap
    overlapDf = similar_events_df.query("SourceFile == @posSourceFile and RelativeStartTime>=@rStart and RelativeStartTime <=@rEnd")
    if (len(overlapDf)> 0):
        print("overlap")
        # there was an overlap - restart the loop
        continue
        
    # Step 6: pull out the clip
    d = data[int(rStart.total_seconds()*r):int(rEnd.total_seconds()*r)]

    # ignore if clip is not of correct length (3 seconds)
    if len(d) < 3*r:
        continue
    
    # save
    strk = f"{NEGATIVE_PREFIX}_{neg_count:06d}.wav"
    outfilename = os.path.join(output_directory,strk)
    wavfile.write(outfilename,r,d)

    # record where this clip came from
    neg_record = {}
    neg_record["index"]=k
    neg_record["wavfile"]=strk
    neg_record['fullfilename']=outfilename
    neg_record["sourceFile"]=posSourceFile
    neg_record["startOffset"]=rStart
    neg_record["endOffset"]=rEnd
    neg_array.append(neg_record)
    neg_count+=1

# save the negative dataframe
negDf =pd.DataFrame(neg_array)
negDf.to_csv("negative_dataset.csv")

Annotations in file 18
0:04:23.200393 0:04:26.200393
Annotations in file 16
0:05:31.287408 0:05:34.287408
Annotations in file 243
0:01:01.455817 0:01:04.455817
overlap
Annotations in file 45
0:04:23.040961 0:04:26.040961
Annotations in file 18
0:02:57.546480 0:03:00.546480
Annotations in file 175
0:03:45.528600 0:03:48.528600
overlap
Annotations in file 185
0:04:04.830150 0:04:07.830150
overlap
Annotations in file 28
0:01:12.678315 0:01:15.678315
Annotations in file 18
0:00:02.281270 0:00:05.281270
Annotations in file 243
0:06:23.777103 0:06:26.777103
Annotations in file 25
0:01:07.428388 0:01:10.428388
Annotations in file 175
0:02:02.100245 0:02:05.100245
overlap
Annotations in file 138
0:02:49.297113 0:02:52.297113
Annotations in file 175
0:04:11.752974 0:04:14.752974
overlap
Annotations in file 7
0:02:11.809662 0:02:14.809662
Annotations in file 175
0:03:43.039249 0:03:46.039249
overlap
Annotations in file 175
0:00:02.230805 0:00:05.230805
overlap
Annotations in file 57
0:04:13.6065