# ML004.02: Dataset Builder

This script extracts small wavefile clips that can be used to build a dataset. 

The process is:
- Extract positive (POS) class samples e.g. corresponding to label 'C':
- Extract random (NEG) negative samples
- Extract confounding (CON) samples e.g. corresponding to label 'O' or 'M'

In [1]:
import numpy as np
import os
import datetime
from zoneinfo import ZoneInfo
import pickle
from scipy.io import wavfile
import pandas as pd

In [2]:
clip_length = 3.0 # Fixed clip length in seconds
POSITIVE_CLASS = ['C'] # Can be single e.g. ['c'] or multiples e.g. ['c','d']
CONFOUNDING_CLASS = ['O','M']
POSITIVE_PREFIX = 'POS' # How to label output clips for positive samples e.g. POS_000001.wav
NEGATIVE_PREFIX = 'NEG' # How to label output clips for negative samples e.g. NEG_000001.wav
CONFOUNDING_PREFIX = 'CON' # How to label output clips for confounding samples e.g. CON_0000001.wav
MAX_LIMIT = None # What to limit the positive class dataset size to. Set to "None" for no limit.
NEG_POS_BALANCE_RATIO = 1.0 # Ratio of negative to positive samples. 1.0 keeps the set balanced.

In [3]:
annotation_file = 'AcousticAnnotations001.pb'
#data_directory="C:\\CloudData\\2024\\Nepal\\N001\\Audio" 
data_directory = "C:\\Users\\Amogh\\OneDrive - University of Cambridge\\Programming-New\\CaracalChitalDetector\\cnn\\data"
#output_directory = "C:\\CloudData\\2024\\Nepal\\N001\\Dataset"
output_directory = "C:\\Users\\Amogh\\OneDrive - University of Cambridge\\Programming-New\\CaracalChitalDetector\\cnn\\output"

# Step 1: Get the dataset

In [4]:
# load up the annotations
with open(annotation_file,'rb') as handle:
    annotation_df = pickle.load(handle)
print("Retrieved a total of ",len(annotation_df),"annotations")
display(annotation_df)

Retrieved a total of  1346 annotations


Unnamed: 0,LocationName,SourceFile,AnnotationType,RelativeStartTime,RelativeEndTime,StartTime,EndTime,FileStartTime,LowFreq,HighFreq,Annotation
0,CAR204,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:26:33.889389,0 days 00:26:35.170793,2024-03-23 14:25:33.889389+00:00,2024-03-23 14:25:35.170793+00:00,2024-03-23 13:59:00,830.3,1284.1,C
1,CAR204,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:26:35.547677,0 days 00:26:36.716016,2024-03-23 14:25:35.547677+00:00,2024-03-23 14:25:36.716016+00:00,2024-03-23 13:59:00,714.0,1190.0,C
2,CAR204,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:26:39.504955,0 days 00:26:40.673295,2024-03-23 14:25:39.504955+00:00,2024-03-23 14:25:40.673295+00:00,2024-03-23 13:59:00,714.0,1223.2,C
3,CAR204,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:26:44.140624,0 days 00:26:45.685847,2024-03-23 14:25:44.140624+00:00,2024-03-23 14:25:45.685847+00:00,2024-03-23 13:59:00,708.5,1217.7,C
4,CAR204,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:26:48.324032,0 days 00:26:49.680814,2024-03-23 14:25:48.324032+00:00,2024-03-23 14:25:49.680814+00:00,2024-03-23 13:59:00,669.7,1289.7,C
...,...,...,...,...,...,...,...,...,...,...,...
1341,CAR217,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:24:39.926739,0 days 00:24:41.560243,2024-03-27 18:09:39.926739+00:00,2024-03-27 18:09:41.560243+00:00,2024-03-27 17:45:00,639.4,1343.8,C
1342,CAR217,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:24:42.628304,0 days 00:24:44.890079,2024-03-27 18:09:42.628304+00:00,2024-03-27 18:09:44.890079+00:00,2024-03-27 17:45:00,704.4,1289.6,C
1343,CAR217,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:24:47.654472,0 days 00:24:49.853420,2024-03-27 18:09:47.654472+00:00,2024-03-27 18:09:49.853420+00:00,2024-03-27 17:45:00,791.1,1311.3,C
1344,CAR217,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:25:10.146571,0 days 00:25:12.157038,2024-03-27 18:10:10.146571+00:00,2024-03-27 18:10:12.157038+00:00,2024-03-27 17:45:00,715.2,1332.9,C


# Step 2: Positive annotations

Filter out the positive annotations

In [5]:
df_pos_filter = annotation_df[annotation_df['Annotation'].isin(POSITIVE_CLASS)]
display(df_pos_filter)

Unnamed: 0,LocationName,SourceFile,AnnotationType,RelativeStartTime,RelativeEndTime,StartTime,EndTime,FileStartTime,LowFreq,HighFreq,Annotation
0,CAR204,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:26:33.889389,0 days 00:26:35.170793,2024-03-23 14:25:33.889389+00:00,2024-03-23 14:25:35.170793+00:00,2024-03-23 13:59:00,830.3,1284.1,C
1,CAR204,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:26:35.547677,0 days 00:26:36.716016,2024-03-23 14:25:35.547677+00:00,2024-03-23 14:25:36.716016+00:00,2024-03-23 13:59:00,714.0,1190.0,C
2,CAR204,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:26:39.504955,0 days 00:26:40.673295,2024-03-23 14:25:39.504955+00:00,2024-03-23 14:25:40.673295+00:00,2024-03-23 13:59:00,714.0,1223.2,C
3,CAR204,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:26:44.140624,0 days 00:26:45.685847,2024-03-23 14:25:44.140624+00:00,2024-03-23 14:25:45.685847+00:00,2024-03-23 13:59:00,708.5,1217.7,C
4,CAR204,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:26:48.324032,0 days 00:26:49.680814,2024-03-23 14:25:48.324032+00:00,2024-03-23 14:25:49.680814+00:00,2024-03-23 13:59:00,669.7,1289.7,C
...,...,...,...,...,...,...,...,...,...,...,...
1341,CAR217,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:24:39.926739,0 days 00:24:41.560243,2024-03-27 18:09:39.926739+00:00,2024-03-27 18:09:41.560243+00:00,2024-03-27 17:45:00,639.4,1343.8,C
1342,CAR217,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:24:42.628304,0 days 00:24:44.890079,2024-03-27 18:09:42.628304+00:00,2024-03-27 18:09:44.890079+00:00,2024-03-27 17:45:00,704.4,1289.6,C
1343,CAR217,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:24:47.654472,0 days 00:24:49.853420,2024-03-27 18:09:47.654472+00:00,2024-03-27 18:09:49.853420+00:00,2024-03-27 17:45:00,791.1,1311.3,C
1344,CAR217,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:25:10.146571,0 days 00:25:12.157038,2024-03-27 18:10:10.146571+00:00,2024-03-27 18:10:12.157038+00:00,2024-03-27 17:45:00,715.2,1332.9,C


In [6]:
def load_clip(filename:str,start_offset,end_offset,cliplen):
    """
    Given a filename, return a wavefile and sampling rate
    
    - filename: the wavefile to load
    - start_offset: the offset time in seconds according to the annotation
    - end_offset: the file offset time in seconds according to the annotation
    - cliplen: the desired length of the returned wavefile.
    * If the length between start_offset and end_offset is less than clip_len, we start at start_offset and pad to clip_len
    * If the length between start_offset and end_offset is greater than clip_len, we start at start_offset and bound to clip_len
    
    Returns:
    - wavefile samples
    - sampling rate
    - cutstatus: 
    * True: the annotation has been cut
    * False: The annotation is complete, with padding
    """
    rate,data =wavfile.read(filename)
    annotationLen = end_offset-start_offset
    sampleStart = int(rate*start_offset)
    sampleEnd = sampleStart+int(cliplen*rate)
    dataclipped = data[sampleStart:sampleEnd]
    return dataclipped,rate,(annotationLen>cliplen)



In [7]:
def annotation_file_to_wavfile(annotation_file,basepath):
    """Convert an annotation file to the corresponding wavefile"""
    af = os.path.split(annotation_file)[1]
    wavefile = af.split('.')[0]+'.wav'
    full_filename = os.path.join(basepath,wavefile)
    return full_filename


In [8]:
if MAX_LIMIT is None:
    PositiveTargetLimit = len(df_pos_filter)
else:
    PositiveTargetLimit = MAX_LIMIT
cut_count = 0
for k in range(PositiveTargetLimit):
    # get the annotation
    annotation = df_pos_filter.iloc[k]
    # map to audio filename
    ff = annotation_file_to_wavfile(annotation.SourceFile,data_directory)
    # get the start/end times
    dtStart = annotation.RelativeStartTime.total_seconds()
    dtEnd = annotation.RelativeEndTime.total_seconds()
    # pull out the clip
    d,r,c = load_clip(ff,dtStart,dtEnd,clip_length)
    # save
    strk = f"{POSITIVE_PREFIX}_{k:06d}.wav"
    outfilename = os.path.join(output_directory,strk)
    wavfile.write(outfilename,r,d)
    # update number of annotations that have been cut/truncated
    if c: cut_count+=1
print(cut_count)

36


# Step 3: Confounder annotations

Pull out samples that have been marked as being similar enough to ask as confounders/confuse the classifier.

In [9]:
df_con_filter = annotation_df[annotation_df['Annotation'].isin(CONFOUNDING_CLASS)]
display(df_con_filter)
if MAX_LIMIT is None:
    PositiveTargetLimit = len(df_con_filter)
else:
    PositiveTargetLimit = MAX_LIMIT
cut_count = 0
for k in range(PositiveTargetLimit):
    # get the annotation
    annotation = df_con_filter.iloc[k]
    # map to audio filename
    ff = annotation_file_to_wavfile(annotation.SourceFile,data_directory)
    # get the start/end times
    dtStart = annotation.RelativeStartTime.total_seconds()
    dtEnd = annotation.RelativeEndTime.total_seconds()
    # pull out the clip
    d,r,c = load_clip(ff,dtStart,dtEnd,clip_length)
    # save
    strk = f"{CONFOUNDING_PREFIX}_{k:06d}.wav"
    outfilename = os.path.join(output_directory,strk)
    wavfile.write(outfilename,r,d)
    # update number of annotations that have been cut/truncated
    if c: cut_count+=1
print(cut_count)

Unnamed: 0,LocationName,SourceFile,AnnotationType,RelativeStartTime,RelativeEndTime,StartTime,EndTime,FileStartTime,LowFreq,HighFreq,Annotation
23,CAR204,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:07:40.680845,0 days 00:07:42.791393,2024-03-25 08:05:40.680845+00:00,2024-03-25 08:05:42.791393+00:00,2024-03-25 07:58:00,1015.4,1406.5,O
24,CAR204,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:27:01.633504,0 days 00:27:03.581144,2024-03-25 17:12:01.633504+00:00,2024-03-25 17:12:03.581144+00:00,2024-03-25 16:45:00,1014.8,3096.3,O
25,CAR204,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:27:07.287943,0 days 00:27:08.921447,2024-03-25 17:12:07.287943+00:00,2024-03-25 17:12:08.921447+00:00,2024-03-25 16:45:00,941.3,3084.0,O
26,CAR204,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:27:11.057569,0 days 00:27:12.879554,2024-03-25 17:12:11.057569+00:00,2024-03-25 17:12:12.879554+00:00,2024-03-25 16:45:00,892.3,3047.3,O
27,CAR204,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:27:15.078503,0 days 00:27:16.712007,2024-03-25 17:12:15.078503+00:00,2024-03-25 17:12:16.712007+00:00,2024-03-25 16:45:00,782.1,3120.8,O
...,...,...,...,...,...,...,...,...,...,...,...
982,CAR217,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:07:35.378038,0 days 00:07:35.890233,2024-03-25 17:52:35.378038+00:00,2024-03-25 17:52:35.890233+00:00,2024-03-25 17:45:00,937.5,2501.6,O
983,CAR217,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:07:39.609755,0 days 00:07:40.073170,2024-03-25 17:52:39.609755+00:00,2024-03-25 17:52:40.073170+00:00,2024-03-25 17:45:00,1031.3,2230.3,O
1008,CAR217,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:40:44.487790,0 days 00:40:45.463399,2024-03-27 12:25:44.487790+00:00,2024-03-27 12:25:45.463399+00:00,2024-03-27 11:45:00,1194.3,1726.3,O
1009,CAR217,C:\Users\Amogh\OneDrive - University of Cambri...,Acoustic,0 days 00:40:47.902420,0 days 00:40:48.951200,2024-03-27 12:25:47.902420+00:00,2024-03-27 12:25:48.951200+00:00,2024-03-27 11:45:00,1199.5,1736.8,O


3


# Step 4: Random negative annotations

These are clips that are taken at random points in the files where no annotations exist. This is useful for sampling arbitrary background noise.

In [10]:
# random seed
np.random.seed(100)
neg_count = 0
neg_target = int(len(df_pos_filter)*NEG_POS_BALANCE_RATIO)
neg_array = []

while (neg_count <= neg_target):
    # Step 1 - pick a random annotation (can be anything)
    sampleIdx = np.random.randint(len(annotation_df))
    positive_annotation = annotation_df.iloc[sampleIdx]
    
    # Step 2 - find all other annotations within the same file
    posSourceFile = positive_annotation.SourceFile
    similar_events_df = annotation_df.query("SourceFile == @posSourceFile" )
    print("Annotations in file",len(similar_events_df))
    
    # Step 3: Load the file (so we can see how long it is)
    # map to audio filename
    ff = annotation_file_to_wavfile(annotation.SourceFile,data_directory)
    r,data = wavfile.read(ff)
    wavLen = len(data)/r
    
    # Step 4: Pick a random starting time
    rStart = datetime.timedelta(seconds = np.random.uniform(low=0,high=wavLen))
    rEnd = rStart +  datetime.timedelta(seconds=clip_length)
    print(rStart,rEnd)
    
    # Step 5: Check for overlap
    overlapDf = similar_events_df.query("SourceFile == @posSourceFile and RelativeStartTime>=@rStart and RelativeStartTime <=@rEnd")
    if (len(overlapDf)> 0):
        print("overlap")
        # there was an overlap - restart the loop
        continue
        
    # Step 6: pull out the clip
    d = data[int(rStart.total_seconds()*r):int(rEnd.total_seconds()*r)]

    # ignore if clip is not of correct length (3 seconds)
    if len(d) < 3*r:
        continue
    
    # save
    strk = f"{NEGATIVE_PREFIX}_{neg_count:06d}.wav"
    outfilename = os.path.join(output_directory,strk)
    wavfile.write(outfilename,r,d)

    # record where this clip came from
    neg_record = {}
    neg_record["index"]=k
    neg_record["wavfile"]=strk
    neg_record['fullfilename']=outfilename
    neg_record["sourceFile"]=posSourceFile
    neg_record["startOffset"]=rStart
    neg_record["endOffset"]=rEnd
    neg_array.append(neg_record)
    neg_count+=1

# save the negative dataframe
negDf =pd.DataFrame(neg_array)
negDf.to_csv("negative_dataset.csv")

Annotations in file 175
0:12:00.036932 0:12:03.036932
Annotations in file 16
0:36:25.118218 0:36:28.118218
Annotations in file 243
0:06:45.352641 0:06:48.352641
Annotations in file 45
0:28:54.975681 0:28:57.975681
Annotations in file 175
0:05:53.608546 0:05:56.608546
Annotations in file 243
0:02:44.718914 0:02:47.718914
overlap
Annotations in file 185
0:22:33.703836 0:22:36.703836
overlap
Annotations in file 138
0:09:28.274807 0:09:31.274807
Annotations in file 25
0:07:24.746750 0:07:27.746750
Annotations in file 175
0:13:25.353489 0:13:28.353489
Annotations in file 138
0:18:36.656402 0:18:39.656402
Annotations in file 175
0:27:40.521942 0:27:43.521942
Annotations in file 243
0:07:33.720890 0:07:36.720890
Annotations in file 43
0:07:42.033965 0:07:45.033965
Annotations in file 8
0:10:52.932064 0:10:55.932064
Annotations in file 57
0:06:07.469712 0:06:10.469712
Annotations in file 45
0:25:48.982651 0:25:51.982651
Annotations in file 185
0:04:31.977527 0:04:34.977527
overlap
Annotations 