# ML004.01: Annotation Aggregation

This script ingests .txt files containing Raven annotations of chital deer and assembles them into a single pandas DataFrame for dataset extraction and analysis.



# Imports

In [7]:
import pandas as pd
import os
import datetime
from zoneinfo import ZoneInfo # we sometimes get weird timezones if we use pytz
import pickle

# Parameters

In [15]:
# Directories that are searched recursively for annotation files.
# Alternative roots that have been used with this notebook:
#root_paths = ["C:\\CloudData\\2024\\Nepal\\N001\\Annotations"]
#root_paths = ["C:\\Users\\Amogh\\OneDrive - University of Cambridge\\Programming-New\\CaracalChitalDetector\\data\\Test set\\1 hour files"]
root_paths = [
    r"C:\Users\Amogh\OneDrive - University of Cambridge\Programming-New\CaracalChitalDetector\cnn\annotations",
]

# Only files with exactly this extension are treated as annotation files.
input_extension = ".txt"

# strptime format of the datecode embedded in each input filename.
TimeFormat = '%Y%m%d$%H%M%S'

# Timezone the filename datecode is interpreted in.
# NOTE(review): for the sample file CAR204_20240323$135900_1711181640 the
# third field, read as a Unix epoch, is 2024-03-23 08:14:00 UTC, i.e. 5h45m
# behind the 13:59:00 datecode. That suggests the datecode may be local time
# rather than UTC - TODO confirm before relying on absolute times.
annotation_timezone = "UTC"

# Timezone every absolute timestamp is converted to (UTC for consistency).
output_timezone = "UTC"

# Constant label written to the AnnotationType column of every row.
annotation_type = "Acoustic"

# Pickle written by this notebook; contains only 2024 files.
outputfile = "AcousticAnnotations001.pb"

# Combined pickle holding 2023 and 2024 data.
outputfile_all = "all_annotations.pb"

# Step 1:
Crawl the root directory (or directories) and collect every individual annotation file. We assume that any .txt file is a Raven annotation file.

In [16]:
# Recursively walk every configured root and collect the full path of each
# file whose extension matches input_extension exactly (case-sensitive).
result = []
for root_path in root_paths:
    for dirpath, _subdirs, filenames in os.walk(root_path):
        result.extend(
            os.path.join(dirpath, name)
            for name in filenames
            if os.path.splitext(name)[1] == input_extension
        )
print("loaded",len(result),"annotation files")

loaded 26 annotation files


In [17]:
# Sanity check on the first discovered file: show its path, the
# underscore-separated fields in its base name (everything before the first
# "."), and the datetime parsed from the second field.
full_filename = result[0]
print(full_filename)

basename = os.path.basename(full_filename)
stem = basename.split(".")[0]
fields = stem.split("_")
print(fields)

# fields[1] is the datecode; it must match TimeFormat.
filedt = datetime.datetime.strptime(fields[1], TimeFormat)
print(filedt)

C:\Users\Amogh\OneDrive - University of Cambridge\Programming-New\CaracalChitalDetector\cnn\annotations\CAR204_20240323$135900_1711181640.Table.1.selections.txt
['CAR204', '20240323$135900', '1711181640']
2024-03-23 13:59:00


# Step 2:

Parse each file and concatenate its rows onto a common dataframe. We need to take into account:

1. The datetime of the source file. This is needed because all annotation times within a file are relative to the start of that file.
2. The station ID. This gives us the (semantic) location of the extracted sounds. 

We then export the whole pandas dataframe to a pickled object so we can query/parse it rapidly in the future

In [18]:
# Parse every annotation file into a flat list of row dicts, then export the
# combined table as a pickle (outputfile) and as a CSV with the same stem.
#
# A file is skipped entirely (with a message) when its name does not match
# <station>_<datecode>_<third field> or when it cannot be read. A single row
# is skipped (with a message) when its values cannot be parsed or when the
# file has no recognised label column.

# Column names under which the label has been seen, in priority order:
# the first one present in a file is used for every row of that file.
annotation_columns = ('Annotation', 'Annotations', 'Annotate', 'Species', 'type')

# Fixed column order of the output table. Passing it to the DataFrame
# constructor guarantees the columns exist even when no rows were collected.
output_columns = [
    'LocationName', 'SourceFile', 'AnnotationType',
    'RelativeStartTime', 'RelativeEndTime',
    'StartTime', 'EndTime', 'FileStartTime',
    'LowFreq', 'HighFreq', 'Annotation',
]

# The timezones are configuration, not per-file data, so resolve them once.
# A bad timezone name now fails here instead of being reported as an
# "Invalid file pattern" for every file.
source_tz = ZoneInfo(annotation_timezone)
target_tz = ZoneInfo(output_timezone)

dictlist = []

for full_filename in result:

    # Expected base name: <station>_<datecode>_<third field>.<anything>
    try:
        fields = os.path.split(full_filename)[1].split(".")[0].split("_")
        station_id = fields[0]
        filestrdt = fields[1]
        # Not used below, but a missing third field marks a malformed name.
        # Presumably a Unix epoch of the recording start - TODO confirm.
        filebasetime = fields[2]
        filedt = datetime.datetime.strptime(filestrdt, TimeFormat)
    except (IndexError, ValueError):
        print("Invalid file pattern",full_filename)
        # Skip the file: carrying on would reuse the previous file's
        # station and start time for this file's rows.
        continue

    # Interpret the datecode in the annotation timezone, then convert it to
    # the output timezone; row offsets are added to this instant.
    localdt = filedt.replace(tzinfo=source_tz)
    rootdt = localdt.astimezone(target_tz)

    try:
        df = pd.read_csv(full_filename,sep="\t")
    except (OSError, ValueError) as err:
        # pandas parser/empty-data errors and decode errors are ValueErrors.
        print("Issue with reading file:",full_filename,err)
        # Skip the file: carrying on would re-add the previous file's rows.
        continue

    # Resolve the label column once per file rather than once per row.
    annotation_column = next(
        (name for name in annotation_columns if name in df.columns), None
    )

    for idx,row in df.iterrows():
        if annotation_column is None:
            print("Skipping row",idx,row,full_filename)
            continue
        try:
            # float() normalises whatever scalar type pandas hands back
            # before it reaches timedelta.
            begin_offset = datetime.timedelta(seconds=float(row['Begin Time (s)']))
            end_offset = datetime.timedelta(seconds=float(row['End Time (s)']))
            rowdict = {
                'LocationName': station_id,
                'SourceFile': full_filename,
                'AnnotationType': annotation_type,
                'RelativeStartTime': begin_offset,
                'RelativeEndTime': end_offset,
                'StartTime': rootdt + begin_offset,
                'EndTime': rootdt + end_offset,
                # Kept timezone-naive, exactly as parsed from the filename.
                'FileStartTime': filedt,
                'LowFreq': row['Low Freq (Hz)'],
                'HighFreq': row['High Freq (Hz)'],
                # str() turns a missing label into the string 'nan'.
                'Annotation': str(row[annotation_column]),
            }
        except (KeyError, TypeError, ValueError, OverflowError):
            print("Issue with parsing row",idx,row,full_filename)
            continue
        dictlist.append(rowdict)

# we can now turn our list of dicts back into a dataframe
outputdf = pd.DataFrame(dictlist, columns=output_columns)
# and export it into nice formats that can easily be reloaded/parsed
print("retrieved a total of ",len(outputdf),"annotations")
with open(outputfile,'wb') as handle:
    pickle.dump(outputdf,handle)

# The CSV sits next to the pickle and shares its stem
# ("AcousticAnnotations001.pb" -> "AcousticAnnotations001.csv").
csv_outputfile = os.path.splitext(outputfile)[0] + ".csv"
outputdf.to_csv(csv_outputfile)

retrieved a total of  1346 annotations


# Step 3: Reload and verify

Reload the annotation dataframe to check that it is correct and useful for downstream tasks

In [19]:
# Round-trip check: read the pickle written in Step 2 back into a dataframe
# and report how many annotations it contains.
with open(outputfile, 'rb') as pickled:
    reloadDF = pickle.load(pickled)
print(f"Loaded dataframe with {len(reloadDF)} annotations.")

Loaded dataframe with 1346 annotations.


# Step 4: Distribution of annotations

Here we go through the whole set of annotations and count how many times each annotation label occurs.

In [20]:
import pprint
from collections import Counter

# Tally how often each label occurs in the Annotation column and show it.
annotation_list = reloadDF['Annotation'].tolist()
counts = Counter()
counts.update(annotation_list)
pprint.pprint(counts)

Counter({'C': 852, 'O': 409, 'M': 82, 'nan': 2, '#': 1})
