In [11]:
import numpy as np
import soundfile as sf
import noisereduce as nr
import librosa
import birdnetlib
import os 
import matplotlib.pyplot as plt
import pandas as pd
import shutil
from collections import Counter
from tensorflow.lite.python.interpreter import Interpreter
from pprint import pprint 
import pydub 
import scipy 

In [12]:
from birdnetlib import Recording
from birdnetlib.analyzer import Analyzer
from birdnetlib.batch import DirectoryMultiProcessingAnalyzer
from birdnetlib.species import SpeciesList
from birdnetlib.watcher import DirectoryWatcher
from datetime import datetime 
import re 
import csv 

In [13]:
from xenopy import Query 

# q = Query(cnt="Cambodia", box="11.403694,105.398278,11.406554,105.396771")
# q.retrieve_meta(verbose=True) 
# q.retrieve_recordings(outdir="output/xeno_canto", multiprocess=True, nproc=10)

In [14]:
# Load the deployment metadata and turn its day-first "Date" strings into datetimes.
metadata_df = pd.read_csv("metadata.csv").assign(
    Date=lambda frame: pd.to_datetime(frame["Date"], dayfirst=True)
)

def get_file_metadata(filename, df):
    """Find the metadata row that belongs to a recording file.

    The file name has to contain a ``_YYYYMMDD_HHMMSS`` timestamp and a device
    code shaped like ``<digits>_S<digits>``. A recording stamped before 06:00
    is looked up under the following calendar day.

    Args:
        filename: Name of the recording file.
        df: Metadata frame with "Device Code", "Date" (datetime), "Latitude",
            "Longitude" and "Time" columns.

    Returns:
        ``(latitude, longitude, time)`` taken from the first matching row, or
        ``(None, None, None)`` when the name cannot be parsed or nothing matches.
    """
    not_found = (None, None, None)

    stamp = re.search(r'_(\d{8})_(\d{6})', filename)
    if stamp is None:
        return not_found

    day_part, clock_part = stamp.group(1), stamp.group(2)
    dt = datetime.strptime(day_part + clock_part, "%Y%m%d%H%M%S")

    # Early-morning files (before 06:00) are matched against the next day's row.
    recording_date = dt.date()
    if dt.hour < 6:
        recording_date = recording_date + pd.Timedelta(days=1)

    device = re.search(r'(\d+_S\d+)', filename)  # update regex !
    if device is None:
        return not_found

    same_device = df["Device Code"] == device.group(1)
    same_day = df["Date"].dt.date == recording_date
    hits = df[same_device & same_day]

    if hits.empty:
        return not_found

    first_hit = hits.iloc[0]  # positional lookup: first matching row wins
    return first_hit["Latitude"], first_hit["Longitude"], first_hit["Time"]


In [15]:
def clear_directory(dir_path):
    """Reset ``dir_path`` to an empty directory.

    An existing directory at ``dir_path`` is deleted together with everything
    inside it and then recreated. Missing parent directories are created too,
    so a nested path works the first time it is used.

    Args:
        dir_path: Path of the directory to empty (or create).
    """
    if os.path.exists(dir_path):
        shutil.rmtree(dir_path)  # removes the directory itself, not only the files in it
    # makedirs instead of mkdir: mkdir raises FileNotFoundError when a parent is missing
    os.makedirs(dir_path, exist_ok=True)

In [16]:
def extract_file_date(filename):
    """Parse the recording timestamp embedded in a file name.

    The name must contain ``YYYYMMDD`` and ``HHMMSS`` separated by ``_`` or ``-``.

    Args:
        filename: File name (or path) to search.

    Returns:
        A ``datetime`` built from the first timestamp found in ``filename``.

    Raises:
        ValueError: If no timestamp is present, or if the digits do not form a
            valid date/time.
    """
    pattern = r"(\d{8})[_-](\d{6})" # search for pattern in file name of year/month/day_hour/minute/second
    match = re.search(pattern, filename)
    if match is None:
        # Without this guard the failure surfaces as an opaque AttributeError on None.
        raise ValueError(f"no YYYYMMDD_HHMMSS timestamp found in file name: {filename!r}")
    date_digits, time_digits = match.groups()
    return datetime.strptime(date_digits + time_digits, "%Y%m%d%H%M%S")

In [17]:
def print_file_detections(recording):
    """Print a recording's path and date, then its predicted and detected species.

    The predicted list comes from ``SpeciesList.return_list`` at a 0.75
    threshold, using the latitude/longitude that ``get_file_metadata`` finds
    for the recording's file name in ``metadata_df``.

    Args:
        recording: Object exposing ``path``, ``date`` and ``detections``.
    """
    print("\n" + recording.path)
    pprint(recording.date)

    print("predicted species: \n")
    latitude, longitude, _ = get_file_metadata(os.path.basename(recording.path), metadata_df)
    expected_species = SpeciesList().return_list(
        lon=longitude,
        lat=latitude,
        date=recording.date,
        threshold=0.75,
    )
    pprint(expected_species)

    print("detected species: \n")
    pprint(recording.detections)

In [None]:
def _load_scientific_names(list_path):
    """Return the lower-cased scientific names listed in a species list file.

    Every non-blank line is split at its first underscore and the part before
    it is kept; lines without an underscore are ignored.
    """
    names = set()
    with open(list_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            parts = line.split("_", 1)  # split into scientific and common name
            if len(parts) == 2:
                names.add(parts[0].strip().lower())
    return names


def analyse_file():
    """Analyse every recording in ``aru/`` with three analysers and export the best detections.

    Each ``.wav``/``.mp3`` file is analysed with the default analyser and with
    the "full_species" and "main_species" custom-list analysers at a minimum
    confidence of 0.95. For every (common name, start time rounded to 2 dp)
    pair the highest-confidence detection is kept. When that winner came from
    the default analyser it is replaced by the detection of the same species
    and segment from a custom-list analyser, preferring main over full
    (main > full > default). Audio clips, spectrograms and one CSV row per
    kept detection are written under ``extractions/high_conf_<timestamp>/``.

    Files whose name holds no parseable timestamp are reported and skipped.

    Returns:
        Path of the detections CSV written for this run.
    """
    min_conf = 0.95

    analysers = {
        "default" : Analyzer(),
        "full_species" : Analyzer(custom_species_list_path="species_lists/full_species_list.txt"),
        "main_species" : Analyzer(custom_species_list_path="species_lists/main_species_list.txt"),
    }

    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    export_directory = f"extractions/high_conf_{timestamp}"
    os.makedirs(export_directory, exist_ok=True) # set exist_ok = true to suppress error even if directory already exists

    csv_path = os.path.join(export_directory, f"detections_{timestamp}.csv") # put csv of detections in same directory as corresponding extractions
    csv_headers = [
        "file_name", "best_analyzer", "common_name", "scientific_name",
        "confidence", "start_time", "end_time", "audio_path",
        "spectrogram_path", "lat", "lon", "date", "day_or_night", "in_full_species_list"
    ]

    # scientific names only, used for filtering
    full_species_set = _load_scientific_names("species_lists/full_species_list.txt")
    main_species_set = _load_scientific_names("species_lists/main_species_list.txt")

    # Order matters: the first analyser with a usable detection wins (main > full).
    preferred_analysers = (
        ("main_species", main_species_set),
        ("full_species", full_species_set),
    )

    with open(csv_path, mode="w", newline="", encoding="utf-8") as csv_file: # w = write mode
        writer = csv.DictWriter(csv_file, fieldnames=csv_headers)
        writer.writeheader() # writes header row to csv

        file_directory = "aru"

        file_paths = [os.path.join(file_directory, f) for f in os.listdir(file_directory)
            if f.lower().endswith((".wav", ".mp3"))] # store full path for every file in aru directory

        for file_path in file_paths:
            file_name = os.path.basename(file_path)
            try:
                file_date = extract_file_date(file_name)
            except (AttributeError, ValueError) as e:
                # One badly named file must not abort the whole batch.
                print(f"skipping {file_name}: could not read a timestamp from the file name ({e})")
                continue
            lat, lon, day_or_night = get_file_metadata(file_name, metadata_df)
            print(f"\nanalysing {file_name} ({day_or_night}) at lat {lat} and lon {lon}")

            file_detections = [] # association list of all detections in a file with what analyser they used

            for analyser_name, analyser in analysers.items():
                try:
                    recording_kwargs = {
                        "analyzer": analyser,
                        "path": file_path,
                        "date": file_date,
                        "min_conf": min_conf,
                        "return_all_detections": False,
                    }
                    if not analyser.has_custom_species_list: # analyser with custom species list cannot use lat, lon
                        recording_kwargs["lat"] = lat
                        recording_kwargs["lon"] = lon
                    recording = Recording(**recording_kwargs)

                    recording.analyze()
                    for detection in recording.detections:
                        detection["analyser"] = analyser_name
                        detection["recording"] = recording
                        file_detections.append(detection)
                except Exception as e:
                    print(f"analysis of {file_name} using {analyser_name} failed with: {e}")

            # Highest-confidence detection per (species, rounded start time).
            best_file_detections = {}
            for detection in file_detections:
                species_name = detection.get("common_name")
                species_detected = (species_name, round(detection.get("start_time"), 2)) # round detection start time to 2 decimal places
                if (species_detected not in best_file_detections) or (detection["confidence"] > best_file_detections[species_detected]["confidence"]):
                    best_file_detections[species_detected] = detection # if this detection has the highest confidence value for this snippet, store this

            # Prefer a custom-list analyser's detection over a winning "default" one.
            # The search stops at the first preferred analyser that has a match, so a
            # full_species hit can no longer overwrite a main_species hit, and the
            # candidate must describe the same segment as the entry it replaces.
            for species_key, detection in list(best_file_detections.items()):
                if detection["analyser"] != "default":
                    continue
                species_name, start_key = species_key

                replacement = None
                for preferred_name, allowed_names in preferred_analysers:
                    for candidate in file_detections:
                        if candidate["analyser"] != preferred_name:
                            continue
                        if candidate.get("common_name") != species_name:
                            continue
                        if round(candidate.get("start_time"), 2) != start_key:
                            continue
                        sci = candidate.get("scientific_name", "").strip().lower()
                        if sci not in allowed_names:
                            continue
                        replacement = candidate
                        break
                    if replacement is not None:
                        break

                if replacement is not None:
                    best_file_detections[species_key] = replacement

            for detection in best_file_detections.values(): # extracting (species, start time) for highest confidence clips
                scientific_name_clean = detection.get("scientific_name", "").strip().lower()
                common_name_clean = detection.get("common_name", "").strip()

                # Detections outside the full list are still exported; the flag is recorded in the CSV.
                in_full_species_list = scientific_name_clean in full_species_set
                if not in_full_species_list:
                    print(f"including {common_name_clean} ({scientific_name_clean} not in full species list)")
                else:
                    print(f"keeping {common_name_clean} [{detection.get('scientific_name', '')}]")

                recording = detection["recording"]
                species_name = re.sub(r"[^a-zA-Z0-9_]", "_", detection.get("common_name")) # if the common name has weird things in it, replace with _

                species_directory = os.path.join(export_directory, species_name)
                os.makedirs(species_directory, exist_ok=True) # make directories to store species detections by file name

                try:
                    # NOTE(review): these calls act on the whole recording, so they presumably
                    # export every detection >= min_conf of that recording into this species'
                    # directory, not only the current detection -- confirm against birdnetlib.
                    recording.extract_detections_as_audio(directory=species_directory, format="mp3", min_conf=min_conf, padding_secs=2)
                    recording.extract_detections_as_spectrogram(directory=species_directory, min_conf=min_conf, padding_secs=2)

                    writer.writerow({
                        "file_name" : file_name,
                        "best_analyzer" : detection["analyser"],
                        "common_name" : detection["common_name"],
                        "scientific_name" : detection["scientific_name"],
                        "confidence" : detection["confidence"],
                        "start_time" : detection["start_time"],
                        "end_time" : detection["end_time"],
                        "audio_path" : species_directory,  # directory holding the clips, not a single file
                        "spectrogram_path" : species_directory,
                        "lat" : lat,
                        "lon" : lon,
                        "date" : file_date.strftime("%Y-%m-%d"),
                        "day_or_night": day_or_night,
                        "in_full_species_list" : in_full_species_list,
                    })
                except Exception as e:
                    print(f"extraction of {file_name} using {detection['analyser']} failed with: {e}")

    return csv_path

In [19]:
def analyse_openbills():
    """Analyse every recording in ``aru/`` with the Asian Openbill species list.

    Each ``.wav``/``.mp3`` file is analysed with a custom-species-list analyser
    at a minimum confidence of 0.1. Audio clips and spectrograms are exported
    under ``extractions/high_conf_<timestamp>/<species>/`` and one CSV row is
    written per detection.

    Files whose name holds no parseable timestamp, or whose analysis fails,
    are reported and skipped so the rest of the batch still runs.

    Returns:
        Path of the detections CSV written for this run.
    """
    min_conf = 0.1
    analyser_name = "openbill"

    analyser = Analyzer(custom_species_list_path="species_lists/asian_openbill_list.txt")

    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    export_directory = f"extractions/high_conf_{timestamp}"
    os.makedirs(export_directory, exist_ok=True) # set exist_ok = true to suppress error even if directory already exists

    csv_path = os.path.join(export_directory, f"detections_{timestamp}.csv") # put csv of detections in same directory as corresponding extractions
    csv_headers = [
        "file_name", "best_analyzer", "common_name", "scientific_name",
        "confidence", "start_time", "end_time", "audio_path",
        "spectrogram_path", "lat", "lon", "date", "day_or_night", "in_full_species_list"
    ]

    with open(csv_path, mode="w", newline="", encoding="utf-8") as csv_file: # w = write mode
        writer = csv.DictWriter(csv_file, fieldnames=csv_headers)
        writer.writeheader() # writes header row to csv

        file_directory = "aru"

        file_paths = [os.path.join(file_directory, f) for f in os.listdir(file_directory)
            if f.lower().endswith((".wav", ".mp3"))] # store full path for every file in aru directory

        for file_path in file_paths:
            file_name = os.path.basename(file_path)
            try:
                file_date = extract_file_date(file_name)
            except (AttributeError, ValueError) as e:
                # One badly named file must not abort the whole batch.
                print(f"skipping {file_name}: could not read a timestamp from the file name ({e})")
                continue
            lat, lon, day_or_night = get_file_metadata(file_name, metadata_df)
            print(f"\nanalysing {file_name} ({day_or_night}) at lat {lat} and lon {lon}")

            try:
                recording = Recording(
                    analyzer=analyser,
                    path=file_path,
                    date=file_date,
                    min_conf=min_conf,
                    return_all_detections=False
                )
                recording.analyze()
            except Exception as e:
                print(f"analysis of {file_name} using {analyser_name} failed with: {e}")
                continue

            # The extract_* calls operate on the recording as a whole, so they are run
            # once per (recording, species directory) rather than once per detection.
            extracted_directories = set()

            for detection in recording.detections:
                detection["analyser"] = analyser_name

                species_name = re.sub(r"[^a-zA-Z0-9_]", "_", detection.get("common_name")) # if the common name has weird things in it, replace with _

                species_directory = os.path.join(export_directory, species_name)
                os.makedirs(species_directory, exist_ok=True) # make directories to store species detections by file name

                try:
                    if species_directory not in extracted_directories:
                        recording.extract_detections_as_audio(directory=species_directory, format="mp3", min_conf=min_conf, padding_secs=2)
                        recording.extract_detections_as_spectrogram(directory=species_directory, min_conf=min_conf, padding_secs=2)
                        # Only marked done on success, so a failed export is retried for the next detection.
                        extracted_directories.add(species_directory)

                    writer.writerow({
                        "file_name" : file_name,
                        "best_analyzer" : detection["analyser"],
                        "common_name" : detection["common_name"],
                        "scientific_name" : detection["scientific_name"],
                        "confidence" : detection["confidence"],
                        "start_time" : detection["start_time"],
                        "end_time" : detection["end_time"],
                        "audio_path" : species_directory,  # directory holding the clips, not a single file
                        "spectrogram_path" : species_directory,
                        "lat" : lat,
                        "lon" : lon,
                        "date" : file_date.strftime("%Y-%m-%d"),
                        "day_or_night": day_or_night,
                        "in_full_species_list" : "true",  # literal string, kept as-is for existing CSV consumers
                    })
                except Exception as e:
                    print(f"extraction of {file_name} using {analyser_name} failed with: {e}")

    return csv_path

In [20]:
# Run one analysis pipeline; both pipelines return the path of the detections CSV they wrote.
# The multi-analyser run is left commented out as the alternative entry point.
#csv_path = analyse_file()
csv_path = analyse_openbills()
print(csv_path)  # show where this run's CSV was written

Labels loaded.
load model True
Model loaded.
Labels loaded.
load_species_list_model
Meta model loaded.
Anastomus oscitans_Asian Openbill

Ardea oscitans_Asian Openbill
2 species loaded.

analysing 3_S7901_20250204_134500_UTC_7_.wav (8:00 am to 5:00 pm) at lat 11.403694 and lon 105.398278
read_audio_data


    TF 2.20. Please use the LiteRT interpreter from the ai_edge_litert package.
    See the [migration guide](https://ai.google.dev/edge/litert/migration)
    for details.
    


read_audio_data: complete, read  200 chunks.
analyze_recording 3_S7901_20250204_134500_UTC_7_.wav

analysing 3_S7901_20250204_064500_UTC_7_.wav (8:00 am to 5:00 pm) at lat 11.403694 and lon 105.398278
read_audio_data
read_audio_data: complete, read  200 chunks.
analyze_recording 3_S7901_20250204_064500_UTC_7_.wav

analysing 4_S7902_20250204_120000_UTC_7_.wav (8:00 am to 5:00 pm) at lat 11.403694 and lon 105.398278
read_audio_data
read_audio_data: complete, read  200 chunks.
analyze_recording 4_S7902_20250204_120000_UTC_7_.wav

analysing 5_S7903_20250204_121500__0700_.wav (8:00 am to 5:00 pm) at lat 11.403694 and lon 105.398278
read_audio_data
read_audio_data: complete, read  200 chunks.
analyze_recording 5_S7903_20250204_121500__0700_.wav

analysing 4_S7902_20250205_003000_UTC_7_.wav (4:30 pm to 5:30 pm) at lat 11.403694 and lon 105.398278
read_audio_data
read_audio_data: complete, read  200 chunks.
analyze_recording 4_S7902_20250205_003000_UTC_7_.wav

analysing 1_S7899_20250204_044500