In [None]:
!pip install soundfile
!pip install pydub

import os
import sys
import h5py
import ast
import time
from tqdm import tqdm

import numpy as np
import pandas as pd

import keras
from keras.models import load_model
import tensorflow as tf

## Connect to google drive

In [None]:
# Run this cell to mount to google drive if using Google Colab.
from google.colab import drive

drive.mount('/content/drive')
os.chdir('/content/drive/')

In [3]:
# Auto re-load utils from util py files
%load_ext autoreload
%autoreload 2

In [None]:
# Load functions in util files
%cd "path/to/py/util/folder/"
!ls # verify in the correct folder

from dataset_process_utils import *
from classification_utils import *

## Load model

In [5]:
model_filepath = "path/to/trained/model.hdf5"

In [None]:
model = None
keras.backend.clear_session()
model = load_model(model_filepath)
print("model loaded:", model_filepath)

## Save audio to pickle for future predictions

In [None]:
pkl_folder = "path/to/pkl/folder/to/store/sample/to/detect/"

sound_folder = "path/to/audio/mp3/folder/" # jesper ridge folder path
temp_folder = "temp_wav/"

if not os.path.exists(pkl_folder):
  print("Creating folder:", pkl_folder)
  os.mkdir(pkl_folder)

max_freq = 2500
sec_used = 1
typeUsed = [b'Anaxyrus-boreas',b'Pseudacris-sierra',b'Rana-boylii', b'Rana-catesbeiana', b'Rana-draytonii', b'noise-or-background']

In [None]:
day_folders = os.listdir(sound_folder)
processed_days = os.listdir(pkl_folder)
processed_days = [x.split(".")[0].split("_")[0] for x in processed_days]
print(day_folders[:10], len(day_folders))
print(processed_days[:10], len(processed_days))
day_folders_todo = list(set(day_folders) - set(processed_days))
print(day_folders_todo[:10], len(day_folders_todo))

In [None]:
day_folders_2021= [x for x in day_folders_todo if "2021" in x]
print(day_folders_2021[:5], len(day_folders_2021))

['2021-06-19', '2021-10-09', '2021-12-02', '2021-01-31', '2021-07-13'] 113


In [None]:
index = 0
limit_hour = 19 # process one hour at a time to avoide OOM
print("Max frequency:", max_freq)
print("Collecting samples for hour:", limit_hour)

for day_folder in day_folders_2021:
  print("Processing day:", day_folder, index)
  index += 1
  mp3_files = os.listdir(sound_folder + day_folder)

  # Use only 19:xx so far
  mp3_files= [x for x in mp3_files if f"{limit_hour}:" in x]
  print(mp3_files[:10], len(mp3_files))
  
  if len(mp3_files) == 0:
    print("No sound file match criteria, continue to next")
    continue

  day_path_to_store = pkl_folder + day_folder + f"{limit_hour}.pkl"
  folder_start_time = time.time()

  frames = []
  for mp3_file in mp3_files:
    mp3_path = sound_folder + day_folder + "/" + mp3_file
    # Retrieve spectrogram and related info
    spec_to_predict, cur_intervals = retrieve_spec_from_mp3(
        mp3_path, temp_folder, max_freq, sec_used, log_scale = False
    )
    file_names_to_dump = np.repeat(mp3_path, len(spec_to_predict))
    
    cur_df = pd.DataFrame(
        [
            file_names_to_dump,
            cur_intervals,
            spec_to_predict
        ]
    ).T
    cur_df = cur_df.set_axis(
        [
            "file_path",
            "interval",
            "spectrogram",
        ],
        axis=1,
        copy=False,
    )
    frames.append(cur_df)

  # Save the whole day
  day_df = pd.concat(frames)
  day_df.to_pickle(day_path_to_store)  
  del day_df
  print("Processed day with time:", time.time() - folder_start_time, day_path_to_store)

## Predict with processed pickle files

In [None]:
pkl_folder = "jr_spec_pkl_2022_nolog_freq2500/"
temp_folder = "temp_wav/"
result_folder = "results_" + model_filepath.split("/")[-1].split(".")[0] + "/"

if not os.path.exists(result_folder):
  print("Creating folder:", result_folder)
  os.mkdir(result_folder)
print(result_folder)

max_freq = 2500
sec_used = 1
typeUsed = [b'Anaxyrus-boreas',b'Pseudacris-sierra',b'Rana-boylii', b'Rana-catesbeiana', b'Rana-draytonii', b'noise-or-background']

In [None]:
days_to_predict = [x.split(".")[0] for x in os.listdir(pkl_folder)]
# processed_days = os.listdir(result_folder)
subfolders= [f.path for f in os.scandir(result_folder) if f.is_dir()]
processed_days = [y for x in subfolders for y in os.listdir(x)] +  os.listdir(result_folder)
processed_days = [x.split(".")[0] for x in processed_days]
print("All days to predicts", days_to_predict[:5], len(days_to_predict))
print("predicted days", processed_days[:5], len(processed_days))
day_pickle_todo = list(set(days_to_predict) - set(processed_days))
print("Days to predict", day_pickle_todo[:5], len(day_pickle_todo))

# Filter to work on days
day_pickle_todo= [x + ".pkl" for x in day_pickle_todo if "2022" in x]
print("Days to predict", day_pickle_todo[:5], len(day_pickle_todo))

In [None]:
# index = 0
for day_pickle in tqdm(day_pickle_todo):
  # print("Processing pickle:", day_pickle, index)
  
  # Load data from pickle
  pickle_filepath = pkl_folder + day_pickle
  try:
    unpickled_df = pd.read_pickle(pickle_filepath)
  except EOFError:
    print("EOF Error, skipping:", pickle_filepath)
    continue
  
  intervals = np.array(unpickled_df["interval"])
  specs = np.array(unpickled_df["spectrogram"].tolist())
  specs = np.array(normalize(specs))
  file_paths = np.array(unpickled_df["file_path"])
  
  # Retrieve spectrogram, predict, and retrieve related info
  day_df = predict_mp3(file_paths, temp_folder, model, typeUsed, max_freq,
                       sec_used, spec_to_predict = specs,
                       cur_intervals = intervals, repeat = False)

  # Store to subfolders with month
  month_folder = result_folder + day_pickle[:7] + "/"
  if not os.path.exists(month_folder):
    print("Creating folder:", month_folder)
    os.mkdir(month_folder)
  day_result = month_folder + day_pickle.split(".")[0] + ".csv"
  day_df.to_csv(day_result, sep='\t', index = False)
  # print("Processed day with time:", time.time() - pickle_start_time, day_result)

  del intervals
  del specs
  del unpickled_df