# WhisperSeg Application to Rose's Canary Data

First install the required packages. This isn't an exhaustive list. For the exhaustive list, please refer to "requirements.txt" in this repo.

In [2]:
!pip install transformers ctranslate2 ipywidgets tqdm scipy numpy librosa matplotlib



In [3]:
import librosa
import numpy as np
from model import WhisperSegmenterFast
from audio_utils import WhisperSegFeatureExtractor
import matplotlib.pyplot as plt
from scipy.io.wavfile import write
from tqdm import tqdm
import os

 Now we will define the class that will be responsible for the processing of audio files. The main functions of this class are as follows:

 1. segment_song: This will apply WhisperSeg to the audio file and will return the samples in the raw audio that correspond to song.

 2. silencer: The samples of the raw wav file that correspond to noise (detected from the segment_song function above) will be replaced with silences. This returns a wav file with the noise replaced with silence
 
 3. create_sonogram: create a sonogram of wav file returned from "silencer"

In [None]:
class Segmenter:
    '''
    Groups the WhisperSeg settings and the helpers used to clean up a batch of wav files.

    The three helpers are meant to be chained per file:
    segment_song (find song regions) -> silencer (zero everything else) -> create_sonogram.
    '''

    def __init__(self, sr, segmenter, feature_extractor, min_frequency, spec_time_step, min_segment_length, eps, num_trials, wavfiles) -> None:
        '''
        Contains arguments necessary for WhisperSeg segmentation. For more details please refer to the readme of this repo

        sr: sampling rate used when loading audio and when converting detected times to sample indices
        segmenter: object exposing a .segment(audio, sr=..., ...) method that returns a dict with 'onset' and 'offset' entries
        feature_extractor: callable used by create_sonogram
        min_frequency, spec_time_step, min_segment_length, eps, num_trials: forwarded unchanged to segmenter.segment
        wavfiles: list of wav file paths this instance is responsible for
        '''
        self.sr = sr
        self.segmenter = segmenter
        self.feature_extractor = feature_extractor
        self.min_frequency = min_frequency
        self.spec_time_step = spec_time_step
        self.min_segment_length = min_segment_length
        self.eps = eps
        self.num_trials = num_trials

        self.wavfiles = wavfiles

        # Will contain the detected song's onset times and offset times (parts of the wav file that correspond to song)
        # NOTE(review): nothing in this class fills these lists; they are kept for callers that may read them.
        self.onset_list = []
        self.offset_list = []

    def segment_song(self, wavfile_path):
        '''
        Runs WhisperSeg on the audio file to determine the samples that correspond to song.

        wavfile_path: path of the audio file; it is loaded with librosa at self.sr.

        Returns a tuple (onsets, offsets). Each is prediction['onset'] / prediction['offset']
        wrapped in one extra outer list level and multiplied by self.sr, i.e. the detected
        boundaries expressed in samples (assuming the segmenter reports them in seconds).
        '''
        audio, _ = librosa.load(wavfile_path, sr=self.sr)

        # The audio was loaded at self.sr, so the segmenter must be told that same rate.
        # (Relying on a module-level `sr` here would silently ignore the rate given to __init__.)
        prediction = self.segmenter.segment(
            audio,
            sr=self.sr,
            min_frequency=self.min_frequency,
            spec_time_step=self.spec_time_step,
            min_segment_length=self.min_segment_length,
            eps=self.eps,
            num_trials=self.num_trials,
        )

        # Keep the extra outer dimension the previous implementation produced; silencer flattens it.
        onsets = np.array([prediction['onset']])
        offsets = np.array([prediction['offset']])

        return self.sr * onsets, self.sr * offsets

    def silencer(self, wavfile, samples_onsets, samples_offsets):
        '''
        Takes a noisy audio array and keeps only the regions delimited by samples_onsets and samples_offsets.
        All samples outside of these regions are replaced with zeros (silences).

        wavfile: the audio samples as an array (not a path)
        samples_onsets, samples_offsets: region boundaries in samples; any shape, they are flattened and paired in order

        Returns a new array with the same shape and dtype as wavfile.
        '''

        # Flatten the arrays to ensure they are one-dimensional
        samples_onsets = np.array(samples_onsets).flatten()
        samples_offsets = np.array(samples_offsets).flatten()

        # Create an array of silence with the same shape and data type as the input wavfile
        silenced_audio = np.zeros_like(wavfile)

        # Copy detected regions to the silenced array
        for start, end in zip(samples_onsets, samples_offsets):
            # Explicitly convert start and end to integers to avoid indexing errors
            # (the boundaries arrive as floats; int() truncates toward zero)
            start_idx = int(start)
            end_idx = int(end)
            silenced_audio[start_idx:end_idx] = wavfile[start_idx:end_idx]

        return silenced_audio

    def create_sonogram(self, audio):
        '''
        Creates a sonogram for the given audio array.

        Calls self.feature_extractor with sampling_rate=self.sr and padding="do_not_pad"
        and returns the first element of its "input_features" entry.
        '''
        sonogram = self.feature_extractor(audio, sampling_rate=self.sr, padding="do_not_pad")["input_features"][0]

        return sonogram


Now let's apply this class to real data

In [1]:
# Segmentation settings. They are handed to the Segmenter class, which forwards
# them to the WhisperSeg segmenter (see the repo readme for their meaning).
sr = 32_000                 # sampling rate used when loading the audio
min_frequency = 0
spec_time_step = 1e-3
min_segment_length = 5e-3
eps = 1e-2
num_trials = 3

We will use the version of WhisperSeg finetuned on canaries

In [None]:
# Load the canary-finetuned WhisperSeg checkpoint; inference runs on the CPU.
segmenter = WhisperSegmenterFast(
    "nccratliri/whisperseg-canary-ct2",
    device="cpu",
)

More parameter initialization (for more details refer to the readme). We will also define the WhisperSeg feature extractor, which is used to create the sonograms.

In [None]:
# Default values used below when building the feature extractor
window_size, spec_width = 15, 1000
min_frequency, max_frequency = 0, None


In [None]:
# Build the feature extractor from the sampling rate, the ratio
# window_size / spec_width, and the frequency bounds defined above.
feature_extractor = WhisperSegFeatureExtractor(
    sr,
    window_size / spec_width,
    min_frequency,
    max_frequency,
)


Rose's data is structured first by Bird ID and then by day of data acquisition. We will extract the bird identities and the days of acquisition, and then apply WhisperSeg to the wav files recorded on each day.

In [None]:
# Directory that the processing loop starts from (here: the folder for bird USA5207).
# NOTE(review): the loop below treats every entry inside this directory as a bird folder
# that in turn contains day folders -- confirm that this matches the layout on disk.
bird_paths = '/Users/AnanyaKapoor/Volumes/Extreme SSD/USA5207'


In [None]:
# First let's extract the full filepaths of each bird folder.
# Only directories are kept: stray files (for example macOS ".DS_Store") would make the
# os.listdir() calls further down fail. Entries are sorted so the run order is reproducible.
bird_filepaths = [
    os.path.join(bird_paths, filename)
    for filename in sorted(os.listdir(bird_paths))
    if os.path.isdir(os.path.join(bird_paths, filename))
]


# For each wav file we will apply WhisperSeg, extract the cleaned up audio file, and then create a sonogram from that audio file.
for bird_path in bird_filepaths:
    # I first want to extract the days of data acquisition (for example, day 38, 39 from USA5207)
    all_days = bird_path

    day_filepaths = [
        os.path.join(all_days, filename)
        for filename in sorted(os.listdir(all_days))
        if os.path.isdir(os.path.join(all_days, filename))
    ]

    # Extract bird name: the last component of the path (normpath drops any trailing separator)
    bird_name = os.path.basename(os.path.normpath(all_days))

    os.makedirs(f'png_files/{bird_name}', exist_ok=True) # Will save images of the original wav file sonogram and the sonogram of the cleaned up audio file (used for visual inspection mostly)
    os.makedirs(f'new_wav_files/{bird_name}', exist_ok=True) # Will save the new wav files. That way you can create new sonograms with different signal processing parameters.

    for day_path in day_filepaths:
        # Extract each wav file filepath from each day folder.
        # Hidden entries and anything that is not a .wav file are ignored so that
        # librosa is never asked to decode a non-audio file.
        filepaths = [
            os.path.join(day_path, filename)
            for filename in sorted(os.listdir(day_path))
            if filename.lower().endswith('.wav') and not filename.startswith('.')
        ]

        # If there are no wav files in a folder then just skip it.
        if not filepaths:
            continue

        # The day label is needed for every file that is written, so compute it once per day
        # instead of only inside the plotting branch.
        day_value = os.path.basename(os.path.normpath(day_path))
        png_dir = f'png_files/{bird_name}/day_{day_value}'
        wav_dir = f'new_wav_files/{bird_name}/day_{day_value}'
        os.makedirs(png_dir, exist_ok=True)
        os.makedirs(wav_dir, exist_ok=True)

        # Define the segmenter (will process wav files within each day for a particular bird. Example: Will process all wav files from Day 38 of Bird USA5207)
        seg_jawn = Segmenter(sr = sr, segmenter = segmenter, feature_extractor=feature_extractor, min_frequency=min_frequency, spec_time_step=spec_time_step, min_segment_length= min_segment_length, eps = eps, num_trials=num_trials, wavfiles= filepaths)

        # For each wav file
        for i, audio_path in enumerate(tqdm(seg_jawn.wavfiles)):
            # File name (without directories) of the current recording
            wav_name = os.path.basename(audio_path)

            # Load in the wav file
            audio, _ = librosa.load(audio_path, sr = sr )
            # Apply WhisperSeg to the data to extract the samples of the raw wav file that correspond to detected song
            sample_onsets, sample_offsets = seg_jawn.segment_song(audio_path)
            # Replace all the values for the samples in detected not song with 0s (silences)
            silenced_audio = seg_jawn.silencer(audio, sample_onsets, sample_offsets)

            # Plotting (for every 50th wav file)
            if i % 50 == 0:
                # The sonograms are only needed for the figure, so they are only computed here.
                # Create a sonogram from the original wav file
                orig_spec = seg_jawn.create_sonogram(audio)
                # Create a sonogram from the cleaned up audio file (silenced_audio)
                silenced_spec = seg_jawn.create_sonogram(silenced_audio)

                fig, axes = plt.subplots(2, 1, figsize=(10, 8))

                # Original audio spectrogram
                axes[0].imshow(orig_spec, origin='lower', cmap='viridis')
                axes[0].set_title('Original Audio Spectrogram')
                axes[0].set_xlabel('Time')
                axes[0].set_ylabel('Frequency Bin')

                # Silenced audio spectrogram
                axes[1].imshow(silenced_spec, origin='lower', cmap='viridis')
                axes[1].set_title('Spectrogram from Audio with Noise Removed')
                axes[1].set_xlabel('Time')
                axes[1].set_ylabel('Frequency Bin')

                plt.tight_layout()

                # Same base name as the recording, with the extension swapped for .png
                song_name = os.path.splitext(wav_name)[0] + '.png'

                plt.savefig(f'{png_dir}/{song_name}')
                plt.close()

            # SAVE THE WAV FILES (cleaned audio, same file name as the original recording)
            write(f'{wav_dir}/{wav_name}', sr, silenced_audio)
