VAD audio chunking (#135)
* added audio chunker, added energy based vad, added tests

* fixed compilation

* fixed compilation

* extracted prepareSeekClips function

* review changes

* Support chunking VAD for paths

* Updates from review

* Support clip timestamps with vad

* fix compilation error

* PR review and cleanup

* Fix test normalization order

* UI and qol tweaks for example app

* Fix test normalization

* Reduce accuracy requirement for vad chunker

* Fix example app sidebar visibility

* Further test normalization fixes

---------

Co-authored-by: ZachNagengast <znagengast@gmail.com>
jkrukowski and ZachNagengast committed May 23, 2024
1 parent d180062 commit 09aa70b
Showing 18 changed files with 1,248 additions and 298 deletions.
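
As context for the diffs that follow, here is a minimal sketch of how the chunking API added in this commit might be driven end to end. Only VADAudioChunker, AudioChunk, and updateSeekOffsetsForResults come from this diff; the transcribeChunk closure, the 30-second maximum chunk length, and the driver function itself are illustrative assumptions rather than code from the commit.

import WhisperKit

// Hypothetical driver for the new VAD-based chunking flow (not part of this commit).
// `transcribeChunk` stands in for whatever per-chunk transcription call is used.
@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
func transcribeInChunks(
    audioArray: [Float],
    transcribeChunk: ([Float]) async throws -> [TranscriptionResult]
) async throws -> [TranscriptionResult] {
    let chunker = VADAudioChunker()

    // Assumed maximum chunk length: 30 s of 16 kHz audio
    let maxChunkLength = WhisperKit.sampleRate * 30

    // Split the audio, preferring the middle of the longest silence near each boundary
    let audioChunks = try await chunker.chunkAll(
        audioArray: audioArray,
        maxChunkLength: maxChunkLength,
        decodeOptions: nil
    )

    // Transcribe each chunk independently, keeping per-chunk successes and failures
    var chunkedResults = [Result<[TranscriptionResult], Swift.Error>]()
    for chunk in audioChunks {
        do {
            chunkedResults.append(.success(try await transcribeChunk(chunk.audioSamples)))
        } catch {
            chunkedResults.append(.failure(error))
        }
    }

    // Shift each segment's timings from chunk-relative back to the full-file timeline
    return chunker.updateSeekOffsetsForResults(
        chunkedResults: chunkedResults,
        audioChunks: audioChunks
    )
}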
7 changes: 4 additions & 3 deletions Examples/WhisperAX/WhisperAX.xcodeproj/project.pbxproj
@@ -893,14 +893,15 @@
LD_RUNPATH_SEARCH_PATHS = "@executable_path/Frameworks";
"LD_RUNPATH_SEARCH_PATHS[sdk=macosx*]" = "@executable_path/../Frameworks";
MACOSX_DEPLOYMENT_TARGET = 14.0;
MARKETING_VERSION = 0.2.0;
PRODUCT_BUNDLE_IDENTIFIER = com.argmax.whisperkit.WhisperAX;
MARKETING_VERSION = 0.3.0;
PRODUCT_BUNDLE_IDENTIFIER = "com.argmax.whisperkit.WhisperAX${DEVELOPMENT_TEAM}";
PRODUCT_NAME = "$(TARGET_NAME)";
SDKROOT = auto;
SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx";
SUPPORTS_MACCATALYST = NO;
SUPPORTS_XR_DESIGNED_FOR_IPHONE_IPAD = YES;
SWIFT_EMIT_LOC_STRINGS = YES;
SWIFT_OPTIMIZATION_LEVEL = "-Onone";
SWIFT_VERSION = 5.0;
TARGETED_DEVICE_FAMILY = "1,2";
};
@@ -938,7 +939,7 @@
LD_RUNPATH_SEARCH_PATHS = "@executable_path/Frameworks";
"LD_RUNPATH_SEARCH_PATHS[sdk=macosx*]" = "@executable_path/../Frameworks";
MACOSX_DEPLOYMENT_TARGET = 14.0;
MARKETING_VERSION = 0.2.0;
MARKETING_VERSION = 0.3.0;
PRODUCT_BUNDLE_IDENTIFIER = com.argmax.whisperkit.WhisperAX;
PRODUCT_NAME = "$(TARGET_NAME)";
SDKROOT = auto;
262 changes: 191 additions & 71 deletions Examples/WhisperAX/WhisperAX/Views/ContentView.swift

Large diffs are not rendered by default.

23 changes: 5 additions & 18 deletions Examples/WhisperAX/WhisperAXWatchApp/WhisperAXExampleView.swift
@@ -580,24 +580,11 @@ struct WhisperAXWatchView: View {
}

if useVAD {
// Retrieve the current relative energy values from the audio processor
let currentRelativeEnergy = whisperKit.audioProcessor.relativeEnergy

// Calculate the number of energy values to consider based on the duration of the next buffer
// Each energy value corresponds to 1 buffer length (100ms of audio), hence we divide by 0.1
let energyValuesToConsider = Int(nextBufferSeconds / 0.1)

// Extract the relevant portion of energy values from the currentRelativeEnergy array
let nextBufferEnergies = currentRelativeEnergy.suffix(energyValuesToConsider)

// Determine the number of energy values to check for voice presence
// Considering up to the last 1 second of audio, which translates to 10 energy values
let numberOfValuesToCheck = max(10, nextBufferEnergies.count - 10)

// Check if any of the energy values in the considered range exceed the silence threshold
// This indicates the presence of voice in the buffer
let voiceDetected = nextBufferEnergies.prefix(numberOfValuesToCheck).contains { $0 > Float(silenceThreshold) }

let voiceDetected = AudioProcessor.isVoiceDetected(
in: whisperKit.audioProcessor.relativeEnergy,
nextBufferInSeconds: nextBufferSeconds,
silenceThreshold: Float(silenceThreshold)
)
// Only run the transcribe if the next buffer has voice
guard voiceDetected else {
await MainActor.run {
110 changes: 110 additions & 0 deletions Sources/WhisperKit/Core/AudioChunker.swift
@@ -0,0 +1,110 @@
// For licensing see accompanying LICENSE.md file.
// Copyright © 2024 Argmax, Inc. All rights reserved.

import Accelerate
import AVFoundation
import Foundation

/// Responsible for chunking audio into smaller pieces
@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
public protocol AudioChunking {
func chunkAll(audioArray: [Float], maxChunkLength: Int, decodeOptions: DecodingOptions?) async throws -> [AudioChunk]
}

@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
public extension AudioChunking {
func updateSeekOffsetsForResults(
chunkedResults: [Result<[TranscriptionResult], Swift.Error>],
audioChunks: [AudioChunk]
) -> [TranscriptionResult] {
var updatedTranscriptionResults = [TranscriptionResult]()
for (index, chunkedResult) in chunkedResults.enumerated() {
switch chunkedResult {
case let .success(results):
let seekTime = Float(audioChunks[index].seekOffsetIndex) / Float(WhisperKit.sampleRate)
for result in results {
var updatedSegments = [TranscriptionSegment]()
for segment in result.segments {
let updatedSegment = updateSegmentTimings(segment: segment, seekTime: seekTime)
updatedSegments.append(updatedSegment)
}
var updatedResult = result
updatedResult.seekTime = seekTime
updatedResult.segments = updatedSegments
updatedTranscriptionResults.append(updatedResult)
}
case let .failure(error):
Logging.debug("Error transcribing chunk \(index): \(error)")
}
}
return updatedTranscriptionResults
}
}

/// An audio chunker that splits audio into smaller pieces based on voice activity detection
@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
open class VADAudioChunker: AudioChunking {
/// prevent hallucinations at the end of the clip by stopping up to 1.0s early
private let windowPadding: Int
private let vad = EnergyVAD()

init(windowPadding: Int = 16000) {
self.windowPadding = windowPadding
}

private func splitOnMiddleOfLongestSilence(audioArray: [Float], startIndex: Int, endIndex: Int) -> Int {
// NOTE: we want to check just the 2nd part for the silence to attempt to get closest to a max length chunk
let audioMidIndex = startIndex + (endIndex - startIndex) / 2
let vadAudioSlice = Array(audioArray[audioMidIndex..<endIndex])
let voiceActivity = vad.voiceActivity(in: vadAudioSlice)
if let silence = vad.findLongestSilence(in: voiceActivity) {
// if silence is detected we take the middle point of the silent chunk
let silenceMidIndex = silence.startIndex + (silence.endIndex - silence.startIndex) / 2
return audioMidIndex + vad.voiceActivityIndexToAudioSampleIndex(silenceMidIndex)
}
return endIndex
}

public func chunkAll(audioArray: [Float], maxChunkLength: Int, decodeOptions: DecodingOptions?) async throws -> [AudioChunk] {
// If the audio array length is less than or equal to maxLength, return it as a single chunk
if audioArray.count <= maxChunkLength {
return [AudioChunk(seekOffsetIndex: 0, audioSamples: audioArray)]
}

// First create chunks from seek clips
let seekClips = prepareSeekClips(contentFrames: audioArray.count, decodeOptions: decodeOptions)

var chunkedAudio = [AudioChunk]()
for (seekClipStart, seekClipEnd) in seekClips {
// Loop through the current clip until we reach the end
// Typically this will be the full audio file, unless seek points are explicitly provided
var startIndex = seekClipStart
while startIndex < seekClipEnd - windowPadding {
let currentFrameLength = startIndex - seekClipStart
if startIndex >= currentFrameLength, startIndex < 0 {
throw WhisperError.audioProcessingFailed("startIndex is outside the buffer size")
}

// Make sure we still need chunking for this seek clip, otherwise use the original seek clip end
var endIndex = seekClipEnd
if startIndex + maxChunkLength < endIndex {
// Adjust the end index based on VAD
endIndex = splitOnMiddleOfLongestSilence(
audioArray: audioArray,
startIndex: startIndex,
endIndex: min(audioArray.count, startIndex + maxChunkLength)
)
}

guard endIndex > startIndex else {
break
}
Logging.debug("Found chunk from \(formatTimestamp(Float(startIndex) / Float(WhisperKit.sampleRate))) to \(formatTimestamp(Float(endIndex) / Float(WhisperKit.sampleRate)))")
let audioSlice = AudioChunk(seekOffsetIndex: startIndex, audioSamples: Array(audioArray[startIndex..<endIndex]))
chunkedAudio.append(audioSlice)
startIndex = endIndex
}
}
return chunkedAudio
}
}
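
The chunker above relies on an EnergyVAD type whose source is not shown in this excerpt. As a rough, self-contained sketch of the contract the chunker appears to assume (100 ms voice-activity frames at 16 kHz, one boolean per frame, and a frame-index-to-sample-index mapping), something like the following could be built on the calculateVoiceActivityInChunks helper added to AudioProcessor.swift below. The frame length, threshold, and method bodies are assumptions for illustration, not the shipped implementation.

import WhisperKit

// Sketch of the EnergyVAD surface that VADAudioChunker calls into; names mirror the
// calls above, but the implementation here is assumed, not taken from this commit.
@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
struct EnergyVADSketch {
    let sampleRate = 16_000
    let frameLengthSamples = 1_600 // assumed: 100 ms of audio per voice-activity frame

    // One Bool per frame: true where the frame's average energy exceeds the threshold
    func voiceActivity(in waveform: [Float], energyThreshold: Float = 0.022) -> [Bool] {
        let chunkCount = (waveform.count + frameLengthSamples - 1) / frameLengthSamples
        return AudioProcessor.calculateVoiceActivityInChunks(
            of: waveform,
            chunkCount: chunkCount,
            frameLengthSamples: frameLengthSamples,
            energyThreshold: energyThreshold
        )
    }

    // Longest run of consecutive silent frames, returned as a frame-index range
    func findLongestSilence(in voiceActivity: [Bool]) -> (startIndex: Int, endIndex: Int)? {
        var longest: (startIndex: Int, endIndex: Int)?
        var runStart: Int?
        for (index, isVoice) in voiceActivity.enumerated() {
            if !isVoice {
                if runStart == nil { runStart = index }
            } else if let start = runStart {
                if longest == nil || index - start > longest!.endIndex - longest!.startIndex {
                    longest = (start, index)
                }
                runStart = nil
            }
        }
        if let start = runStart,
           longest == nil || voiceActivity.count - start > longest!.endIndex - longest!.startIndex {
            longest = (start, voiceActivity.count)
        }
        return longest
    }

    // Map a voice-activity frame index back to an audio sample index
    func voiceActivityIndexToAudioSampleIndex(_ index: Int) -> Int {
        index * frameLengthSamples
    }
}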
95 changes: 86 additions & 9 deletions Sources/WhisperKit/Core/AudioProcessor.swift
@@ -64,7 +64,7 @@ public protocol AudioProcessing {

/// Stops recording and cleans up resources
func stopRecording()

/// Resume recording audio from the specified input device, appending to continuous `audioArray` after pause
func resumeRecordingLive(inputDeviceID: DeviceID?, callback: (([Float]) -> Void)?) throws
}
@@ -74,7 +74,7 @@ public extension AudioProcessing {
func startRecordingLive(inputDeviceID: DeviceID? = nil, callback: (([Float]) -> Void)?) throws {
try startRecordingLive(inputDeviceID: inputDeviceID, callback: callback)
}

func resumeRecordingLive(inputDeviceID: DeviceID? = nil, callback: (([Float]) -> Void)?) throws {
try resumeRecordingLive(inputDeviceID: inputDeviceID, callback: callback)
}
@@ -157,7 +157,7 @@ public extension AudioProcessing {

@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
public class AudioProcessor: NSObject, AudioProcessing {
private var lastInputDevice:DeviceID?
private var lastInputDevice: DeviceID?
public var audioEngine: AVAudioEngine?
public var audioSamples: ContiguousArray<Float> = []
public var audioEnergy: [(rel: Float, avg: Float, max: Float, min: Float)] = []
@@ -292,6 +292,84 @@

// MARK: - Utility

/// Detect voice activity in the given buffer of relative energy values.
/// - Parameters:
/// - relativeEnergy: relative energy values
/// - nextBufferInSeconds: duration of the next buffer in seconds
/// - energyValuesToConsider: number of energy values to consider
/// - silenceThreshold: silence threshold
/// - Returns: true if voice is detected, false otherwise
public static func isVoiceDetected(
in relativeEnergy: [Float],
nextBufferInSeconds: Float,
silenceThreshold: Float
) -> Bool {
// Calculate the number of energy values to consider based on the duration of the next buffer
// Each energy value corresponds to 1 buffer length (100ms of audio), hence we divide by 0.1
let energyValuesToConsider = Int(nextBufferInSeconds / 0.1)

// Extract the relevant portion of energy values from the currentRelativeEnergy array
let nextBufferEnergies = relativeEnergy.suffix(energyValuesToConsider)

// Determine the number of energy values to check for voice presence
// Considering up to the last 1 second of audio, which translates to 10 energy values
let numberOfValuesToCheck = max(10, nextBufferEnergies.count - 10)

// Check if any of the energy values in the considered range exceed the silence threshold
// This indicates the presence of voice in the buffer
return nextBufferEnergies.prefix(numberOfValuesToCheck).contains { $0 > silenceThreshold }
}

/// Calculate non-silent chunks of an audio.
/// - Parameter signal: audio signal
/// - Returns: an array of tuples indicating the start and end indices of non-silent chunks
public static func calculateNonSilentChunks(
in signal: [Float]
) -> [(startIndex: Int, endIndex: Int)] {
EnergyVAD().calculateActiveChunks(in: signal)
}

/// Calculate voice activity in chunks of an audio based on energy threshold.
/// - Parameters:
/// - signal: Audio signal
/// - chunkCount: Number of chunks
/// - frameLengthSamples: Frame length in samples
/// - frameOverlapSamples: frame overlap in samples, this is helpful to catch large energy values at the very end of a frame
/// - energyThreshold: Energy threshold for silence detection, default is 0.05. Chunks with energy below this threshold are considered silent.
/// - Returns: An array of booleans indicating whether each chunk is non-silent
public static func calculateVoiceActivityInChunks(
of signal: [Float],
chunkCount: Int,
frameLengthSamples: Int,
frameOverlapSamples: Int = 0,
energyThreshold: Float = 0.022
) -> [Bool] {
var chunkEnergies = [Float]()
for chunkIndex in 0..<chunkCount {
let startIndex = chunkIndex * frameLengthSamples
let endIndex = min(startIndex + frameLengthSamples + frameOverlapSamples, signal.count)
let chunk = Array(signal[startIndex..<endIndex])
let avgEnergy = calculateAverageEnergy(of: chunk)
chunkEnergies.append(avgEnergy)
}

let vadResult = chunkEnergies.map { $0 > energyThreshold }

return vadResult
}

/// Calculate average energy of a signal chunk.
/// - Parameter signal: Chunk of audio signal.
/// - Returns: Average (RMS) energy of the signal chunk.
public static func calculateAverageEnergy(of signal: [Float]) -> Float {
var rmsEnergy: Float = 0.0
vDSP_rmsqv(signal, 1, &rmsEnergy, vDSP_Length(signal.count))
return rmsEnergy
}

/// Calculate energy of a signal chunk.
/// - Parameter signal: Chunk of audio signal.
/// - Returns: Tuple containing average (RMS energy), maximum, and minimum values.
public static func calculateEnergy(of signal: [Float]) -> (avg: Float, max: Float, min: Float) {
var rmsEnergy: Float = 0.0
var minEnergy: Float = 0.0
@@ -310,7 +310,7 @@ public class AudioProcessor: NSObject, AudioProcessing {
}

public static func calculateRelativeEnergy(of signal: [Float], relativeTo reference: Float?) -> Float {
let signalEnergy = calculateEnergy(of: signal).avg
let signalEnergy = calculateAverageEnergy(of: signal)

// Make sure reference is greater than 0
// Default 1e-3 measured empirically in a silent room
@@ -595,19 +673,18 @@ public extension AudioProcessor {

// Set the callback
audioBufferCallback = callback

lastInputDevice = inputDeviceID
}

func resumeRecordingLive(inputDeviceID: DeviceID? = nil, callback: (([Float]) -> Void)? = nil) throws {
try? setupAudioSessionForDevice()
if inputDeviceID == lastInputDevice{

if inputDeviceID == lastInputDevice {
try audioEngine?.start()
} else {
audioEngine = try setupEngine(inputDeviceID: inputDeviceID)
}


// Set the callback only if the provided callback is not nil
if let callback = callback {
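
A short usage sketch of the static VAD helpers added above, as streaming code might call them. The one-second buffer duration is an example value, and the 0.3 threshold mirrors the silenceThreshold default used by AudioStreamTranscriber below; the function itself is illustrative and not part of this commit.

import WhisperKit

// Example: gate the next transcription pass on the energy-based VAD helpers.
// `audioProcessor` is assumed to be actively recording; the values are illustrative.
@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
func shouldTranscribeNextBuffer(audioProcessor: AudioProcessor) -> Bool {
    let nextBufferSeconds: Float = 1.0 // audio accumulated since the last transcription pass
    let silenceThreshold: Float = 0.3  // relative-energy threshold for treating a frame as voice

    return AudioProcessor.isVoiceDetected(
        in: audioProcessor.relativeEnergy,
        nextBufferInSeconds: nextBufferSeconds,
        silenceThreshold: silenceThreshold
    )
}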
25 changes: 6 additions & 19 deletions Sources/WhisperKit/Core/AudioStreamTranscriber.swift
@@ -50,7 +50,7 @@ public actor AudioStreamTranscriber {
decodingOptions: DecodingOptions,
requiredSegmentsForConfirmation: Int = 2,
silenceThreshold: Float = 0.3,
compressionCheckWindow: Int = 20,
compressionCheckWindow: Int = 60,
useVAD: Bool = true,
stateChangeCallback: AudioStreamTranscriberCallback?
) {
@@ -139,24 +139,11 @@
}

if useVAD {
// Retrieve the current relative energy values from the audio processor
let currentRelativeEnergy = audioProcessor.relativeEnergy

// Calculate the number of energy values to consider based on the duration of the next buffer
// Each energy value corresponds to 1 buffer length (100ms of audio), hence we divide by 0.1
let energyValuesToConsider = Int(nextBufferSeconds / 0.1)

// Extract the relevant portion of energy values from the currentRelativeEnergy array
let nextBufferEnergies = currentRelativeEnergy.suffix(energyValuesToConsider)

// Determine the number of energy values to check for voice presence
// Considering up to the last 1 second of audio, which translates to 10 energy values
let numberOfValuesToCheck = max(10, nextBufferEnergies.count - 10)

// Check if any of the energy values in the considered range exceed the silence threshold
// This indicates the presence of voice in the buffer
let voiceDetected = nextBufferEnergies.prefix(numberOfValuesToCheck).contains { $0 > Float(silenceThreshold) }

let voiceDetected = AudioProcessor.isVoiceDetected(
in: audioProcessor.relativeEnergy,
nextBufferInSeconds: nextBufferSeconds,
silenceThreshold: silenceThreshold
)
// Only run the transcribe if the next buffer has voice
if !voiceDetected {
Logging.debug("No voice detected, skipping transcribe")