VAD audio chunking (#135)
* added audio chunker, added energy based vad, added tests

* fixed compilation

* fixed compilation

* extracted prepareSeekClips function

* review changes

* Support chunking VAD for paths

* Updates from review

* Support clip timestamps with vad

* fix compilation error

* PR review and cleanup

* Fix test normalization order

* UI and qol tweaks for example app

* Fix test normalization

* Reduce accuracy requirement for vad chunker

* Fix example app sidebar visibility

* Further test normalization fixes

---------

Co-authored-by: ZachNagengast <znagengast@gmail.com>
jkrukowski and ZachNagengast committed May 23, 2024
1 parent d180062 commit 09aa70b
Showing 18 changed files with 1,248 additions and 298 deletions.
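
As context for the diffs that follow, here is a minimal sketch of how the chunking API added in this commit might be driven end to end. Only VADAudioChunker, AudioChunk, and updateSeekOffsetsForResults come from this diff; the transcribeChunk closure, the 30-second maximum chunk length, and the driver function itself are illustrative assumptions rather than code from the commit.

import WhisperKit

// Hypothetical driver for the new VAD-based chunking flow (not part of this commit).
// `transcribeChunk` stands in for whatever per-chunk transcription call is used.
@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
func transcribeInChunks(
    audioArray: [Float],
    transcribeChunk: ([Float]) async throws -> [TranscriptionResult]
) async throws -> [TranscriptionResult] {
    let chunker = VADAudioChunker()

    // Assumed maximum chunk length: 30 s of 16 kHz audio
    let maxChunkLength = WhisperKit.sampleRate * 30

    // Split the audio, preferring the middle of the longest silence near each boundary
    let audioChunks = try await chunker.chunkAll(
        audioArray: audioArray,
        maxChunkLength: maxChunkLength,
        decodeOptions: nil
    )

    // Transcribe each chunk independently, keeping per-chunk successes and failures
    var chunkedResults = [Result<[TranscriptionResult], Swift.Error>]()
    for chunk in audioChunks {
        do {
            chunkedResults.append(.success(try await transcribeChunk(chunk.audioSamples)))
        } catch {
            chunkedResults.append(.failure(error))
        }
    }

    // Shift each segment's timings from chunk-relative back to the full-file timeline
    return chunker.updateSeekOffsetsForResults(
        chunkedResults: chunkedResults,
        audioChunks: audioChunks
    )
}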
7 changes: 4 additions & 3 deletions Examples/WhisperAX/WhisperAX.xcodeproj/project.pbxproj
@@ -893,14 +893,15 @@
LD_RUNPATH_SEARCH_PATHS = "@executable_path/Frameworks";
"LD_RUNPATH_SEARCH_PATHS[sdk=macosx*]" = "@executable_path/../Frameworks";
MACOSX_DEPLOYMENT_TARGET = 14.0;
MARKETING_VERSION = 0.2.0;
PRODUCT_BUNDLE_IDENTIFIER = com.argmax.whisperkit.WhisperAX;
MARKETING_VERSION = 0.3.0;
PRODUCT_BUNDLE_IDENTIFIER = "com.argmax.whisperkit.WhisperAX${DEVELOPMENT_TEAM}";
PRODUCT_NAME = "$(TARGET_NAME)";
SDKROOT = auto;
SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx";
SUPPORTS_MACCATALYST = NO;
SUPPORTS_XR_DESIGNED_FOR_IPHONE_IPAD = YES;
SWIFT_EMIT_LOC_STRINGS = YES;
SWIFT_OPTIMIZATION_LEVEL = "-Onone";
SWIFT_VERSION = 5.0;
TARGETED_DEVICE_FAMILY = "1,2";
};
@@ -938,7 +939,7 @@
LD_RUNPATH_SEARCH_PATHS = "@executable_path/Frameworks";
"LD_RUNPATH_SEARCH_PATHS[sdk=macosx*]" = "@executable_path/../Frameworks";
MACOSX_DEPLOYMENT_TARGET = 14.0;
MARKETING_VERSION = 0.2.0;
MARKETING_VERSION = 0.3.0;
PRODUCT_BUNDLE_IDENTIFIER = com.argmax.whisperkit.WhisperAX;
PRODUCT_NAME = "$(TARGET_NAME)";
SDKROOT = auto;
262 changes: 191 additions & 71 deletions Examples/WhisperAX/WhisperAX/Views/ContentView.swift

Large diffs are not rendered by default.

23 changes: 5 additions & 18 deletions Examples/WhisperAX/WhisperAXWatchApp/WhisperAXExampleView.swift
@@ -580,24 +580,11 @@ struct WhisperAXWatchView: View {
}

if useVAD {
// Retrieve the current relative energy values from the audio processor
let currentRelativeEnergy = whisperKit.audioProcessor.relativeEnergy

// Calculate the number of energy values to consider based on the duration of the next buffer
// Each energy value corresponds to 1 buffer length (100ms of audio), hence we divide by 0.1
let energyValuesToConsider = Int(nextBufferSeconds / 0.1)

// Extract the relevant portion of energy values from the currentRelativeEnergy array
let nextBufferEnergies = currentRelativeEnergy.suffix(energyValuesToConsider)

// Determine the number of energy values to check for voice presence
// Considering up to the last 1 second of audio, which translates to 10 energy values
let numberOfValuesToCheck = max(10, nextBufferEnergies.count - 10)

// Check if any of the energy values in the considered range exceed the silence threshold
// This indicates the presence of voice in the buffer
let voiceDetected = nextBufferEnergies.prefix(numberOfValuesToCheck).contains { $0 > Float(silenceThreshold) }

let voiceDetected = AudioProcessor.isVoiceDetected(
in: whisperKit.audioProcessor.relativeEnergy,
nextBufferInSeconds: nextBufferSeconds,
silenceThreshold: Float(silenceThreshold)
)
// Only run the transcribe if the next buffer has voice
guard voiceDetected else {
await MainActor.run {
110 changes: 110 additions & 0 deletions Sources/WhisperKit/Core/AudioChunker.swift
@@ -0,0 +1,110 @@
// For licensing see accompanying LICENSE.md file.
// Copyright © 2024 Argmax, Inc. All rights reserved.

import Accelerate
import AVFoundation
import Foundation

/// Responsible for chunking audio into smaller pieces
@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
public protocol AudioChunking {
func chunkAll(audioArray: [Float], maxChunkLength: Int, decodeOptions: DecodingOptions?) async throws -> [AudioChunk]
}

@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
public extension AudioChunking {
func updateSeekOffsetsForResults(
chunkedResults: [Result<[TranscriptionResult], Swift.Error>],
audioChunks: [AudioChunk]
) -> [TranscriptionResult] {
var updatedTranscriptionResults = [TranscriptionResult]()
for (index, chunkedResult) in chunkedResults.enumerated() {
switch chunkedResult {
case let .success(results):
let seekTime = Float(audioChunks[index].seekOffsetIndex) / Float(WhisperKit.sampleRate)
for result in results {
var updatedSegments = [TranscriptionSegment]()
for segment in result.segments {
let updatedSegment = updateSegmentTimings(segment: segment, seekTime: seekTime)
updatedSegments.append(updatedSegment)
}
var updatedResult = result
updatedResult.seekTime = seekTime
updatedResult.segments = updatedSegments
updatedTranscriptionResults.append(updatedResult)
}
case let .failure(error):
Logging.debug("Error transcribing chunk \(index): \(error)")
}
}
return updatedTranscriptionResults
}
}

/// An audio chunker that splits audio into smaller pieces based on voice activity detection
@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
open class VADAudioChunker: AudioChunking {
/// prevent hallucinations at the end of the clip by stopping up to 1.0s early
private let windowPadding: Int
private let vad = EnergyVAD()

init(windowPadding: Int = 16000) {
self.windowPadding = windowPadding
}

private func splitOnMiddleOfLongestSilence(audioArray: [Float], startIndex: Int, endIndex: Int) -> Int {
// NOTE: we want to check just the 2nd part for the silence to attempt to get closest to a max length chunk
let audioMidIndex = startIndex + (endIndex - startIndex) / 2
let vadAudioSlice = Array(audioArray[audioMidIndex..<endIndex])
let voiceActivity = vad.voiceActivity(in: vadAudioSlice)
if let silence = vad.findLongestSilence(in: voiceActivity) {
// if silence is detected we take the middle point of the silent chunk
let silenceMidIndex = silence.startIndex + (silence.endIndex - silence.startIndex) / 2
return audioMidIndex + vad.voiceActivityIndexToAudioSampleIndex(silenceMidIndex)
}
return endIndex
}

public func chunkAll(audioArray: [Float], maxChunkLength: Int, decodeOptions: DecodingOptions?) async throws -> [AudioChunk] {
// If the audio array length is less than or equal to maxLength, return it as a single chunk
if audioArray.count <= maxChunkLength {
return [AudioChunk(seekOffsetIndex: 0, audioSamples: audioArray)]
}

// First create chunks from seek clips
let seekClips = prepareSeekClips(contentFrames: audioArray.count, decodeOptions: decodeOptions)

var chunkedAudio = [AudioChunk]()
for (seekClipStart, seekClipEnd) in seekClips {
// Loop through the current clip until we reach the end
// Typically this will be the full audio file, unless seek points are explicitly provided
var startIndex = seekClipStart
while startIndex < seekClipEnd - windowPadding {
let currentFrameLength = startIndex - seekClipStart
if startIndex >= currentFrameLength, startIndex < 0 {
throw WhisperError.audioProcessingFailed("startIndex is outside the buffer size")
}

// Make sure we still need chunking for this seek clip, otherwise use the original seek clip end
var endIndex = seekClipEnd
if startIndex + maxChunkLength < endIndex {
// Adjust the end index based on VAD
endIndex = splitOnMiddleOfLongestSilence(
audioArray: audioArray,
startIndex: startIndex,
endIndex: min(audioArray.count, startIndex + maxChunkLength)
)
}

guard endIndex > startIndex else {
break
}
Logging.debug("Found chunk from \(formatTimestamp(Float(startIndex) / Float(WhisperKit.sampleRate))) to \(formatTimestamp(Float(endIndex) / Float(WhisperKit.sampleRate)))")
let audioSlice = AudioChunk(seekOffsetIndex: startIndex, audioSamples: Array(audioArray[startIndex..<endIndex]))
chunkedAudio.append(audioSlice)
startIndex = endIndex
}
}
return chunkedAudio
}
}
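
The chunker above relies on an EnergyVAD type whose source is not shown in this excerpt. As a rough, self-contained sketch of the contract the chunker appears to assume (100 ms voice-activity frames at 16 kHz, one boolean per frame, and a frame-index-to-sample-index mapping), something like the following could be built on the calculateVoiceActivityInChunks helper added to AudioProcessor.swift below. The frame length, threshold, and method bodies are assumptions for illustration, not the shipped implementation.

import WhisperKit

// Sketch of the EnergyVAD surface that VADAudioChunker calls into; names mirror the
// calls above, but the implementation here is assumed, not taken from this commit.
@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
struct EnergyVADSketch {
    let sampleRate = 16_000
    let frameLengthSamples = 1_600 // assumed: 100 ms of audio per voice-activity frame

    // One Bool per frame: true where the frame's average energy exceeds the threshold
    func voiceActivity(in waveform: [Float], energyThreshold: Float = 0.022) -> [Bool] {
        let chunkCount = (waveform.count + frameLengthSamples - 1) / frameLengthSamples
        return AudioProcessor.calculateVoiceActivityInChunks(
            of: waveform,
            chunkCount: chunkCount,
            frameLengthSamples: frameLengthSamples,
            energyThreshold: energyThreshold
        )
    }

    // Longest run of consecutive silent frames, returned as a frame-index range
    func findLongestSilence(in voiceActivity: [Bool]) -> (startIndex: Int, endIndex: Int)? {
        var longest: (startIndex: Int, endIndex: Int)?
        var runStart: Int?
        for (index, isVoice) in voiceActivity.enumerated() {
            if !isVoice {
                if runStart == nil { runStart = index }
            } else if let start = runStart {
                if longest == nil || index - start > longest!.endIndex - longest!.startIndex {
                    longest = (start, index)
                }
                runStart = nil
            }
        }
        if let start = runStart,
           longest == nil || voiceActivity.count - start > longest!.endIndex - longest!.startIndex {
            longest = (start, voiceActivity.count)
        }
        return longest
    }

    // Map a voice-activity frame index back to an audio sample index
    func voiceActivityIndexToAudioSampleIndex(_ index: Int) -> Int {
        index * frameLengthSamples
    }
}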
95 changes: 86 additions & 9 deletions Sources/WhisperKit/Core/AudioProcessor.swift
@@ -64,7 +64,7 @@ public protocol AudioProcessing {

/// Stops recording and cleans up resources
func stopRecording()

/// Resume recording audio from the specified input device, appending to continuous `audioArray` after pause
func resumeRecordingLive(inputDeviceID: DeviceID?, callback: (([Float]) -> Void)?) throws
}
@@ -74,7 +74,7 @@ public extension AudioProcessing {
func startRecordingLive(inputDeviceID: DeviceID? = nil, callback: (([Float]) -> Void)?) throws {
try startRecordingLive(inputDeviceID: inputDeviceID, callback: callback)
}

func resumeRecordingLive(inputDeviceID: DeviceID? = nil, callback: (([Float]) -> Void)?) throws {
try resumeRecordingLive(inputDeviceID: inputDeviceID, callback: callback)
}
@@ -157,7 +157,7 @@ public extension AudioProcessing {

@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
public class AudioProcessor: NSObject, AudioProcessing {
private var lastInputDevice:DeviceID?
private var lastInputDevice: DeviceID?
public var audioEngine: AVAudioEngine?
public var audioSamples: ContiguousArray<Float> = []
public var audioEnergy: [(rel: Float, avg: Float, max: Float, min: Float)] = []
@@ -292,6 +292,84 @@

// MARK: - Utility

/// Detect voice activity in the given buffer of relative energy values.
/// - Parameters:
/// - relativeEnergy: relative energy values
/// - nextBufferInSeconds: duration of the next buffer in seconds
/// - energyValuesToConsider: number of energy values to consider
/// - silenceThreshold: silence threshold
/// - Returns: true if voice is detected, false otherwise
public static func isVoiceDetected(
in relativeEnergy: [Float],
nextBufferInSeconds: Float,
silenceThreshold: Float
) -> Bool {
// Calculate the number of energy values to consider based on the duration of the next buffer
// Each energy value corresponds to 1 buffer length (100ms of audio), hence we divide by 0.1
let energyValuesToConsider = Int(nextBufferInSeconds / 0.1)

// Extract the relevant portion of energy values from the currentRelativeEnergy array
let nextBufferEnergies = relativeEnergy.suffix(energyValuesToConsider)

// Determine the number of energy values to check for voice presence
// Considering up to the last 1 second of audio, which translates to 10 energy values
let numberOfValuesToCheck = max(10, nextBufferEnergies.count - 10)

// Check if any of the energy values in the considered range exceed the silence threshold
// This indicates the presence of voice in the buffer
return nextBufferEnergies.prefix(numberOfValuesToCheck).contains { $0 > silenceThreshold }
}

/// Calculate non-silent chunks of an audio.
/// - Parameter signal: audio signal
/// - Returns: an array of tuples indicating the start and end indices of non-silent chunks
public static func calculateNonSilentChunks(
in signal: [Float]
) -> [(startIndex: Int, endIndex: Int)] {
EnergyVAD().calculateActiveChunks(in: signal)
}

/// Calculate voice activity in chunks of an audio based on energy threshold.
/// - Parameters:
/// - signal: Audio signal
/// - chunkCount: Number of chunks
/// - frameLengthSamples: Frame length in samples
/// - frameOverlapSamples: frame overlap in samples, this is helpful to catch large energy values at the very end of a frame
/// - energyThreshold: Energy threshold for silence detection, default is 0.05. Chunks with energy below this threshold are considered silent.
/// - Returns: An array of booleans indicating whether each chunk is non-silent
public static func calculateVoiceActivityInChunks(
of signal: [Float],
chunkCount: Int,
frameLengthSamples: Int,
frameOverlapSamples: Int = 0,
energyThreshold: Float = 0.022
) -> [Bool] {
var chunkEnergies = [Float]()
for chunkIndex in 0..<chunkCount {
let startIndex = chunkIndex * frameLengthSamples
let endIndex = min(startIndex + frameLengthSamples + frameOverlapSamples, signal.count)
let chunk = Array(signal[startIndex..<endIndex])
let avgEnergy = calculateAverageEnergy(of: chunk)
chunkEnergies.append(avgEnergy)
}

let vadResult = chunkEnergies.map { $0 > energyThreshold }

return vadResult
}

/// Calculate average energy of a signal chunk.
/// - Parameter signal: Chunk of audio signal.
/// - Returns: Average (RMS) energy of the signal chunk.
public static func calculateAverageEnergy(of signal: [Float]) -> Float {
var rmsEnergy: Float = 0.0
vDSP_rmsqv(signal, 1, &rmsEnergy, vDSP_Length(signal.count))
return rmsEnergy
}

/// Calculate energy of a signal chunk.
/// - Parameter signal: Chunk of audio signal.
/// - Returns: Tuple containing average (RMS energy), maximum, and minimum values.
public static func calculateEnergy(of signal: [Float]) -> (avg: Float, max: Float, min: Float) {
var rmsEnergy: Float = 0.0
var minEnergy: Float = 0.0
@@ -310,7 +310,7 @@ public class AudioProcessor: NSObject, AudioProcessing {
}

public static func calculateRelativeEnergy(of signal: [Float], relativeTo reference: Float?) -> Float {
let signalEnergy = calculateEnergy(of: signal).avg
let signalEnergy = calculateAverageEnergy(of: signal)

// Make sure reference is greater than 0
// Default 1e-3 measured empirically in a silent room
@@ -595,19 +673,18 @@ public extension AudioProcessor {

// Set the callback
audioBufferCallback = callback

lastInputDevice = inputDeviceID
}

func resumeRecordingLive(inputDeviceID: DeviceID? = nil, callback: (([Float]) -> Void)? = nil) throws {
try? setupAudioSessionForDevice()
if inputDeviceID == lastInputDevice{

if inputDeviceID == lastInputDevice {
try audioEngine?.start()
} else {
audioEngine = try setupEngine(inputDeviceID: inputDeviceID)
}


// Set the callback only if the provided callback is not nil
if let callback = callback {
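
A short usage sketch of the static VAD helpers added above, as streaming code might call them. The one-second buffer duration is an example value, and the 0.3 threshold mirrors the silenceThreshold default used by AudioStreamTranscriber below; the function itself is illustrative and not part of this commit.

import WhisperKit

// Example: gate the next transcription pass on the energy-based VAD helpers.
// `audioProcessor` is assumed to be actively recording; the values are illustrative.
@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
func shouldTranscribeNextBuffer(audioProcessor: AudioProcessor) -> Bool {
    let nextBufferSeconds: Float = 1.0 // audio accumulated since the last transcription pass
    let silenceThreshold: Float = 0.3  // relative-energy threshold for treating a frame as voice

    return AudioProcessor.isVoiceDetected(
        in: audioProcessor.relativeEnergy,
        nextBufferInSeconds: nextBufferSeconds,
        silenceThreshold: silenceThreshold
    )
}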
25 changes: 6 additions & 19 deletions Sources/WhisperKit/Core/AudioStreamTranscriber.swift
@@ -50,7 +50,7 @@ public actor AudioStreamTranscriber {
decodingOptions: DecodingOptions,
requiredSegmentsForConfirmation: Int = 2,
silenceThreshold: Float = 0.3,
compressionCheckWindow: Int = 20,
compressionCheckWindow: Int = 60,
useVAD: Bool = true,
stateChangeCallback: AudioStreamTranscriberCallback?
) {
@@ -139,24 +139,11 @@
}

if useVAD {
// Retrieve the current relative energy values from the audio processor
let currentRelativeEnergy = audioProcessor.relativeEnergy

// Calculate the number of energy values to consider based on the duration of the next buffer
// Each energy value corresponds to 1 buffer length (100ms of audio), hence we divide by 0.1
let energyValuesToConsider = Int(nextBufferSeconds / 0.1)

// Extract the relevant portion of energy values from the currentRelativeEnergy array
let nextBufferEnergies = currentRelativeEnergy.suffix(energyValuesToConsider)

// Determine the number of energy values to check for voice presence
// Considering up to the last 1 second of audio, which translates to 10 energy values
let numberOfValuesToCheck = max(10, nextBufferEnergies.count - 10)

// Check if any of the energy values in the considered range exceed the silence threshold
// This indicates the presence of voice in the buffer
let voiceDetected = nextBufferEnergies.prefix(numberOfValuesToCheck).contains { $0 > Float(silenceThreshold) }

let voiceDetected = AudioProcessor.isVoiceDetected(
in: audioProcessor.relativeEnergy,
nextBufferInSeconds: nextBufferSeconds,
silenceThreshold: silenceThreshold
)
// Only run the transcribe if the next buffer has voice
if !voiceDetected {
Logging.debug("No voice detected, skipping transcribe")