Merged
4 changes: 4 additions & 0 deletions Playground.xcodeproj/project.pbxproj
@@ -29,6 +29,7 @@
74F3B7C12E1CF4F400C544D1 /* AudioProcess.swift in Sources */ = {isa = PBXBuildFile; fileRef = 74F3B7C02E1CF4F400C544D1 /* AudioProcess.swift */; };
74F860942E29A9D20007163C /* ProcessTapper.swift in Sources */ = {isa = PBXBuildFile; fileRef = 74F860932E29A9D20007163C /* ProcessTapper.swift */; };
74F860962E2B19060007163C /* CoreAudioUtils.swift in Sources */ = {isa = PBXBuildFile; fileRef = 74F860952E2B19060007163C /* CoreAudioUtils.swift */; };
74F897792E4F9B130045252E /* TranscriptionModeSelection.swift in Sources */ = {isa = PBXBuildFile; fileRef = 74F897782E4F9B130045252E /* TranscriptionModeSelection.swift */; };
/* End PBXBuildFile section */

/* Begin PBXCopyFilesBuildPhase section */
@@ -79,6 +80,7 @@
74F3B7C02E1CF4F400C544D1 /* AudioProcess.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AudioProcess.swift; sourceTree = "<group>"; };
74F860932E29A9D20007163C /* ProcessTapper.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ProcessTapper.swift; sourceTree = "<group>"; };
74F860952E2B19060007163C /* CoreAudioUtils.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = CoreAudioUtils.swift; sourceTree = "<group>"; };
74F897782E4F9B130045252E /* TranscriptionModeSelection.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = TranscriptionModeSelection.swift; sourceTree = "<group>"; };
/* End PBXFileReference section */

/* Begin PBXFrameworksBuildPhase section */
@@ -130,6 +132,7 @@
1677AFE42B5769E5008C61C0 /* Views */ = {
isa = PBXGroup;
children = (
74F897782E4F9B130045252E /* TranscriptionModeSelection.swift */,
74312CDD2E1DA46C000D994A /* StreamResultView.swift */,
1677AFE52B57704E008C61C0 /* ContentView.swift */,
74F3B7BB2E1C7C8B00C544D1 /* ToastMessage.swift */,
@@ -292,6 +295,7 @@
746E4C062E39874F009623D7 /* DefaultEnvInitializer.swift in Sources */,
1677AFC22B57618A008C61C0 /* Playground.swift in Sources */,
748BA5502E1B2EC6008DA1B8 /* StreamViewModel.swift in Sources */,
74F897792E4F9B130045252E /* TranscriptionModeSelection.swift in Sources */,
746E4C0A2E398757009623D7 /* PlaygroundEnvInitializer.swift in Sources */,
74F3B7BC2E1C7C8B00C544D1 /* ToastMessage.swift in Sources */,
74312CDE2E1DA46C000D994A /* StreamResultView.swift in Sources */,
4 changes: 4 additions & 0 deletions Playground/Info.plist
@@ -9,5 +9,9 @@
<key>NSPrivacyAccessedAPIType</key>
<string>NSPrivacyAccessedAPICategoryUserDefaults</string>
</dict>
<key>UIBackgroundModes</key>
<array>
<string>audio</string>
</array>
</dict>
</plist>
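Note: declaring the `audio` value under `UIBackgroundModes` is the standard Info.plist opt-in that lets iOS keep an active audio session running while the app is backgrounded, so a live transcription session presumably survives an app switch instead of being suspended.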
29 changes: 26 additions & 3 deletions Playground/ViewModels/StreamViewModel.swift
@@ -50,6 +50,9 @@ class StreamViewModel: ObservableObject {
let sdkCoordinator: ArgmaxSDKCoordinator

private var streamTasks: [Task<Void, Never>] = []
// Throttle guards to avoid overwhelming the UI with high-frequency updates
private var lastEnergyUpdateAt: TimeInterval = 0
private var lastHypothesisUpdateAtBySource: [String: TimeInterval] = [:]

// Currently active streaming sources, set only in startTranscribing
private var curActiveStreamSrcs: [any StreamSourceProtocol] = []
@@ -282,9 +285,16 @@
private func handleResult(_ result: LiveResult, for sourceId: String) {
switch result {
case .hypothesis(let text, _):
let now = Date().timeIntervalSince1970
let last = lastHypothesisUpdateAtBySource[sourceId] ?? 0
// Update at most 10 times per second per source
guard now - last >= 0.1 else { return }
lastHypothesisUpdateAtBySource[sourceId] = now
let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
guard trimmed != (isDeviceSource(sourceId) ? deviceResult?.hypothesisText : systemResult?.hypothesisText) else { return }
updateStreamResult(sourceId: sourceId) { oldResult in
var newResult = oldResult
newResult.hypothesisText = text.trimmingCharacters(in: .whitespacesAndNewlines)
newResult.hypothesisText = trimmed
return newResult
}

@@ -311,10 +321,23 @@
@MainActor
private func updateAudioMetrics(for source: ArgmaxSource, audioData: [Float]) {
if case .device = source.streamType, let whisperKitPro = self.sdkCoordinator.whisperKit {
let now = Date().timeIntervalSince1970
guard now - lastEnergyUpdateAt >= 0.1 else { return }
lastEnergyUpdateAt = now

// Limit the number of energy samples passed to the UI for performance
let energies = whisperKitPro.audioProcessor.relativeEnergy
#if os(iOS)
let newBufferEnergy = Array(energies.suffix(256))
#else
let newBufferEnergy = energies
#endif
let sampleCount = whisperKitPro.audioProcessor.audioSamples.count

updateStreamResult(sourceId: source.id) { oldResult in
var newResult = oldResult
newResult.bufferEnergy = whisperKitPro.audioProcessor.relativeEnergy
newResult.bufferSeconds = Double(whisperKitPro.audioProcessor.audioSamples.count) / Double(WhisperKit.sampleRate)
newResult.bufferEnergy = newBufferEnergy
newResult.bufferSeconds = Double(sampleCount) / Double(WhisperKit.sampleRate)
return newResult
}
}
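Both `handleResult` and `updateAudioMetrics` now apply the same 10 Hz guard: compare the current timestamp against the last accepted one and drop the update if it arrives too soon. A minimal standalone sketch of that pattern, with a hypothetical `PerSourceThrottle` helper name (the PR inlines this logic instead):

```swift
import Foundation

/// Drops events arriving less than `interval` seconds after the last
/// accepted event for the same source. Hypothetical helper; the PR keeps
/// this logic inline in StreamViewModel.
struct PerSourceThrottle {
    let interval: TimeInterval
    private var lastAcceptedAt: [String: TimeInterval] = [:]

    /// Returns true (and records the timestamp) if the event should pass.
    mutating func shouldAccept(_ sourceId: String) -> Bool {
        let now = Date().timeIntervalSince1970
        guard now - (lastAcceptedAt[sourceId] ?? 0) >= interval else { return false }
        lastAcceptedAt[sourceId] = now
        return true
    }
}

// Usage: allow at most 10 UI updates per second per stream source.
var throttle = PerSourceThrottle(interval: 0.1)
if throttle.shouldAccept("device") { /* push update to the UI */ }
```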
139 changes: 74 additions & 65 deletions Playground/Views/ContentView.swift
@@ -27,7 +27,7 @@ import Hub
///
/// The view integrates with several key components:
/// - `StreamViewModel`: Manages real-time audio streaming and transcription
/// - `TranscribeViewModel`: Handles file-based transcription and recording workflows
/// - `ArgmaxSDKCoordinator`: Coordinates access to WhisperKit and SpeakerKit instances
/// - Audio discovery services for device and process selection (macOS)
///
@@ -75,6 +75,8 @@ struct ContentView: View {
@AppStorage("silenceThreshold") private var silenceThreshold: Double = 0.2
@AppStorage("maxSilenceBufferLength") private var maxSilenceBufferLength: Double = 10.0
@AppStorage("transcribeInterval") private var transcribeInterval: Double = 0.1
@AppStorage("minProcessInterval") private var minProcessInterval: Double = 0.0
@AppStorage("transcriptionMode") private var transcriptionModeRawValue: String = TranscriptionModeSelection.voiceTriggered.rawValue
@AppStorage("useVAD") private var useVAD: Bool = true
@AppStorage("tokenConfirmationsNeeded") private var tokenConfirmationsNeeded: Double = 2
@AppStorage("concurrentWorkerCount") private var concurrentWorkerCount: Double = 4
@@ -91,6 +93,16 @@
@AppStorage("fastLoadDecoderComputeUnits") private var fastLoadDecoderComputeUnits: MLComputeUnits = .cpuAndNeuralEngine
#endif
@AppStorage("trackingPermissionStatePro") private var trackingPermissionStateRawValue: Int = TrackingPermissionState.undetermined.rawValue

/// Computed property to work with transcription mode as an enum
private var transcriptionMode: TranscriptionModeSelection {
get {
TranscriptionModeSelection(rawValue: transcriptionModeRawValue) ?? .voiceTriggered
}
set {
transcriptionModeRawValue = newValue.rawValue
}
}

// MARK: Standard properties

@@ -139,7 +151,6 @@

// MARK: Alerts

@State private var showReportingAlert = false
@State private var showShortAudioWarningAlert: Bool = false
@State private var showPermissionAlert: Bool = false
@State private var permissionAlertMessage: String = ""
@@ -184,18 +195,6 @@
set: { newValue in
trackingPermissionStateRawValue = newValue ? TrackingPermissionState.granted.rawValue : TrackingPermissionState.denied.rawValue
Logging.debug(newValue)

if newValue {
sdkCoordinator.setupArgmax()
analyticsLogger.configureIfNeeded()
} else {
Task {
if await ArgmaxSDK.enabled() {
await ArgmaxSDK.close()
}
Logging.debug("Shutting down ArgmaxSDK")
}
}
}
)
}
@@ -348,18 +347,6 @@
#endif
.navigationSplitViewColumnWidth(min: 300, ideal: 350)
.padding(.horizontal)
.alert(isPresented: $showReportingAlert) {
Alert(
title: Text("Performance Reporting"),
message: Text("Help us catch bugs early and improve reliability by enabling reporting and performance monitoring. Required to enable experimental features. Learn more at [argmaxinc.com/privacy](https://www.argmaxinc.com/privacy)"),
primaryButton: .default(Text("Enable reporting")) {
updateTracking(state: .granted)
},
secondaryButton: .cancel(Text("Opt Out")) {
updateTracking(state: .denied)
}
)
}
} detail: {
VStack {
#if os(iOS)
@@ -448,12 +435,6 @@
showWhisperKitComputeUnits = true
speakerKitComputeUnitsExpanded = false

showReportingAlert = (trackingPermissionStateRawValue == 0) // undetermined
if trackingPermissionStateRawValue == TrackingPermissionState.granted.rawValue {
sdkCoordinator.setupArgmax()
analyticsLogger.configureIfNeeded()
}

// Check if Pro models are supported on this OS version
if #unavailable(macOS 15, iOS 18, watchOS 11, visionOS 2) {
showOSVersionAlert = true
@@ -1425,27 +1406,59 @@
}
.padding(.horizontal)

VStack {
Text("Silence Threshold")
Section(header: Text("Stream Mode Settings")) {
HStack {
Slider(value: $silenceThreshold, in: 0...1, step: 0.05)
Text(silenceThreshold.formatted(.number))
.frame(width: 30)
InfoButton("Relative silence threshold for the audio. \n Baseline is set by the quietest 100ms in the previous 2 seconds.")
Picker("Mode", selection: Binding(
get: { TranscriptionModeSelection(rawValue: transcriptionModeRawValue) ?? .voiceTriggered },
set: { transcriptionModeRawValue = $0.rawValue }
)) {
ForEach(TranscriptionModeSelection.allCases) { mode in
Text(mode.displayName).tag(mode)
}
}
.pickerStyle(MenuPickerStyle())
Spacer()
InfoButton(transcriptionMode.description)
}
}
.padding(.horizontal)

VStack {
Text("Max Silence Buffer Size")
HStack {
Slider(value: $maxSilenceBufferLength, in: 10...60, step: 1)
Text(maxSilenceBufferLength.formatted(.number))
.frame(width: 30)
InfoButton("Seconds of silence to buffer before audio is sent for transcription.")

if transcriptionMode == .voiceTriggered {
VStack {
Text("Silence Threshold")
HStack {
Slider(value: $silenceThreshold, in: 0...1, step: 0.05)
Text(silenceThreshold.formatted(.number.precision(.fractionLength(1))))
.frame(width: 30)
.lineLimit(1)
InfoButton("Relative silence threshold for the audio. \n Baseline is set by the quietest 100ms in the previous 2 seconds.")
}
}
.padding(.horizontal)

VStack {
Text("Max Silence Buffer Size")
HStack {
Slider(value: $maxSilenceBufferLength, in: 10...60, step: 1)
Text(maxSilenceBufferLength.formatted(.number.precision(.fractionLength(0))))
.frame(width: 30)
.lineLimit(1)
InfoButton("Seconds of silence to buffer before audio is sent for transcription.")
}
}
.padding(.horizontal)

VStack {
Text("Min Process Interval")
HStack {
Slider(value: $minProcessInterval, in: 0...15, step: 1)
Text(minProcessInterval.formatted(.number.precision(.fractionLength(0))))
.frame(width: 30)
.lineLimit(1)
InfoButton("Minimum interval the incoming stream data is fed to transcription pipeline.")
}
}
.padding(.horizontal)
}
}
.padding(.horizontal)

VStack {
Text("Transcribe Interval")
Expand All @@ -1458,21 +1471,6 @@ struct ContentView: View {
}
.padding(.horizontal)

Section(header: Text("Performance Reporting")) {
VStack(alignment: .leading) {
HStack {
Text("Enable Reporting")
InfoButton("Help us catch bugs early and improve reliability by enabling reporting and performance monitoring.")
Spacer()
Toggle("", isOn: trackingPermissionBinding)
}
Link(destination: URL(string: "https://www.argmaxinc.com/privacy")!) {
Text("Learn more at argmaxinc.com/privacy")
}
}
.padding(.horizontal)
.padding(.top)
}
Section(header: Text("Diarization Settings")) {
HStack {
Picker("Diarization", selection: $diarizationMode) {
@@ -2074,11 +2072,21 @@
isRecording = true
}

let streamMode: StreamTranscriptionMode
switch transcriptionMode {
case .alwaysOn:
streamMode = .alwaysOn
case .voiceTriggered:
streamMode = .voiceTriggered(silenceThreshold: Float(silenceThreshold), maxBufferLength: Float(maxSilenceBufferLength), minProcessInterval: Float(minProcessInterval))
case .batteryOptimized:
streamMode = .batteryOptimized
}

try await streamViewModel.startTranscribing(
options: DecodingOptionsPro(
base: decodingOptions,
transcribeInterval: transcribeInterval,
streamTranscriptionMode: .voiceTriggered(silenceThreshold: Float(silenceThreshold), maxBufferLength: Float(maxSilenceBufferLength))
streamTranscriptionMode: streamMode
)
)
} catch {
@@ -2188,6 +2196,7 @@
"compression_check_window": "\(compressionCheckWindow)",
"sample_length": "\(sampleLength)",
"silence_threshold": "\(silenceThreshold)",
"transcription_mode": "\(transcriptionMode.rawValue)",
"use_vad": "\(useVAD)",
"token_confirmations_needed": "\(tokenConfirmationsNeeded)",
"chunking_strategy": "\(chunkingStrategy)",
7 changes: 3 additions & 4 deletions Playground/Views/StreamResultView.swift
@@ -53,14 +53,13 @@ struct StreamResultLine: View {
.id("bottom")
}
.onChange(of: result.confirmedText) {
withAnimation(.easeOut(duration: 0.3)) {
withAnimation(.easeOut(duration: 0.15)) {
proxy.scrollTo("bottom", anchor: .bottom)
}
}
// Follow the bottom on hypothesis updates without animating each token
.onChange(of: result.hypothesisText) {
withAnimation(.easeOut(duration: 0.3)) {
proxy.scrollTo("bottom", anchor: .bottom)
}
proxy.scrollTo("bottom", anchor: .bottom)
}
}
}
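With hypothesis text already throttled to at most 10 updates per second in `StreamViewModel`, animating every scroll would queue overlapping animations; scrolling without animation on hypothesis changes (and shortening the confirmed-text animation to 0.15 s) keeps the transcript pinned to the bottom at lower rendering cost.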
30 changes: 30 additions & 0 deletions Playground/Views/TranscriptionModeSelection.swift
@@ -0,0 +1,30 @@
/// Enumeration representing the available transcription modes for stream processing.
enum TranscriptionModeSelection: String, CaseIterable, Identifiable {
case alwaysOn = "alwaysOn"
case voiceTriggered = "voiceTriggered"
case batteryOptimized = "batteryOptimized"

var id: String { rawValue }

var displayName: String {
switch self {
case .alwaysOn:
return "Always-On"
case .voiceTriggered:
return "Voice-Triggered"
case .batteryOptimized:
return "Battery-Optimized"
}
}

var description: String {
switch self {
case .alwaysOn:
return "Continuous real-time transcription with lowest latency. Uses more system resources."
case .voiceTriggered:
return "Processes only audio above energy threshold. Conserves battery while staying responsive."
case .batteryOptimized:
return "Intelligent streaming with dynamic optimizations for maximum battery life."
}
}
}
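Because the raw values are stable strings, the selection round-trips cleanly through `@AppStorage`/`UserDefaults`, which is how `ContentView` persists it. A small sketch of the defensive decode (not part of the PR):

```swift
// Persist the selection as its string raw value...
let stored = TranscriptionModeSelection.batteryOptimized.rawValue // "batteryOptimized"

// ...and decode defensively on read, falling back to the default mode
// if the stored value is missing or stale.
let restored = TranscriptionModeSelection(rawValue: stored) ?? .voiceTriggered
assert(restored == .batteryOptimized)
```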
16 changes: 7 additions & 9 deletions Playground/Views/VoiceEnergyView.swift
@@ -1,7 +1,7 @@
import Foundation
import SwiftUI

/// A SwiftUI view that visualizes audio buffer energy levels with threshold-based color coding.
/// This component provides real-time visual feedback for audio input levels and voice activity detection.
///
/// ## Features
@@ -28,14 +28,12 @@ struct VoiceEnergyView: View {

var body: some View {
ScrollView(.horizontal) {
HStack(spacing: 1) {
ForEach(Array(bufferEnergy.enumerated())[0...], id: \.element) { _, energy in
ZStack {
RoundedRectangle(cornerRadius: 2)
.frame(width: 2, height: CGFloat(energy) * 24)
}
.frame(maxHeight: 24)
.background(energy > Float(silenceThreshold) ? Color.green.opacity(0.2) : Color.red.opacity(0.2))
LazyHStack(spacing: 1) {
ForEach(Array(bufferEnergy.enumerated()), id: \.offset) { _, energy in
RoundedRectangle(cornerRadius: 2)
.frame(width: 2, height: max(0, min(CGFloat(energy), 1)) * 24)
.frame(maxHeight: 24)
.background(energy > Float(silenceThreshold) ? Color.green.opacity(0.2) : Color.red.opacity(0.2))
}
}
}
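Three small wins in this view: `LazyHStack` creates bars only as they scroll into view; keying the `ForEach` on `\.offset` instead of `\.element` avoids duplicate-ID collisions when two samples have equal energy; and clamping energy to `0...1` before scaling keeps frame heights within the valid 0–24 pt range even if a sample falls outside the expected bounds.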