# Audio Diarization and Transcription Pipeline

This notebook provides a simple workflow to:
1. Perform speaker diarization using PyAnnote
2. Extract speaker-specific audio
3. Transcribe speaker audio with audio-transcribe
4. Convert JSONL to SRT with timeline mapping

Note: Requires tnh-scholar package to be installed.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Import required libraries
import json
import subprocess
import sys
import tempfile
from io import BytesIO
from pathlib import Path
from typing import List

from IPython.display import Audio, display

from tnh_scholar.audio_processing.diarization import (
    diarize,
    resume_diarization,
)
from tnh_scholar.audio_processing.diarization.audio import AudioHandler
from tnh_scholar.audio_processing.diarization.config import (
    ChunkConfig,
    DiarizationConfig,
    LanguageConfig,
    SpeakerConfig,
)
from tnh_scholar.audio_processing.diarization.models import AugDiarizedSegment
from tnh_scholar.audio_processing.diarization.pyannote_adapter import PyannoteAdapter
from tnh_scholar.audio_processing.diarization.strategies import LanguageProbe, WhisperLanguageDetector
from tnh_scholar.audio_processing.diarization.strategies.speaker_blocker import group_speaker_blocks
from tnh_scholar.audio_processing.diarization.strategies.time_gap import TimeGapChunker
from tnh_scholar.audio_processing.diarization.timeline_mapper import TimelineMapper
from tnh_scholar.audio_processing.diarization.viewer import close_segment_viewer, launch_segment_viewer
from tnh_scholar.audio_processing.timed_object.timed_text import Granularity, TimedText
from tnh_scholar.audio_processing.transcription import patch_whisper_options
from tnh_scholar.audio_processing.transcription.srt_processor import (
    SRTConfig,
    SRTProcessor,
)
from tnh_scholar.audio_processing.transcription.text_segment_builder import TextSegmentBuilder
from tnh_scholar.audio_processing.transcription.transcription_service import (
    TranscriptionResult,
    TranscriptionServiceFactory,
)
from tnh_scholar.audio_processing.utils import (
    get_audio_from_file,
    get_segment_audio,
    play_diarization_segment,
)
from tnh_scholar.utils.file_utils import (
    write_str_to_file,
)


In [3]:
import traceback
import warnings


# Handle warnings with traceback
def warn_with_traceback(message, category, filename, lineno, file=None, line=None):
    log = file if hasattr(file, 'write') else sys.stderr
    traceback.print_stack(file=log)
    log.write(warnings.formatwarning(message, category, filename, lineno, line))

warnings.showwarning = warn_with_traceback

In [4]:
import logging

from tnh_scholar.logging_config import setup_logging

setup_logging(log_level=logging.DEBUG)

<Logger tnh (DEBUG)>

In [19]:
# Configuration - Update these values
# Path to the directory containing audio files

BASE_DIR = Path.home() / "Desktop/sr_d_transcriptions/audio_transcriptions"

# Audio file to process (run this notebook once per file)
AUDIO_FILE_STR = "sr_d_omega_clean.flac"

DIARIZATION_FILE_STR = AUDIO_FILE_STR

SPEAKER_COUNT = 1 # Must be 1, 2 or None. If speakers > 2 use None for best result.

GENERATE_NEW_DIARIZATION = False

DIARIZE_SINGLE_SPEAKER = True

SRT_INCLUDE_SPEAKER = False

LANGUAGE = 'en'

TARGET_CHUNK_TIME = 2 * 60  # seconds

MIN_CHUNK_TIME = 10 # seconds

TRANSCRIBER = "assemblyai"

completed = True


In [20]:
metadata = "" 

In [21]:
diarize_config = DiarizationConfig(
    chunk = ChunkConfig(
        target_duration=TARGET_CHUNK_TIME * 1000,
        min_duration= MIN_CHUNK_TIME * 1000, 
    ),
    speaker = SpeakerConfig(
        single_speaker=DIARIZE_SINGLE_SPEAKER,
    ),
    language = LanguageConfig(),
)

In [22]:
# Set up paths
audio_file_path = BASE_DIR / AUDIO_FILE_STR
diarize_audio_file_path = BASE_DIR / DIARIZATION_FILE_STR

file_ext_str = audio_file_path.suffix

if not audio_file_path.exists():
    raise FileNotFoundError(f"No file found: {audio_file_path}")

diarization_results_path = diarize_audio_file_path.parent / "raw_diarization_results.json"

In [23]:
def load_diarization_result(file_path):
    """Load diarization result from JSON file or sample data."""
    if not file_path:
        raise ValueError("File_path must be provided.")

    with open(file_path, 'r') as f:
        data = json.load(f)

    return data

In [24]:
# Run PyAnnote diarization
if GENERATE_NEW_DIARIZATION:
    completed = False
    print(f"Starting diarization for {diarize_audio_file_path}...")
    result = diarize(diarize_audio_file_path, num_speakers=SPEAKER_COUNT, output_path=diarization_results_path)

    # If the job is still running, you'll get a job ID
    if isinstance(result, str):
        job_id = result
        print(f"Diarization job started with ID: {job_id}")
        print("Wait for completion and then run the next cell with this job ID")
    else:
        completed = True
        print("Diarization process finished on initial run.")

In [25]:
# Only run this if you got a job ID in the previous cell
# Replace with your actual job ID from the previous step
# job_id = "your-job-id-here"  # e.g., "994c79b7-5f32-4715-aa34-33f00e216369"

# Check status

if not completed:
    status = check_job_status(job_id)
    print(f"Current status: {status.get('status', 'unknown')}")

    # Resume if needed
    if status.get('status') != 'succeeded':
        print("Resuming diarization...")
        result = resume_diarization(audio_file_path, job_id)
        print("Diarization completed")
    else:
        print("Diarization already completed")

In [26]:
transcription_options_aai = {"language_code": LANGUAGE, "language_detection": False}

In [27]:
ts_service = TranscriptionServiceFactory.create_service(provider=TRANSCRIBER)

2025-08-16 21:37:47,362 - tnh.tnh_scholar.audio_processing.transcription.assemblyai_service - [1;32mDEBUG[0m - Initialized AssemblyAI service with SDK[0m


In [28]:
transcript = ts_service.transcribe(audio_file_path, transcription_options_aai)

2025-08-16 21:37:50,726 - tnh.tnh_scholar.audio_processing.transcription.assemblyai_service - [36mINFO[0m - Starting synchronous transcription with AssemblyAI SDK[0m


In [29]:
full_seg = transcript.utterance_timing

In [41]:
transcript.raw_result['chapters']

In [30]:

assert full_seg is not None
new_seg = TimedText(segments=full_seg.segments, granularity=Granularity.SEGMENT)

In [31]:
assert full_seg is not None
full_out = new_seg.export_text()

In [32]:
print(full_out)

[A] Dear beloved spiritual family, I know you're here, and I'm very happy and grateful. I'm very aware that it took a lot of efforts and arrangement for you to be here. It took a lot of effort on our part to come here. So every day I cherish it. I haven't been feeling so well the last three, four days. My Lyme disease has been acting up. So I have a lot of physical pain and many other things. But each day I am keenly aware that we have this opportunity to be here together. So I do my best to take care of my health and at the same time, to show up for you. And it gives me such joy and meaning to be with you. When I. After I finished medical school in San Francisco, I went to Lou, my residency right outside of San Francisco. And one day, a doctor whom I was working with, it was at a HIV clinic. And he told me, oh, Dr. Wen, you know, there's a Zen master coming to the States, and he has several retreats. Maybe you would like to go. And he gave me this pamphlet, and I looked at it, and it 

In [33]:
path_out = BASE_DIR / audio_file_path.with_suffix(".txt")
write_str_to_file(path_out, full_out, overwrite=True)

In [23]:
srt_config = SRTConfig(include_speaker=SRT_INCLUDE_SPEAKER) 
srt_processor = SRTProcessor(srt_config)

In [24]:
srt_out = srt_processor.generate(full_seg)

In [None]:
print(srt_out)

In [None]:
# Load and process the diarization results

print(f"Loading diarization results from {diarization_results_path}")
chunker = TimeGapChunker(config=diarize_config)
segment_adapter = PyannoteAdapter(config=diarize_config)
result = load_diarization_result(file_path=diarization_results_path)
data = result['output']
segments = segment_adapter.to_segments(data)
chunk_list = chunker.extract(segments)

for chunk in chunk_list:
    print(f"  chunk: {chunk}")

In [18]:
diarize_raw = data['diarization']

In [19]:
diarize_raw[0]['start']

0.565

In [None]:
data

In [None]:
chunk_list

In [22]:
len(segments)

1215

In [None]:
segments[0]

In [None]:
segments[110]

In [None]:
long_list = [seg for seg in segments if seg.duration_sec > 4.0]
long_list_info = [
    (i, 
    seg.duration_sec, seg.start.to_seconds(), 
    seg.end.to_seconds(), seg.speaker
    ) 
    for i, seg in enumerate(long_list) 
]
long_list_info

In [41]:
speaker_blocks = group_speaker_blocks(segments, config=diarize_config)

In [None]:
len(speaker_blocks)

In [77]:
pid = launch_segment_viewer(speaker_blocks[:250], audio_file_path)

Launching Streamlit viewer with data: /var/folders/rn/6vvb1zdx0z59xqgkpcy8_fx00000gq/T/tmpibtq9pzx.json



  You can now view your Streamlit app in your browser.

  Local URL: http://localhost:8501
  Network URL: http://10.249.8.237:8501



2025-08-12 08:36:24.409 Examining the path of torch.classes raised: Tried to instantiate class '__path__._path', but it does not exist! Ensure that it is registered via torch::class_


In [76]:
close_segment_viewer(pid)

Closed Streamlit viewer (PID 35020)
  Stopping...


In [None]:
[(block.speaker, block.duration) for block in speaker_blocks]

In [171]:
len(long_list)

233

In [252]:
test_idx = 167
seg = segments[test_idx]
print(seg)
play_diarization_segment(seg, base_audio)

speaker='SPEAKER_04' start=TimeMs(358.205s) end=TimeMs(359.185s) audio_map_start=TimeMs(169.100s) gap_before=False spacing_time=TimeMs(0.000s)


Input #0, wav, from '/var/folders/rn/6vvb1zdx0z59xqgkpcy8_fx00000gq/T/tmpj3bctyj0.wav':
  Duration: 00:00:00.98, bitrate: 3072 kb/s
  Stream #0:0: Audio: pcm_s32le ([1][0][0][0] / 0x0001), 48000 Hz, 2 channels, s32, 3072 kb/s
   0.88 M-A: -0.000 fd=   0 aq=    0KB vq=    0KB sq=    0B 




In [174]:
detector = WhisperLanguageDetector()

probe = LanguageProbe(
    config=diarize_config, 
    detector=detector,
)

In [175]:
seg_audio = get_segment_audio(seg, base_audio)

In [176]:
seg_audio

<tnh_scholar.utils.tnh_audio_segment.TNHAudioSegment at 0x1413c4a10>

In [177]:

aug_seg = AugDiarizedSegment.from_segment(segments[test_idx], audio=seg_audio)

In [178]:
aug_seg

AugDiarizedSegment(speaker='SPEAKER_02', start=TimeMs(0.565s), end=TimeMs(13.465s), audio_map_start=None, gap_before=False, spacing_time=0, gap_before_new=False, spacing_time_new=TimeMs(0.000s), audio=<tnh_scholar.utils.tnh_audio_segment.TNHAudioSegment object at 0x1413c4a10>)

In [None]:
probe.segment_language(aug_segment=aug_seg)

In [None]:
import concurrent.futures
import time

from openai import RateLimitError

segments_to_probe = long_list

def probe_segment_safe(probe, aug_segment):
    try:
        return probe.segment_language(aug_segment=aug_segment)
    except RateLimitError:
        print("Rate limit hit, sleeping and retrying...")
        time.sleep(10)  # Wait and retry
        try:
            return probe.segment_language(aug_segment=aug_segment)
        except Exception as e:
            print(f"Failed again: {e}")
            return None
    except Exception as e:
        print(f"Error: {e}")
        return None

# Example: probe all segments in long_list (or chunk_list, or your own list)
max_workers = 1000  # Adjust based on your rate limit
results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = [executor.submit(
        probe_segment_safe, 
        probe, 
        AugDiarizedSegment.from_segment(seg, audio=get_segment_audio(seg, base_audio))
        )
        for seg in segments_to_probe]  # Adjust range as needed
    for future in concurrent.futures.as_completed(futures):
        results.append(future.result())

print("Language probe results:", results)

In [None]:
[chunk.accumulated_time for chunk in chunk_list]

In [None]:
[len(chunk.segments) for chunk in chunk_list]

In [None]:
# Extract speaker audio segments
print("Extracting speaker audio segments to local ByteIO objects")
audio_handler = AudioHandler()
total_chunks = len(chunk_list) 

for i, chunk in enumerate(chunk_list, start=1):
    print(f"Building chunk {i} of {total_chunks}")
    audio_handler.build_audio_chunk(chunk, audio_file=audio_file_path)

In [183]:
len(chunk_list)

22

In [184]:
chunk_list[0]

DiarizationChunk(start_time=565, end_time=126485, audio=AudioChunk(data=<_io.BytesIO object at 0x142e93ec0>, start_ms=565, end_ms=126485, sample_rate=None, channels=None, format='flac'), segments=[DiarizedSegment(speaker='SPEAKER_02', start=TimeMs(0.565s), end=TimeMs(13.465s), audio_map_start=0, gap_before=False, spacing_time=TimeMs(0.000s)), DiarizedSegment(speaker='SPEAKER_04', start=TimeMs(12.865s), end=TimeMs(12.885s), audio_map_start=12900, gap_before=False, spacing_time=TimeMs(-0.600s)), DiarizedSegment(speaker='SPEAKER_02', start=TimeMs(14.325s), end=TimeMs(18.705s), audio_map_start=14360, gap_before=False, spacing_time=TimeMs(1.440s)), DiarizedSegment(speaker='SPEAKER_04', start=TimeMs(16.765s), end=TimeMs(16.805s), audio_map_start=18740, gap_before=False, spacing_time=TimeMs(-1.940s)), DiarizedSegment(speaker='SPEAKER_04', start=TimeMs(17.445s), end=TimeMs(22.325s), audio_map_start=19420, gap_before=False, spacing_time=TimeMs(0.640s)), DiarizedSegment(speaker='SPEAKER_02', sta

In [185]:
chunk_list[0].total_duration_sec

125.92

In [186]:
audio_list = [chunk.audio for chunk in chunk_list]

In [None]:
audio_list

In [188]:
aud_chunk = audio_list[0]

In [189]:
aud_chunk

AudioChunk(data=<_io.BytesIO object at 0x142e93ec0>, start_ms=565, end_ms=126485, sample_rate=None, channels=None, format='flac')

In [None]:
play_audio_mp4(aud_chunk.data)

In [None]:
ts_service = TranscriptionServiceFactory.create_service(provider=TRANSCRIBER)

transcription_options_whisper = {
    "language": LANGUAGE, "timestamp_granularities": ["word"], "prompt": metadata
    }
transcription_options_whisper = patch_whisper_options(
    transcription_options_whisper, file_extension=file_ext_str
    )
transcription_options_aai = {"language_code": LANGUAGE, "language_detection": False}

2025-07-24 21:30:14,547 - tnh.tnh_scholar.audio_processing.transcription.assemblyai_service - [1;32mDEBUG[0m - Created logger with name: tnh.tnh_scholar.audio_processing.transcription.assemblyai_service[0m
2025-07-24 21:30:14,549 - tnh.tnh_scholar.audio_processing.transcription.whisper_service - [1;32mDEBUG[0m - API key updated[0m


In [45]:
transcription_options = transcription_options_whisper \
    if TRANSCRIBER == 'whisper' \
        else transcription_options_aai

In [46]:
chunks_to_process = chunk_list

In [None]:
transcripts: List[TranscriptionResult]= []
for i, chunk in enumerate(chunks_to_process, start=1):
    print(f"processing chunk: {i}")
    audio = chunk.audio
    if not audio:
        raise ValueError("No audio data for chunk.")
    audio_obj = audio.data
    print(f"Running transcript generation with {TRANSCRIBER} service...")
    print(f"Audio file: {audio_obj}")
    transcript = ts_service.transcribe(audio_obj, transcription_options)
    print(transcript)
    transcripts.append(transcript)
    
print("Transcription loop complete.")

In [None]:
transcript.raw_result

In [49]:
len(transcripts)

2

In [54]:
mapper = TimelineMapper()

2025-07-24 21:45:57,423 - tnh.tnh_scholar.audio_processing.diarization.timeline_mapper - [1;32mDEBUG[0m - Created logger with name: tnh.tnh_scholar.audio_processing.diarization.timeline_mapper[0m


In [55]:
timings = []
for chunk, transcript in zip(chunk_list, transcripts):
    tt = transcript.word_timing
    if tt is not None:
        new_timing = mapper.remap(tt, chunk)
        timings.append(new_timing)
    else:
        raise ValueError("No timed text for words.")

In [56]:
len(timings)

2

In [None]:
timings[0]

In [58]:
complete_timing = TimedText.merge(timings)

In [None]:
complete_timing

In [64]:
segment_builder = TextSegmentBuilder(max_duration_ms=4*1000, target_characters=42, ignore_speaker=True)

In [65]:
full_seg = segment_builder.create_segments(complete_timing)

In [None]:
full_seg

In [67]:
srt_config = SRTConfig(include_speaker=SRT_INCLUDE_SPEAKER) 
srt_processor = SRTProcessor(srt_config)

In [68]:
srt_out = srt_processor.generate(full_seg)

In [None]:
print(srt_out)

In [None]:
play_audio_mp4(aud_chunk.data)

In [None]:
display(Audio(str(audio_file_path)))

In [71]:
test_str = "srt_out"

In [72]:
new_ext = ".srt"
new_stem = f"{audio_file_path.stem}_{test_str}"
srt_path = audio_file_path.with_name(new_stem + new_ext)


write_str_to_file(srt_path, srt_out, overwrite=True)

# END OF PROCESS PIPE

In [3]:
# Post-processing: Translate all final SRT files to English
print("\n===== Translating SRT files to English =====")

        
# Find all _final.srt files in each speaker directory
for srt_file in BASE_DIR.glob("*.srt"):
    print(f"file: {srt_file}")
          
    en_srt_file = srt_file.with_name(f"{srt_file.stem}_en.srt")
    
    # Skip if English version already exists
    if en_srt_file.exists():
        print(f"English SRT already exists: {en_srt_file}")
        continue
        
    # Run srt-translate
    cmd = f"srt-translate '{srt_file}' -o '{en_srt_file}' -t en"
    print(f"Running: {cmd}")
    
    try:
        subprocess.run(cmd, shell=True, check=True)
        print(f"Successfully translated: {srt_file} -> {en_srt_file}")
    except subprocess.CalledProcessError as e:
        print(f"Error translating {srt_file}: {e}")

print("===== Translation complete =====")


===== Translating SRT files to English =====


NameError: name 'BASE_DIR' is not defined

In [4]:
# --- Settings ---
srt_folder = BASE_DIR  # <-- Change this to your actual folder
srt_processor = SRTProcessor()

# --- Processing Loop ---
for srt_file in srt_folder.glob("*.srt"):
    # Read original SRT content
    srt_content = srt_file.read_text(encoding="utf-8")

    # Parse to TimedText
    timed_text = srt_processor.parse(srt_content)

    # Re-generate SRT without speaker labels
    cleaned_srt = srt_processor.generate(timed_text, include_speaker=False)

    # Rename original file to *_sp.srt
    speaker_file = srt_file.with_stem(f"{srt_file.stem}_sp")
    srt_file.rename(speaker_file)

    # Save cleaned SRT under original filename
    srt_file.write_text(cleaned_srt, encoding="utf-8")

print("Cleaning and renaming completed.")

Cleaning and renaming completed.


In [None]:
# # Process each speaker's audio
# for speaker, blocks in mapped_blocks.items():
#     speaker_audio_path = export_dir / f"{speaker}.mp3"
#     speaker_output_dir = export_dir / "audio_transcriptions" / speaker
#     audio_transcribe_output_dir = export_dir / "audio_transcriptions"
#     ensure_directory_exists(speaker_output_dir)
    
#     print(f"\nProcessing {speaker}...")
    
#     # Run audio-transcribe on the speaker's audio file
#     cmd = f"audio-transcribe -f {speaker_audio_path} --output_dir {audio_transcribe_output_dir} --split --transcribe"
#     print(f"Running: {cmd}")
#     subprocess.run(cmd, shell=True, check=True)
    

In [1]:
test_list = []

In [2]:
test_list[-1]

IndexError: list index out of range