In [85]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [199]:
import os
import re
import subprocess
import sys
from io import BytesIO
from pathlib import Path

from IPython.display import Audio
from pydub import AudioSegment


In [200]:
# Import the transcription service modules
from tnh_scholar.audio_processing.transcription_service import (
    DiarizationChunker,
    TranscriptionFormatConverter,
    TranscriptionServiceFactory,
)
from tnh_scholar.cli_tools.audio_transcribe.diarize import (
    check_job_status,
    diarize,
    resume_diarization,
)


In [201]:
# Session directory: the audio input and the generated SRT output are both resolved against it.
working_dir = Path.home() / "Desktop/transcription_wouter"

In [None]:
# Source recording expected inside working_dir; fail fast (and say which path
# was checked) if it is missing.
audio_file_base_path = working_dir / "qa_sr_abbess.mp3"
if not audio_file_base_path.exists():
    raise FileNotFoundError(f"Audio file not found: {audio_file_base_path}")

In [None]:
def gen_srt(audio_file_obj, provider="whisper", language=None, local_convert=False):
    """
    Transcribe one audio input and return its subtitle rendering.

    Args:
        audio_file_obj: Audio input handed straight to the transcription
            service (in this notebook, a named in-memory file object).
        provider: Provider name passed to
            ``TranscriptionServiceFactory.create_service``.
        language: Optional language hint. When truthy it is forwarded to the
            service as ``{"language": language}``; otherwise no options are sent.
        local_convert: When true, transcribe with ``service.transcribe`` and
            convert the result with a ``TranscriptionFormatConverter``; when
            false, let the service format the result via
            ``service.transcribe_to_format`` with ``format_type="srt"``.

    Returns:
        Whatever the chosen path returns — presumably SRT text, since callers
        in this notebook treat it as a string (TODO confirm against the
        service/converter API).
    """
    format_type = "srt"
    service = TranscriptionServiceFactory.create_service(provider=provider)

    # Progress information for the notebook output
    print(f"Running {format_type.upper()} generation with {provider} service...")
    print(f"Audio file: {audio_file_obj}")

    if language:
        transcription_options = {"language": language}
    else:
        transcription_options = None

    # Default path: the service produces the formatted output itself.
    if not local_convert:
        return service.transcribe_to_format(
            audio_file_obj,
            format_type=format_type,
            transcription_options=transcription_options,
        )

    # Local path: get the raw transcript, then format it on our side.
    converter = TranscriptionFormatConverter()
    raw_transcript = service.transcribe(audio_file_obj, options=transcription_options)
    return converter.convert(raw_transcript)

In [None]:
def process_audio_chunks(
    audio_path,
    chunks,
    audio_format=None,
    language=None,
    local_convert=False,
    provider="whisper",
):
    """
    Transcribe an audio file chunk by chunk and build SRTs on the full-file timeline.

    Each chunk is sliced out of the loaded audio, exported to an in-memory
    file, transcribed with ``gen_srt``, and its timestamps are shifted by the
    chunk's start time so they refer to the original recording.

    Args:
        audio_path: Path (``str`` or ``pathlib.Path``) to the audio file.
        chunks: Sequence of chunk objects exposing ``start_time`` and
            ``end_time``; they are used as slice bounds on the pydub audio
            and as the SRT offset, i.e. they are treated as milliseconds.
        audio_format: Export format for the per-chunk audio. Defaults to the
            file extension of ``audio_path`` without the leading dot.
        language: Optional language hint forwarded to ``gen_srt``.
        local_convert: Forwarded to ``gen_srt``.
        provider: Transcription provider name forwarded to ``gen_srt``.

    Returns:
        A tuple ``(all_srts, combined_srt)``: the list of per-chunk SRT
        strings (already offset-adjusted) and the single SRT string produced
        by ``combine_srts`` with entries renumbered sequentially.

    Raises:
        ValueError: If ``audio_format`` is not given and ``audio_path`` has
            no file extension to infer it from.
    """
    # Accept plain strings as well as Path objects (``.suffix`` needs a Path).
    audio_path = Path(audio_path)

    if audio_format is None:
        audio_format = audio_path.suffix[1:]
        if not audio_format:
            raise ValueError(
                f"Cannot infer audio format from '{audio_path}'; pass audio_format explicitly."
            )
        print(f"Using audio format: {audio_format}")

    # Load the full audio file once; chunks are sliced from it in memory.
    print(f"Loading audio file: {audio_path}")
    full_audio = AudioSegment.from_file(audio_path)

    all_srts = []
    total_chunks = len(chunks)

    for i, chunk in enumerate(chunks):
        chunk_duration = chunk.end_time - chunk.start_time
        print(f"Processing chunk {i+1}/{total_chunks}: {chunk.start_time}ms "
              f"to {chunk.end_time}ms")
        print(f"chunk duration: {chunk_duration}")

        # pydub slices by milliseconds
        chunk_audio = full_audio[chunk.start_time:chunk.end_time]

        # Export to an in-memory file so nothing is written to disk
        chunk_file = BytesIO()
        chunk_audio.export(chunk_file, format=audio_format)
        chunk_file.seek(0)  # rewind so the service reads from the start

        # Give the buffer a filename with an extension so the service can
        # recognize the audio type
        chunk_file.name = f"chunk_{i}.{audio_format}"

        chunk_srt = gen_srt(
            chunk_file,
            provider=provider,
            language=language,
            local_convert=local_convert,
        )

        # Chunk-relative timestamps -> positions in the original recording
        all_srts.append(adjust_srt_timestamps(chunk_srt, chunk.start_time))

    # Merge the per-chunk SRTs, renumbering entries
    combined_srt = combine_srts(all_srts)

    return all_srts, combined_srt

def adjust_srt_timestamps(srt_content, offset_ms):
    """Shift every SRT timestamp pair in ``srt_content`` by ``offset_ms``.

    Args:
        srt_content: SRT text whose cue lines look like
            ``HH:MM:SS,mmm --> HH:MM:SS,mmm``.
        offset_ms: Milliseconds to add to each start and end time. May be
            negative; a float is rounded to the nearest whole millisecond.

    Returns:
        The SRT text with every matching timestamp pair shifted. Text that
        does not match the timestamp-pair pattern is left unchanged. A
        shifted time that would fall before zero is clamped to
        ``00:00:00,000`` rather than rendered as a negative timestamp.
    """
    # The zero-padded integer formatting below requires whole milliseconds.
    offset_ms = int(round(offset_ms))

    def add_offset_to_timestamp(timestamp_str, offset_ms):
        """Add a millisecond offset to one ``HH:MM:SS,mmm`` timestamp string."""
        h, m, rest = timestamp_str.split(':')
        s, ms = rest.split(',')

        total_ms = int(h) * 3600000 + int(m) * 60000 + int(s) * 1000 + int(ms) + offset_ms
        # Negative totals cannot be expressed in SRT; clamp to the start.
        total_ms = max(total_ms, 0)

        new_h, remainder = divmod(total_ms, 3600000)
        new_m, remainder = divmod(remainder, 60000)
        new_s, new_ms = divmod(remainder, 1000)

        return f"{new_h:02d}:{new_m:02d}:{new_s:02d},{new_ms:03d}"

    # Pattern for SRT timestamp lines
    pattern = r'(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})'

    def replace_timestamps(match):
        new_start = add_offset_to_timestamp(match.group(1), offset_ms)
        new_end = add_offset_to_timestamp(match.group(2), offset_ms)
        return f"{new_start} --> {new_end}"

    # Replace all timestamp pairs in the SRT
    return re.sub(pattern, replace_timestamps, srt_content)

def combine_srts(srt_list):
    """Concatenate several SRT strings into one, renumbering entries from 1.

    Args:
        srt_list: Iterable of SRT strings, in playback order.

    Returns:
        A single SRT string. Entries are separated by exactly one blank line
        and numbered sequentially across all inputs; line endings are
        normalized to ``\\n``. Empty inputs contribute nothing, and an empty
        ``srt_list`` yields ``""``.
    """
    result = []
    entry_num = 1

    for srt in srt_list:
        # Normalize CRLF / CR so that blank-line detection works for any source.
        normalized = srt.replace("\r\n", "\n").replace("\r", "\n").strip()
        if not normalized:
            continue

        # Entries are separated by one or more blank (or whitespace-only) lines.
        for entry in re.split(r"\n\s*\n", normalized):
            if not entry.strip():
                continue

            lines = entry.split("\n")

            if "-->" in lines[0]:
                # No index line present: add one instead of overwriting the timing line.
                lines.insert(0, str(entry_num))
            else:
                # Replace the existing index line with the sequential number.
                lines[0] = str(entry_num)
            entry_num += 1

            result.append("\n".join(lines))

    # Join all entries with blank lines in between
    return "\n\n".join(result)

In [206]:
# Resolve the input against working_dir rather than a user-specific absolute
# path, so the notebook is not tied to one machine's home directory.
audio_file_path = working_dir / "qa_sr_abbess_wh_sh.mp3"

In [180]:
audio_file_path.suffix[1:]

'mp3'

In [181]:
# Run speaker diarization on the file. The log output of this cell shows an
# upload to Pyannote.ai followed by a diarization job being started.
result = diarize(audio_file_path)

2025-04-26 07:44:26,056 - tnh.tnh_scholar.cli_tools.audio_transcribe.pyannote_interface - [36mINFO[0m - Temporary media ID created: media://diarization-20250426074425[0m
2025-04-26 07:44:26,060 - tnh.tnh_scholar.cli_tools.audio_transcribe.pyannote_interface - [36mINFO[0m - Uploading file to Pyannote.ai: /Users/phapman/Desktop/transcription_wouter/qa_sr_abbess_wh_sh.mp3[0m
2025-04-26 07:44:27,929 - tnh.tnh_scholar.cli_tools.audio_transcribe.pyannote_interface - [36mINFO[0m - File uploaded successfully[0m
2025-04-26 07:44:27,931 - tnh.tnh_scholar.cli_tools.audio_transcribe.pyannote_interface - [36mINFO[0m - Starting diarization job for media://diarization-20250426074425[0m
2025-04-26 07:44:28,453 - tnh.tnh_scholar.cli_tools.audio_transcribe.pyannote_interface - [36mINFO[0m - Diarization job started with ID: bcd1adc9-bf6c-41e8-a49d-31d0412c244c[0m
2025-04-26 07:44:28,454 - tnh.tnh_scholar.cli_tools.audio_transcribe.pyannote_interface - [36mINFO[0m - Initial status: create

In [None]:
# NOTE(review): the job ID is hardcoded and differs from the job started by the
# diarize() cell's log (bcd1adc9-...). This cell has no execution count and its
# log is from an earlier date; on Run All it would overwrite `result` — confirm
# whether it is still needed.
result = resume_diarization(audio_file_path, 'd4d35761-ac95-4ddd-b468-5a7471855219')

2025-04-22 10:28:39,410 - tnh.tnh_scholar.cli_tools.audio_transcribe.diarize - [36mINFO[0m - Resuming diarization job: d4d35761-ac95-4ddd-b468-5a7471855219[0m
2025-04-22 10:28:39,411 - tnh.tnh_scholar.cli_tools.audio_transcribe.pyannote_interface - [36mINFO[0m - Polling job d4d35761-ac95-4ddd-b468-5a7471855219 until completion (timeout: 180s)[0m
2025-04-22 10:28:40,085 - tnh.tnh_scholar.cli_tools.audio_transcribe.pyannote_interface - [36mINFO[0m - Job d4d35761-ac95-4ddd-b468-5a7471855219 completed successfully[0m


In [182]:
result

{'diarization': [{'speaker': 'SPEAKER_01', 'start': 98.525, 'end': 102.145},
  {'speaker': 'SPEAKER_01', 'start': 102.565, 'end': 105.225},
  {'speaker': 'SPEAKER_01', 'start': 106.625, 'end': 114.405},
  {'speaker': 'SPEAKER_01', 'start': 114.665, 'end': 116.025},
  {'speaker': 'SPEAKER_01', 'start': 117.125, 'end': 126.005},
  {'speaker': 'SPEAKER_01', 'start': 126.285, 'end': 127.225},
  {'speaker': 'SPEAKER_01', 'start': 128.245, 'end': 134.945},
  {'speaker': 'SPEAKER_01', 'start': 135.105, 'end': 145.225},
  {'speaker': 'SPEAKER_01', 'start': 145.385, 'end': 150.965},
  {'speaker': 'SPEAKER_01', 'start': 151.505, 'end': 154.405},
  {'speaker': 'SPEAKER_01', 'start': 154.945, 'end': 161.125},
  {'speaker': 'SPEAKER_01', 'start': 161.325, 'end': 171.825},
  {'speaker': 'SPEAKER_01', 'start': 172.285, 'end': 173.125},
  {'speaker': 'SPEAKER_01', 'start': 173.745, 'end': 180.385},
  {'speaker': 'SPEAKER_01', 'start': 180.625, 'end': 183.065},
  {'speaker': 'SPEAKER_01', 'start': 183.

In [188]:
# Both durations are 60 * 1000 — presumably milliseconds (i.e. one minute),
# matching the millisecond segment times shown in the cells below; TODO confirm
# against DiarizationChunker.
chunker = DiarizationChunker(
    target_duration=60 * 1000,
    min_chunk_duration=60 * 1000,
    single_speaker=True,
)

In [189]:
# Convert the diarization result into Segment objects (speaker, start, end).
segs = chunker.to_segments(result)

In [190]:
segs

[Segment(speaker='SPEAKER_00', start=98525, end=102145),
 Segment(speaker='SPEAKER_00', start=102565, end=105225),
 Segment(speaker='SPEAKER_00', start=106625, end=114405),
 Segment(speaker='SPEAKER_00', start=114665, end=116025),
 Segment(speaker='SPEAKER_00', start=117125, end=126005),
 Segment(speaker='SPEAKER_00', start=126285, end=127225),
 Segment(speaker='SPEAKER_00', start=128245, end=134945),
 Segment(speaker='SPEAKER_00', start=135105, end=145225),
 Segment(speaker='SPEAKER_00', start=145385, end=150965),
 Segment(speaker='SPEAKER_00', start=151505, end=154405),
 Segment(speaker='SPEAKER_00', start=154945, end=161125),
 Segment(speaker='SPEAKER_00', start=161325, end=171825),
 Segment(speaker='SPEAKER_00', start=172285, end=173125),
 Segment(speaker='SPEAKER_00', start=173745, end=180385),
 Segment(speaker='SPEAKER_00', start=180625, end=183065),
 Segment(speaker='SPEAKER_00', start=183625, end=191625),
 Segment(speaker='SPEAKER_00', start=191965, end=198965),
 Segment(speake

In [191]:
# Group the segments into Chunk objects (start_time, end_time, segments) for transcription.
chunks = chunker.extract_chunks(segs)

In [192]:
len(chunks)

6

In [193]:
chunks[1]

Chunk(start_time=102355, end_time=172055, segments=[Segment(speaker='SPEAKER_00', start=102565, end=105225), Segment(speaker='SPEAKER_00', start=106625, end=114405), Segment(speaker='SPEAKER_00', start=114665, end=116025), Segment(speaker='SPEAKER_00', start=117125, end=126005), Segment(speaker='SPEAKER_00', start=126285, end=127225), Segment(speaker='SPEAKER_00', start=128245, end=134945), Segment(speaker='SPEAKER_00', start=135105, end=145225), Segment(speaker='SPEAKER_00', start=145385, end=150965), Segment(speaker='SPEAKER_00', start=151505, end=154405), Segment(speaker='SPEAKER_00', start=154945, end=161125), Segment(speaker='SPEAKER_00', start=161325, end=171825)])

In [194]:
[(ch.start_time, ch.end_time, ch.duration_sec, ch.segments[0].speaker) for ch in chunks]

[(0, 102355, 102.355, 'SPEAKER_00'),
 (102355, 172055, 69.7, 'SPEAKER_00'),
 (172055, 250505, 78.45, 'SPEAKER_00'),
 (250505, 311105, 60.6, 'SPEAKER_00'),
 (311105, 373965, 62.86, 'SPEAKER_00'),
 (373965, 479725, 105.76, 'SPEAKER_00')]

In [195]:
chunks[1].segments[-1]

Segment(speaker='SPEAKER_00', start=161325, end=171825)

In [196]:
chunks[2].segments[0]

Segment(speaker='SPEAKER_00', start=172285, end=173125)

In [207]:
# Transcribe every chunk with language hint "vi", formatting through the
# local converter path; keeps both the per-chunk SRTs and the merged result.
all_srts, combined = process_audio_chunks(
    audio_file_path,
    chunks,
    language="vi",
    local_convert=True,
)

Using audio format: mp3
Loading audio file: /Users/phapman/Desktop/transcription_wouter/qa_sr_abbess_wh_sh.mp3
Processing chunk 1/6: 0ms to 102355ms
chunk duration: 102355


2025-04-26 07:50:33,375 - tnh.tnh_scholar.audio_processing.transcription_service.whisper_service - [36mINFO[0m - Transcribing audio with Whisper API using model: whisper-1[0m


Running SRT generation with whisper service...
Audio file: /Users/phapman/Desktop/transcription_wouter/qa_sr_abbess_wh_sh.mp3


  Expected `str` but got `float` with value `102.3499984741211` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
2025-04-26 07:50:37,236 - tnh.tnh_scholar.audio_processing.transcription_service.whisper_service - [36mINFO[0m - Transcription completed successfully[0m


now
later
Processing chunk 2/6: 102355ms to 172055ms
chunk duration: 69700


2025-04-26 07:50:37,461 - tnh.tnh_scholar.audio_processing.transcription_service.whisper_service - [36mINFO[0m - Transcribing audio with Whisper API using model: whisper-1[0m


Running SRT generation with whisper service...
Audio file: /Users/phapman/Desktop/transcription_wouter/qa_sr_abbess_wh_sh.mp3


  Expected `str` but got `float` with value `69.69999694824219` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
2025-04-26 07:50:41,337 - tnh.tnh_scholar.audio_processing.transcription_service.whisper_service - [36mINFO[0m - Transcription completed successfully[0m


now
later
Processing chunk 3/6: 172055ms to 250505ms
chunk duration: 78450


2025-04-26 07:50:41,581 - tnh.tnh_scholar.audio_processing.transcription_service.whisper_service - [36mINFO[0m - Transcribing audio with Whisper API using model: whisper-1[0m


Running SRT generation with whisper service...
Audio file: /Users/phapman/Desktop/transcription_wouter/qa_sr_abbess_wh_sh.mp3


  Expected `str` but got `float` with value `78.44999694824219` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
2025-04-26 07:50:45,725 - tnh.tnh_scholar.audio_processing.transcription_service.whisper_service - [36mINFO[0m - Transcription completed successfully[0m


now
later
Processing chunk 4/6: 250505ms to 311105ms
chunk duration: 60600
Running SRT generation with whisper service...
Audio file: /Users/phapman/Desktop/transcription_wouter/qa_sr_abbess_wh_sh.mp3


2025-04-26 07:50:45,926 - tnh.tnh_scholar.audio_processing.transcription_service.whisper_service - [36mINFO[0m - Transcribing audio with Whisper API using model: whisper-1[0m
  Expected `str` but got `float` with value `60.599998474121094` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
2025-04-26 07:50:48,992 - tnh.tnh_scholar.audio_processing.transcription_service.whisper_service - [36mINFO[0m - Transcription completed successfully[0m


now
later
Processing chunk 5/6: 311105ms to 373965ms
chunk duration: 62860


2025-04-26 07:50:49,201 - tnh.tnh_scholar.audio_processing.transcription_service.whisper_service - [36mINFO[0m - Transcribing audio with Whisper API using model: whisper-1[0m


Running SRT generation with whisper service...
Audio file: /Users/phapman/Desktop/transcription_wouter/qa_sr_abbess_wh_sh.mp3


  Expected `str` but got `float` with value `62.86000061035156` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
2025-04-26 07:50:52,277 - tnh.tnh_scholar.audio_processing.transcription_service.whisper_service - [36mINFO[0m - Transcription completed successfully[0m


now
later
Processing chunk 6/6: 373965ms to 479725ms
chunk duration: 105760


2025-04-26 07:50:52,559 - tnh.tnh_scholar.audio_processing.transcription_service.whisper_service - [36mINFO[0m - Transcribing audio with Whisper API using model: whisper-1[0m


Running SRT generation with whisper service...
Audio file: /Users/phapman/Desktop/transcription_wouter/qa_sr_abbess_wh_sh.mp3


  Expected `str` but got `float` with value `105.76000213623047` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
2025-04-26 07:50:56,447 - tnh.tnh_scholar.audio_processing.transcription_service.whisper_service - [36mINFO[0m - Transcription completed successfully[0m


now
later


In [208]:
print(combined)

1
00:00:10,800 --> 00:00:16,680
Chúc mọi người có một ngày tốt đẹp vui vẻ và hạnh phúc

2
00:00:42,040 --> 00:00:42,119
Chúc

3
00:00:42,119 --> 00:00:42,159
mọi

4
00:00:42,159 --> 00:00:42,180
người

5
00:00:42,180 --> 00:00:48,599
có

6
00:00:48,599 --> 00:00:48,599
một

7
00:00:48,599 --> 00:00:48,599
ngày

8
00:00:48,599 --> 00:00:48,599
tốt

9
00:00:48,599 --> 00:00:48,599
đẹp

10
00:00:48,599 --> 00:00:48,599
vui

11
00:00:48,599 --> 00:00:48,599
vẻ

12
00:00:48,599 --> 00:00:48,599
và

13
00:00:48,599 --> 00:00:48,599
hạnh

14
00:00:48,599 --> 00:00:48,599
phúc

15
00:01:06,419 --> 00:01:07,819
Chúc

16
00:01:07,819 --> 00:01:07,819
mọi

17
00:01:07,819 --> 00:01:07,819
người

18
00:01:07,819 --> 00:01:07,819
có

19
00:01:07,819 --> 00:01:07,819
một

20
00:01:07,819 --> 00:01:07,819
ngày

21
00:01:07,819 --> 00:01:10,400
tốt

22
00:01:10,400 --> 00:01:10,459
đẹp

23
00:01:16,059 --> 00:01:17,540
vui

24
00:01:17,540 --> 00:01:17,540
vẻ

25
00:01:17,540 --> 00:01:17,540
và

26
0

In [178]:
print(all_srts[0])

1
00:00:00,000 --> 00:00:05,000
Chúc mọi người có một ngày tốt đẹp, vui vẻ và hạnh phúc!

2
00:00:30,000 --> 00:00:34,000
Chúc mọi người có một ngày tốt đẹp, vui vẻ và hạnh phúc!

3
00:01:00,000 --> 00:01:05,000
Chúc mọi người có một ngày tốt đẹp, vui vẻ và hạnh phúc!

4
00:01:30,000 --> 00:01:46,000
12333

5
00:01:46,000 --> 00:01:48,000
Giờ, con kính Bạch Sư Hồng,

6
00:01:48,000 --> 00:01:50,000
con kính Tư Quý Thầy Quý Sư Cô và toàn thể đại chúng,

7
00:01:50,000 --> 00:01:53,000
thưa sư cô trụ trì!

8
00:01:53,000 --> 00:01:59,000
Chúng con rất biết ơn hôm nay sư cô đã cho chúng con một buổi chia sẻ

9
00:01:59,000 --> 00:02:03,000
một tuần lễ của khoa học nấu ăn trong chánh niệm

10
00:02:03,000 --> 00:02:06,000
do Tư Viện Lọc Uyển tổ chức.

11
00:02:06,000 --> 00:02:12,000
Và hôm nay là ngày 27 tháng 2 năm 2025.

12
00:02:12,000 --> 00:02:16,000
Chúng ta đang ở trong thiền đường Thái Bình Dương của Tư Viện Lọc Uyển.

13
00:02:16,000 --> 00:02:20,000
Và hôm nay buổi chia sẻ sẽ đư

In [149]:
print(combined)

1
00:00:00,000 --> 00:00:23,900
when you are walking

2
00:00:23,900 --> 00:00:25,420
and watching your feed

3
00:00:25,900 --> 00:00:27,660
every day

4
00:00:27,660 --> 00:00:28,299
you will have more than enough

5
00:00:28,299 --> 00:00:29,340
of energy

6
00:00:29,340 --> 00:00:30,180
to know

7
00:00:30,180 --> 00:00:31,160
more

8
00:00:31,160 --> 00:00:31,580
and

9
00:00:31,580 --> 00:00:33,279
instant%

10
00:00:33,279 --> 00:00:33,759
of

11
00:00:33,759 --> 00:00:34,880
yourself

12
00:00:34,880 --> 00:00:34,939
you have

13
00:00:34,939 --> 00:00:35,259
not

14
00:00:35,259 --> 00:00:35,779
enough

15
00:00:35,779 --> 00:00:36,540
energy

16
00:00:36,540 --> 00:00:36,779
to know

17
00:00:36,779 --> 00:00:37,099
both

18
00:00:37,099 --> 00:00:37,560
and

19
00:00:37,560 --> 00:00:37,820
more

20
00:00:37,820 --> 00:00:38,639
and

21
00:00:38,639 --> 00:00:39,340
instantly

22
00:00:39,340 --> 00:00:41,139
the

23
00:00:41,139 --> 00:00:41,660
same

24
00:00:41,660 --> 00

In [84]:
# NOTE(review): this import sits in the last cell; consider moving it to the
# import cell at the top so a fresh Run All surfaces missing dependencies early.
from tnh_scholar.utils.file_utils import write_str_to_file

# NOTE(review): the output filename refers to a different recording than the
# audio file processed in this notebook, and overwrite=True replaces any
# existing file — confirm the target before re-running.
out_srt = working_dir / "Dharma Talk Br. Phap Hoi (for transcription) 2-bit.srt"
write_str_to_file(out_srt, combined, overwrite=True)