## **Long-form Transcription with Thonburian Whisper**

In [None]:
%%capture
# Install the packages this notebook relies on; the %%capture cell magic hides pip's output.
# transformers, datasets, pydub and ipywebrtc are imported directly in later cells.
# attacut and ssg are not imported in this notebook — presumably they are
# dependencies of the copied sentence_segment.py helper (TODO confirm).
!pip install transformers
!pip install attacut
!pip install ssg
!pip install datasets
# NOTE(review): pyarrow is pinned to a specific version, presumably for
# compatibility with `datasets` in this environment — confirm before changing.
!pip install pyarrow==15.0.2
!pip install pydub
!pip install ipywebrtc

In [None]:
%%capture
# Clone the thonburian-whisper repository and copy its long-form helper modules
# into the working directory so that `sentence_segment` and `utils` can be
# imported as top-level modules in the next cell.
!git clone https://github.com/biodatlab/thonburian-whisper/
!cp ./thonburian-whisper/longform_transcription/sentence_segment.py .
!cp ./thonburian-whisper/longform_transcription/utils.py .

In [None]:
import torch
import pandas as pd
from datasets import Audio, Dataset
from transformers import pipeline
from sentence_segment import SyllableSegmentation
from utils import convert_mp4_to_wav, perform_vad, generate_srt, burn_srt_to_video
from pydub import AudioSegment

In [None]:
class LongformTranscriber:
    """
    Transcribe long-form audio files using a pre-trained ASR model.

    The input file is converted to a mono WAV file at the configured sampling
    rate when needed, split into chunks by ``perform_vad``, transcribed chunk by
    chunk with a Hugging Face ASR pipeline, and post-processed with
    ``SyllableSegmentation``.

    Attributes:
        sr (int): Sampling rate for audio processing.
        model_path (str): Path to the pre-trained ASR model.
        chunk_length_s (int): Length of audio chunks for processing in seconds.
        batch_size (int): Batch size for ASR inference.
        language (str): Language code for transcription (e.g., "th" for Thai).
        segment_duration (float): Duration for syllable segmentation in seconds.
        pipe (Pipeline): Hugging Face Transformers pipeline for ASR.
        ss (SyllableSegmentation): Instance of SyllableSegmentation for post-processing.
    """

    def __init__(
        self,
        sr: int = 16000,
        model_path: str = "biodatlab/whisper-th-medium-combined",
        chunk_length_s: int = 30,
        batch_size: int = 4,
        language: str = "th",
        segment_duration: float = 4.0
    ):
        """
        Store the configuration and build the ASR pipeline and the segmenter.

        Inputs:
            sr (int): Sampling rate the audio is converted to before processing.
            model_path (str): Model identifier or path passed to the pipeline.
            chunk_length_s (int): Chunk length (seconds) passed to the pipeline.
            batch_size (int): Batch size used when running the pipeline.
            language (str): Language code passed to generation.
            segment_duration (float): Segment duration passed to the segmenter.
        """
        self.sr = sr
        self.model_path = model_path
        self.chunk_length_s = chunk_length_s
        self.batch_size = batch_size
        self.language = language
        self.segment_duration = segment_duration

        # Initialize ASR pipeline. The dtype is chosen together with the device:
        # half precision is only requested when a GPU is actually available.
        device, torch_dtype = self._select_device()
        self.pipe = pipeline(
            "automatic-speech-recognition",
            model=self.model_path,
            chunk_length_s=self.chunk_length_s,
            device=device,
            torch_dtype=torch_dtype,
        )

        self.ss = SyllableSegmentation()

    @staticmethod
    def _select_device():
        """Return a ``(device, torch_dtype)`` pair: fp16 on CUDA, fp32 on CPU."""
        if torch.cuda.is_available():
            return "cuda:0", torch.float16
        return "cpu", torch.float32

    @staticmethod
    def _converted_wav_path(audio_file):
        """Return ``audio_file`` with its extension replaced by ``_converted.wav``."""
        import os

        # splitext only considers the final path component, so dots in directory
        # names (e.g. "../clips/audio") are never mistaken for an extension.
        stem, _ = os.path.splitext(audio_file)
        return stem + "_converted.wav"

    def convert_audio_to_wav(self, audio_file, target_sr):
        """
        Convert an audio file to a mono WAV file with a specified sampling rate.

        Inputs:
            audio_file (str): Path to the input audio (or video) file.
            target_sr (int): Sampling rate of the exported WAV file.
        Return:
            str: Path of the exported file, ``<input stem>_converted.wav``.
        """
        audio = AudioSegment.from_file(audio_file)
        audio = audio.set_frame_rate(target_sr).set_channels(1)
        output_wav_file = self._converted_wav_path(audio_file)
        audio.export(output_wav_file, format="wav")
        return output_wav_file

    def transcribe(self, audio_path: str):
        """
        Transcribe a long-form audio file.

        Inputs:
            audio_path (str): Path to the input audio file.
        Return:
            list: A list of segments, each containing transcription with start, stop time.
                An empty list is returned when ``perform_vad`` yields no chunks.
        """
        if audio_path.lower().endswith('.wav'):
            # Check sampling rate and convert if necessary
            audio = AudioSegment.from_wav(audio_path)
            if audio.frame_rate != self.sr:
                wav_file = self.convert_audio_to_wav(audio_path, self.sr)
            else:
                wav_file = audio_path
        else:
            # Every other container/codec (.mp4, .mp3, ...) goes through the same
            # conversion so the result is always mono WAV at self.sr.
            wav_file = self.convert_audio_to_wav(audio_path, self.sr)

        _, chunklist = perform_vad(wav_file, 'temp_directory_for_chunks')
        if not chunklist:
            # Nothing to transcribe; avoid running the pipeline on an empty dataset.
            return []

        # for faster inference, create dataset
        audio_dataset = Dataset.from_dict(
            {"audio": [c["fname"] for c in chunklist]}
        ).cast_column("audio", Audio())

        prediction_gen = self.pipe(
            audio_dataset["audio"],
            generate_kwargs={"task": "transcribe", "language": self.language},
            return_timestamps=False,
            batch_size=self.batch_size,
        )

        predictions = list(prediction_gen)
        # NOTE(review): chunk boundaries are divided by self.sr, which presumably
        # converts sample offsets to seconds — this assumes perform_vad reports
        # offsets at the same sampling rate; confirm against utils.perform_vad.
        vad_transcriptions = {
            "start": [(chunk["start"] / self.sr) for chunk in chunklist],
            "end": [(chunk["end"] / self.sr) for chunk in chunklist],
            "prediction": [pred["text"] for pred in predictions]
        }
        uncorrected_segments = self.ss(vad_transcriptions=vad_transcriptions, segment_duration=self.segment_duration)
        return uncorrected_segments

In [None]:
# Optional: record audio in the browser with an ipywebrtc widget.
# Skip this cell if you already have an `audio.mp3` file to transcribe.
from ipywebrtc import AudioRecorder, CameraStream
from google.colab import output
# Enable Colab's custom widget manager before creating the widget
# (presumably required for third-party widgets to render in Colab — confirm).
output.enable_custom_widget_manager()

# Audio-only stream: video is disabled in the constraints.
camera = CameraStream(constraints={'audio': True, 'video': False})
recorder = AudioRecorder(stream=camera)
recorder  # last expression of the cell, so the notebook displays the widget

In [None]:
# Save the audio captured by the recorder widget to `audio.mp3`.
# Skip this cell if you want to transcribe an existing audio file directly.
# NOTE(review): the file is named .mp3, but the bytes written are presumably in
# whatever format the recorder captured — confirm against the ipywebrtc docs.
recorder.save("audio.mp3")

Create a `LongformTranscriber` and transcribe the `audio.mp3` file.

In [None]:
# Settings for the transcriber, spelled out explicitly so they are easy to tweak.
transcriber_settings = {
    "sr": 16000,
    "model_path": "biodatlab/whisper-th-medium-combined",
    "chunk_length_s": 30,
    "batch_size": 4,
    "language": "th",
    "segment_duration": 4.0,
}
# Building the transcriber also builds its ASR pipeline for `model_path`.
transcriber = LongformTranscriber(**transcriber_settings)

In [None]:
# Transcribe the recorded (or uploaded) `audio.mp3` file with the transcriber
# created in the previous cell.
transcriptions = transcriber.transcribe("audio.mp3")
# Last expression of the cell, so the notebook renders the table.
pd.DataFrame(transcriptions)  # transcription in DataFrame format (text, start, end)