## **Long-form Transcription with Thonburian Whisper**

This Jupyter notebook demonstrates the process of performing long-form transcription using Thonburian Whisper, a specialized model for Thai language speech recognition. Thonburian Whisper is an adaptation of OpenAI's Whisper model, fine-tuned on Thai speech data to improve accuracy for Thai language transcription tasks.

In [None]:
%%capture
!pip install transformers
!pip install attacut
!pip install ssg
!pip install datasets
!pip install pyarrow==15.0.2
!pip install pydub
!pip install ipywebrtc
!pip install openai
!pip install yt-dlp --upgrade
!pip install gradio==3.44.3

In [None]:
%%capture
!git clone https://github.com/biodatlab/thonburian-whisper/
!cp ./thonburian-whisper/longform_transcription/sentence_segment.py .
!cp ./thonburian-whisper/longform_transcription/utils.py .

In [None]:
import io
import os
import yt_dlp
import torch
import pandas as pd
from datasets import Audio, Dataset
from transformers import pipeline
from sentence_segment import SyllableSegmentation
from utils import convert_mp4_to_wav, perform_vad, generate_srt, burn_srt_to_video
from pydub import AudioSegment

In [None]:
class LongformTranscriber:
    """Long-form speech transcriber built on a Hugging Face ASR pipeline.

    The input recording is normalised to a WAV file at ``sr`` Hz, cut into
    chunks by ``perform_vad``, transcribed chunk by chunk, and the per-chunk
    texts are handed to ``SyllableSegmentation`` together with their timings.

    Args:
        sr: Sampling rate (Hz) the audio is converted to. It is also the
            divisor applied to the VAD ``start``/``end`` values.
        model_path: Model identifier or path passed to ``transformers.pipeline``.
        chunk_length_s: ``chunk_length_s`` forwarded to the ASR pipeline.
        batch_size: Batch size used when running the pipeline over the chunks.
        language: Language passed to generation through ``generate_kwargs``.
        segment_duration: ``segment_duration`` forwarded to the segmenter.
    """

    def __init__(
        self,
        sr: int = 16000,
        model_path: str = "biodatlab/whisper-th-medium-combined",
        chunk_length_s: int = 30,
        batch_size: int = 4,
        language: str = "th",
        segment_duration: float = 4.0
    ):
        self.sr = sr
        self.model_path = model_path
        self.chunk_length_s = chunk_length_s
        self.batch_size = batch_size
        self.language = language
        self.segment_duration = segment_duration

        # Initialize ASR pipeline. Half precision is only requested on GPU:
        # float16 inference on CPU is frequently unsupported or very slow, so
        # the CPU fallback uses float32.
        use_cuda = torch.cuda.is_available()
        device = "cuda:0" if use_cuda else "cpu"
        self.pipe = pipeline(
            "automatic-speech-recognition",
            model=self.model_path,
            chunk_length_s=self.chunk_length_s,
            device=device,
            torch_dtype=torch.float16 if use_cuda else torch.float32,
        )

        self.ss = SyllableSegmentation()

    def convert_audio_to_wav(self, audio_file, target_sr):
        """Re-encode ``audio_file`` as a mono WAV at ``target_sr`` Hz.

        Args:
            audio_file: Path of the source audio (any format pydub can open).
            target_sr: Frame rate of the exported WAV.

        Returns:
            Path of the written file: the input path without its extension,
            suffixed with ``_converted.wav``.
        """
        audio = AudioSegment.from_file(audio_file)
        audio = audio.set_frame_rate(target_sr).set_channels(1)
        # splitext only strips a real file extension, so a dot in a directory
        # name (e.g. "./run.v1/audio") no longer truncates the path.
        output_wav_file = os.path.splitext(audio_file)[0] + "_converted.wav"
        audio.export(output_wav_file, format="wav")
        return output_wav_file

    def _ensure_wav(self, audio_path: str) -> str:
        """Return a path to a WAV file at ``self.sr`` Hz for ``audio_path``."""
        # Extension check is case-insensitive so "clip.WAV" is not re-encoded
        # needlessly.
        if audio_path.lower().endswith('.wav'):
            # Only the sampling rate is checked; a WAV already at the target
            # rate is used as-is.
            if AudioSegment.from_wav(audio_path).frame_rate == self.sr:
                return audio_path
        # Everything else (.mp4, .mp3, mismatched .wav, ...) goes through the
        # same pydub-based conversion. The class has no ``convert_mp4_to_wav``
        # method, so the previous ``self.convert_mp4_to_wav`` call could only
        # raise AttributeError.
        return self.convert_audio_to_wav(audio_path, self.sr)

    def transcribe(self, audio_path: str):
        """Transcribe a long audio/video file.

        Args:
            audio_path: Path to the input media file.

        Returns:
            The result of calling the ``SyllableSegmentation`` instance on the
            per-chunk start/end/prediction lists (its exact type is defined by
            ``sentence_segment.py``, not visible here).
        """
        wav_file = self._ensure_wav(audio_path)

        # NOTE(review): perform_vad presumably writes the chunk files into this
        # directory; each chunk dict is read for "fname", "start" and "end".
        _, chunklist = perform_vad(wav_file, 'temp_directory_for_chunks')

        # for faster inference, create dataset and feed to prediction pipeline
        chunk_files = [chunk["fname"] for chunk in chunklist]
        audio_dataset = Dataset.from_dict({"audio": chunk_files}).cast_column("audio", Audio())

        predictions = list(
            self.pipe(
                audio_dataset["audio"],
                generate_kwargs={"task": "transcribe", "language": self.language},
                return_timestamps=False,
                batch_size=self.batch_size,
            )
        )

        # start/end are divided by the sampling rate -- presumably sample
        # offsets being converted to seconds (confirm against perform_vad).
        vad_transcriptions = {
            "start": [chunk["start"] / self.sr for chunk in chunklist],
            "end": [chunk["end"] / self.sr for chunk in chunklist],
            "prediction": [pred["text"] for pred in predictions],
        }
        return self.ss(
            vad_transcriptions=vad_transcriptions,
            segment_duration=self.segment_duration,
        )

Create a `LongformTranscriber`. The cells that follow download a YouTube video's audio as `audio.mp3` and transcribe it with this object.

In [None]:
# Build the transcriber once; yt_transcribe below uses this module-level object.
transcriber_options = dict(
    sr=16000,
    model_path="biodatlab/whisper-th-medium-combined",
    chunk_length_s=30,
    batch_size=4,
    language="th",
    segment_duration=4.0,
)
transcriber = LongformTranscriber(**transcriber_options)

In [None]:
def _return_yt_html_embed(yt_url):
    video_id = yt_url.split("?v=")[-1]
    HTML_str = (
        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
        " </center>"
    )
    return HTML_str


def yt_transcribe(yt_url: str):
    """Download a YouTube video's audio, transcribe it, and build an embed snippet.

    Uses the module-level ``transcriber`` object and ``_return_yt_html_embed``.
    The audio is written to ``audio.mp3`` in the working directory and removed
    again before returning, whether or not processing succeeded.

    Args:
        yt_url: URL handed to yt-dlp.

    Returns:
        On success, ``(html_embed_str, transcripts)`` where ``transcripts`` is
        a ``pandas.DataFrame`` built from ``transcriber.transcribe``'s result.
        On any failure, a pair of strings:
        ``("Error: <message>", "An error occurred while processing the YouTube video.")``.
    """
    audio_file = "audio.mp3"
    ydl_opts = {
        'format': 'bestaudio/best',
        # Stay on the single video when the URL also carries a playlist id;
        # the fixed output template below has room for only one file.
        'noplaylist': True,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'outtmpl': 'audio.%(ext)s',
    }
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(yt_url, download=True)
            video_id = info['id']

        html_embed_str = _return_yt_html_embed(video_id)
        transcripts = pd.DataFrame(transcriber.transcribe(audio_file))  # Convert to DataFrame
        return html_embed_str, transcripts
    except Exception as e:
        # Deliberate best-effort: callers receive the error as text instead of
        # an exception.
        return f"Error: {str(e)}", "An error occurred while processing the YouTube video."
    finally:
        # Clean up the downloaded file on success and on failure alike, so a
        # failed run does not leave a stale audio.mp3 behind.
        try:
            os.remove(audio_file)
        except OSError:
            pass

In [None]:
# yt_transcribe reports failures as a pair of strings rather than raising, so
# check the result before treating it as a table. Otherwise the real error is
# masked by an unrelated DataFrame-constructor ValueError.
embed_or_error, transcripts = yt_transcribe("https://www.youtube.com/watch?v=TtBD1kkmRqw")
if not isinstance(transcripts, pd.DataFrame):
    raise RuntimeError(f"{transcripts} ({embed_or_error})")
transcripts.to_csv("content.csv", index=False)