<a href="https://colab.research.google.com/github/acrching/WhisperX/blob/main/Whisper_streaming%20%2B%20basic%20post%20processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install faster-whisper
!pip install streamlink
!pip install ffmpeg-python

In [None]:
import os
import numpy as np
import queue
import threading
import ffmpeg
import streamlink
from faster_whisper import WhisperModel

# Target capacity of the audio hand-off buffer, expressed in seconds of audio.
# WhisperOnline multiplies this by the 16 kHz sample rate when bounding its queue.
AUDIO_BUFFER_SIZE = 30

class WhisperOnline:
    """Transcribe the audio of a live HLS stream with faster-whisper.

    One thread (`download_audio`) decodes the stream with ffmpeg into 16 kHz
    mono float32 sample chunks and pushes them onto `audio_buffer`; a second
    thread (`transcribe_audio`) gathers those chunks into fixed-length windows,
    transcribes each window and prints the cleaned-up text.

    Attributes:
        audio_buffer: bounded `queue.Queue` of float32 sample arrays; a final
            `_END_OF_STREAM` item marks the end of the stream.
        model: the `WhisperModel` used for transcription.
        hls_url: URL handed to streamlink to resolve the stream.
        context_words: most recent transcribed words, passed to the model as
            `initial_prompt` for the next window.
    """

    SAMPLE_RATE = 16000       # Hz; rate ffmpeg is asked to resample to
    WINDOW_SECONDS = 10       # seconds of audio per transcription call
    MAX_CONTEXT_WORDS = 200   # words of history kept for the prompt
    _READ_SIZE = 4096         # bytes per pipe read (2 bytes per int16 sample)
    _END_OF_STREAM = None     # sentinel queued once the download finishes

    def __init__(self, model_size, hls_url):
        """Load the model and create the (bounded) audio hand-off queue.

        Args:
            model_size: model name/size passed to `WhisperModel`.
            hls_url: stream URL passed to `streamlink.streams`.
        """
        # The queue stores chunks, not single samples, so the bound has to be
        # expressed in chunks to really correspond to AUDIO_BUFFER_SIZE seconds.
        samples_per_read = self._READ_SIZE // 2
        max_chunks = max(1, AUDIO_BUFFER_SIZE * self.SAMPLE_RATE // samples_per_read)
        self.audio_buffer = queue.Queue(maxsize=max_chunks)
        self.model = WhisperModel(model_size, device="cuda", compute_type="float16")
        self.hls_url = hls_url
        self.context_words = []

    def download_audio(self):
        """Decode the stream with ffmpeg and enqueue float32 sample chunks.

        Always enqueues `_END_OF_STREAM` before returning (also on error), so
        that `transcribe_audio` can terminate instead of waiting forever.

        Raises:
            KeyError: if streamlink finds no 'best' stream for `hls_url`.
        """
        process = None
        try:
            streams = streamlink.streams(self.hls_url)
            stream_url = streams['best'].url

            process = (
                ffmpeg
                .input(stream_url)
                # Raw PCM rather than WAV: a WAV header would otherwise be
                # decoded as if it were audio samples.
                .output('pipe:', format='s16le', acodec='pcm_s16le', ac=1, ar=self.SAMPLE_RATE)
                .global_args('-loglevel', 'error')
                # stderr is deliberately NOT piped: nothing reads it here, and
                # an unread pipe eventually fills up and blocks ffmpeg.
                .run_async(pipe_stdout=True)
            )

            leftover = b''
            while True:
                in_bytes = process.stdout.read(self._READ_SIZE)
                if not in_bytes:
                    break
                data = leftover + in_bytes
                # int16 samples are 2 bytes; carry an odd trailing byte over to
                # the next read instead of letting np.frombuffer raise.
                usable = len(data) - (len(data) % 2)
                leftover = data[usable:]
                if usable:
                    samples = np.frombuffer(data[:usable], np.int16).astype(np.float32) / 32768.0
                    self.audio_buffer.put(samples)
        finally:
            if process is not None:
                if process.poll() is None:
                    process.terminate()
                process.stdout.close()
                process.wait()
            self.audio_buffer.put(self._END_OF_STREAM)

    def transcribe_audio(self):
        """Consume queued audio in windows and print each transcription.

        Blocks on the queue (no busy-waiting) until `WINDOW_SECONDS` of audio
        has been collected, transcribes it, and repeats. Returns after the
        `_END_OF_STREAM` sentinel is received; audio collected before the
        sentinel is still transcribed.
        """
        window_samples = self.SAMPLE_RATE * self.WINDOW_SECONDS
        stream_open = True
        while stream_open:
            pieces = []
            collected = 0
            while collected < window_samples:
                piece = self.audio_buffer.get()
                if piece is self._END_OF_STREAM:
                    stream_open = False
                    break
                pieces.append(piece)
                collected += len(piece)
            if pieces:
                self._transcribe_window(np.concatenate(pieces))

    def _transcribe_window(self, audio):
        """Transcribe one window of samples, update the context, print the text."""
        # Transcribe with context
        context = ' '.join(self.context_words) if self.context_words else None
        segments, _ = self.model.transcribe(audio, initial_prompt=context, beam_size=5)

        new_transcription = []
        for segment in segments:
            new_transcription.append(segment.text)
            self.context_words.extend(segment.text.split())

        # Keep only the most recent words in context
        self.context_words = self.context_words[-self.MAX_CONTEXT_WORDS:]

        # Print the new transcription segment
        new_text = ' '.join(new_transcription)
        print(self.post_process_transcription(new_text))

    def post_process_transcription(self, text):
        """Return `text` with whitespace normalised and punctuation re-attached.

        Runs of whitespace are collapsed to a single space and leading/trailing
        whitespace is removed; a space directly before ',', '.', '?' or '!' is
        dropped.
        """
        # Collapse whitespace first so that "word  ," and triple spaces are
        # handled too (a single replace('  ', ' ') pass leaves both behind).
        text = ' '.join(text.split())
        for mark in ',.?!':
            text = text.replace(' ' + mark, mark)
        return text

    def run(self):
        """Start the download and transcription threads and wait for both."""
        download_thread = threading.Thread(target=self.download_audio)
        transcribe_thread = threading.Thread(target=self.transcribe_audio)

        download_thread.start()
        transcribe_thread.start()

        download_thread.join()
        transcribe_thread.join()

if __name__ == "__main__":
    # Run configuration: the stream to follow and the faster-whisper model to load.
    hls_url = 'YOUR_HLS_STREAM_URL'  # Placeholder; substitute a real HLS stream URL
    model_size = 'large-v2'

    # Build the pipeline, then start it; run() joins its worker threads before returning.
    whisper_online = WhisperOnline(model_size=model_size, hls_url=hls_url)
    whisper_online.run()