<a href="https://colab.research.google.com/github/c0ffeyy/Proyecto-II/blob/master/whisperx_web_ui_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Fetch the Kit-Whisperx sources and switch into the checkout; presumably this
# is what makes the `src.transcription_utils` import further down resolvable.
!git clone https://github.com/rgcodeai/Kit-Whisperx.git
%cd Kit-Whisperx

# Install the Python packages this cell relies on.
# NOTE(review): the PyPI distribution named `ffmpeg` is presumably not the
# FFmpeg binary itself -- confirm whether it is actually required here.
!pip install gradio ffmpeg whisperx

import gradio as gr
import torch
import time
import os
from src.transcription_utils import transcribe, language_options, model_options, ModelManager

class TranscriptionApp:
    """Gradio front end around ``transcribe`` from ``src.transcription_utils``.

    Attributes:
        model_manager: ``ModelManager`` instance handed to every ``transcribe`` call.
        default_device: ``"cuda"`` when CUDA is available, otherwise ``"cpu"``.
        default_model: ``"Large-v2"`` when CUDA is available, otherwise ``"Medium"``.
        app: The ``gr.Blocks`` container the UI is built into.
        outputs: Maps a format name (``'TXT'``, ``'SRT'``, ``'JSON'``, ``'VTT'``)
            to the file produced by the most recent successful transcription.
        last_transcription_time: Duration in seconds (one decimal) of the last
            ``transcribe`` call that completed without raising ``ValueError``.
    """

    def __init__(self):
        """
        Initializes an instance with a ModelManager for managing AI models,
        sets default device and model based on CUDA availability,
        and prepares a Gradio app and outputs dictionary for UI interactions and storing results.
        """
        self.model_manager = ModelManager()
        # Query CUDA once so the device and model defaults cannot disagree.
        cuda_available = torch.cuda.is_available()
        self.default_device = "cuda" if cuda_available else "cpu"
        self.default_model = "Large-v2" if cuda_available else "Medium"
        self.app = gr.Blocks()
        self.outputs = {}
        self.last_transcription_time = 0

        # Create the Temp folder if it does not exist. ``exist_ok=True`` avoids
        # the check-then-create race of a separate ``os.path.exists`` test.
        os.makedirs('Temp', exist_ok=True)

    def start_transcription(self, file, device, language, model):
        """Start transcription process.

        Args:
            file: Path of the uploaded media file, or ``None``/empty when nothing
                was uploaded.
            device: Device name selected in the UI (``"cuda"`` or ``"cpu"``).
            language: Key of ``language_options`` selected in the UI.
            model: Key of ``model_options`` selected in the UI.

        Returns:
            A ``(text, seconds)`` tuple: the TXT transcription and the elapsed
            time on success, or a human-readable message otherwise. The time is
            ``0`` when no file was given or ``transcribe`` raised ``ValueError``.
        """
        # Forget the previous run up front so that a failed or empty run cannot
        # leave the format selector and download button serving stale files.
        self.outputs = {}

        if not file:
            return "No file uploaded. Please upload a file first.", 0

        start_time = time.time()
        try:
            results = transcribe(file, device, language, model, self.model_manager)
        except ValueError as e:
            return str(e), 0

        self.last_transcription_time = round(time.time() - start_time, 1)

        if not results:
            return "No transcription available.", self.last_transcription_time

        # NOTE(review): ``json_output`` is treated as a file path below, like the
        # other three values -- confirm against ``transcribe``.
        json_output, txt_path, vtt_path, srt_path = results
        self.outputs = {
            'TXT': txt_path,
            'SRT': srt_path,
            'JSON': json_output,
            'VTT': vtt_path,
        }
        return self.update_output_text('TXT'), self.last_transcription_time

    def update_output_text(self, format_choice):
        """Update the text area based on the format choice.

        Args:
            format_choice: One of the keys of ``self.outputs``, or a falsy value
                when no format is selected.

        Returns:
            The UTF-8 decoded content of the file registered for
            ``format_choice``, or a short message when there is no such entry
            or the file no longer exists.
        """
        file_path = self.outputs.get(format_choice) if format_choice else None
        if not file_path:
            return "No file available or format not selected."

        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except FileNotFoundError:
            return "File not found."

    # User interface for the transcription kit using Gradio
    def setup_ui(self):
        """Build the widgets inside ``self.app`` and wire up their events."""
        with self.app:
            gr.Markdown("# Kit Transcriptor Whisperx")
            gr.Markdown("❤️ Follow us on [YouTube](https://www.youtube.com/channel/UC_YzjCh-CSSCSGANvt5wBNQ?sub_confirmation=1), [GitHub](https://github.com/rgcodeai) 🌐 More on [Mister Contenidos](https://mistercontenidos.com)")
            with gr.Row():
                # Left column: input file and transcription settings.
                with gr.Column():
                    gr.Markdown("### Supported Formats: Audio (mp3, wav) and Video (mp4, avi, mov, flv)")
                    file_input = gr.File(label="Upload your multimedia file", type="filepath")
                    device_dropdown = gr.Dropdown(label="Select device", choices=["cuda", "cpu"], value=self.default_device)
                    model_dropdown = gr.Dropdown(label="Select model", choices=list(model_options.keys()), value=self.default_model)
                    language_dropdown = gr.Dropdown(label="Select language", choices=list(language_options.keys()), value="Identify")
                    transcribe_button = gr.Button("Start Transcription")

                # Right column: timing, format selection, preview and download.
                with gr.Column():
                    transcription_time_display = gr.Textbox(label="Last Transcription Time (seconds)", interactive=False, lines=1)
                    format_choice = gr.Radio(['TXT', 'SRT', 'VTT', 'JSON'], label="Select format to view:", value='TXT')
                    output_text = gr.Textbox(label="File Content", interactive=False, lines=10)
                    download_button = gr.Button("Download Transcription")
                    format_choice.change(fn=self.update_output_text, inputs=format_choice, outputs=output_text, queue=True)
                    # Keep the lambda: ``self.outputs`` is rebound on every run, so
                    # the lookup must happen at click time, not at setup time.
                    download_button.click(fn=lambda x: self.outputs.get(x), inputs=format_choice, outputs=gr.File())

            transcribe_button.click(fn=self.start_transcription, inputs=[file_input, device_dropdown, language_dropdown, model_dropdown], outputs=[output_text, transcription_time_display])

    def launch(self):
        """Launch the transcription application."""
        self.setup_ui()
        # share=True asks Gradio for a public share link in addition to serving locally.
        self.app.launch(share=True)


# Entry point: build the application object and serve its Gradio interface.
if __name__ == "__main__":
    app = TranscriptionApp()
    app.launch()

Cloning into 'Kit-Whisperx'...
remote: Enumerating objects: 22, done.[K
remote: Counting objects: 100% (22/22), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 22 (delta 7), reused 8 (delta 1), pack-reused 0[K
Receiving objects: 100% (22/22), 10.64 KiB | 10.64 MiB/s, done.
Resolving deltas: 100% (7/7), done.
/content/Kit-Whisperx
Collecting gradio
  Downloading gradio-4.41.0-py3-none-any.whl.metadata (15 kB)
Collecting ffmpeg
  Downloading ffmpeg-1.4.tar.gz (5.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting whisperx
  Downloading whisperx-3.1.5-py3-none-any.whl.metadata (13 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.112.0-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.3.0 (from gradio)
  Downloading grad

  torchaudio.set_audio_backend("soundfile")


Attempting to load model: Medium on device: cuda


config.json:   0%|          | 0.00/2.26k [00:00<?, ?B/s]

model.bin:   0%|          | 0.00/1.53G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]

vocabulary.txt:   0%|          | 0.00/460k [00:00<?, ?B/s]

No language specified, language will be first be detected for each audio file (increases inference time).


100%|█████████████████████████████████████| 16.9M/16.9M [00:01<00:00, 9.97MiB/s]
INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../root/.cache/torch/whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.3.1+cu121. Bad things might happen unless you revert torch to 1.x.
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://995bd200e57350cf45.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)
