## **Setup Enviroment**

In [1]:
# @title Install requirements

!export LC_ALL=C.UTF-8
!export LANG=C.UTF-8

try:
    import tensorflow  # required in Colab to avoid protobuf compatibility issues
except ImportError:
    pass

!pip install transformers
!pip install datasets
!pip install huggingface_hub

# install whisper requirements
!apt update && apt install -y ffmpeg
!pip install git+https://github.com/openai/whisper.git

# install evaluation requirements
!pip install evaluate
!pip install jiwer # needed by evaluate

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.16.1 dill-0.3.7 multiprocess-0.70.15
Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 http://security.ubuntu.com/ubuntu

Unlock exclusive datasets on Hugging Face Hub! Find your Hub authentication token [here](https://huggingface.co/settings/tokens)

In [5]:
# @title access to huggingface hub

from huggingface_hub import login
from google.colab import userdata

login(token=userdata.get('HuggingFace'), add_to_git_credential=True)

Token is valid (permission: read).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [6]:
# @title CleanUP memory

import torch
import gc

def free_memory(model=None):
    if model is not None:
        del model
        print("Model deleted.")
    torch.cuda.empty_cache()  # Empty the CUDA cache
    print("CUDA cache emptied.")
    gc.collect()  # Run the garbage collector
    # This is a trick to prompt Python to free up memory to the OS
    _ = gc.collect()
    print("Garbage collector has run.")

# **Using transformers**

In [7]:
# @title load model and processor
from transformers import WhisperProcessor, WhisperForConditionalGeneration
model_name = "openai/whisper-small"

processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/805 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.84k [00:00<?, ?B/s]

Use flerus dataset

In [8]:
# @title load the dataset
from datasets import load_dataset
from datasets import Audio

en_dataset = load_dataset("google/fleurs", "en_us", split="test", streaming=True, trust_remote_code=True)
fr_dataset = load_dataset("google/fleurs", "fr_fr", split="test", streaming=True, trust_remote_code=True)

def generate_audio_samples(dataset, num_samples=20):
    dataset = dataset.take(num_samples)
    dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000))
    for item in dataset:
        yield {**item["audio"], "reference": item["transcription"]}

Downloading builder script:   0%|          | 0.00/12.6k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/13.3k [00:00<?, ?B/s]

In [9]:
# @title English to English

model.config.forced_decoder_ids = None

for sample in generate_audio_samples(en_dataset, 1):
    input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features

    # generate token ids
    predicted_ids = model.generate(input_features)
    # decode token ids to text
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

    print(f'Refrence text: {sample["reference"]}')
    print(f'Transcription text: {transcription}')

Refrence text: however due to the slow communication channels styles in the west could lag behind by 25 to 30 year
Transcription text: [' However, due to the slow communication channels, styles in the West could lag behind by 25 to 30 years.']


In [11]:
# @title French to English

forced_decoder_ids = processor.get_decoder_prompt_ids(language="french", task="translate")

for sample in generate_audio_samples(fr_dataset, 1):
    input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features

    # generate token ids
    predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
    # decode token ids to text
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

    print(f'Refrence text: {sample["reference"]}')
    print(f'Transcription text: {transcription}')

Refrence text: l'accident a eu lieu en terrain montagneux et il semblerait que cela ait été causé par un incendie malveillant
Transcription text: [' The accident happened in a mountain area and it seems that it was caused by a burning fire.']


In [12]:
# @title Long-Form Transcription
# @markdown The Whisper model is intrinsically designed to work on audio samples of up to 30s in duration. However, by using a chunking algorithm, it can be used to transcribe audio samples of up to arbitrary length. This is possible through Transformers pipeline method. Chunking is enabled by setting chunk_length_s=30 when instantiating the pipeline. With chunking enabled, the pipeline can be run with batched inference. It can also be extended to predict sequence level timestamps by passing return_timestamps=True:

import torch
from transformers import pipeline
from datasets import load_dataset

device = "cuda:0" if torch.cuda.is_available() else "cpu"

pipe = pipeline(
    "automatic-speech-recognition",
    model=model_name,
    chunk_length_s=30,
    device=device,
)

for sample in generate_audio_samples(en_dataset, 1):
    prediction = pipe(sample.copy(), batch_size=8)["text"]

    # we can also return timestamps for the predictions
    prediction = pipe(sample.copy(), batch_size=8, return_timestamps=True)["chunks"]

    print(prediction)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[{'timestamp': (0.0, 10.0), 'text': ' However, due to the slow communication channels, styles in the West could lag behind by 25 to 30 years.'}]


In [13]:
# @title CleanUP
free_memory(pipe)
free_memory(model)
free_memory(processor)
free_memory(en_dataset)
free_memory(fr_dataset)

Model deleted.
CUDA cache emptied.
Garbage collector has run.
Model deleted.
CUDA cache emptied.
Garbage collector has run.
Model deleted.
CUDA cache emptied.
Garbage collector has run.
Model deleted.
CUDA cache emptied.
Garbage collector has run.
Model deleted.
CUDA cache emptied.
Garbage collector has run.


# **Evaluation**

We have two challenges when evaluating the model with audio data:
- the large volume of data
- the limitations of Colab's resources.

To solve this problem, streaming mode allows us to load and prepare samples on-demand, iterating over the dataset without needing the entire file downloaded. This efficient approach means we only have the data when needed, freeing up resources and speeding up our workflow.


Used dataset:

- [fleurs](https://huggingface.co/datasets/google/fleurs)

- [librispeech_asr](https://huggingface.co/datasets/librispeech_asr)

- [common_voice_11_0](https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0)

In [14]:
# @title evaluate the model

from datasets import load_dataset
from whisper.normalizers import EnglishTextNormalizer
from transformers import pipeline
from datasets import Audio
import evaluate
import pandas as pd

# Load datasets
dataset_names = ["google/fleurs", "librispeech_asr", "mozilla-foundation/common_voice_11_0"]
dataset_splits = ["en_us", "clean", "en"]

# Load each dataset with the corresponding split and store them in a dictionary
# To demonstrate multi-dataset evaluation with streaming mode, we only evaluate the first 20 samples for each dataset.
# To run on the entire dataset, comment out or remove `num_sample`.
esb_datasets = {name: load_dataset(name, split, split="test", streaming=True, trust_remote_code=True) for name, split in zip(dataset_names, dataset_splits)}
num_sample = 20

# Initialize Whisper normalizer
# Text Normalization essential for reducing WER as mentioned in the Whisper [paper](https://cdn.openai.com/papers/whisper.pdf)
whisper_normalizer = EnglishTextNormalizer()

# Define helper functions
def get_transcript(sample):
    # Return the transcript from the sample, which could be under different keys
    return sample.get("text") or sample.get("sentence") or sample.get("transcription")

def normalizer(batch):
    # Normalize the transcript of each batch
    batch['norm_transcript'] = whisper_normalizer(get_transcript(batch))
    return batch

def is_target_text_in_range(ref):
    # Check if the reference text is not in the list of ignored sequences
    return ref.strip() not in ["ignore time segment in scoring", ""]

def data(dataset):
    # Yield each item in the dataset with its audio and normalized transcript
    for item in dataset:
        yield {**item["audio"], "reference": item["norm_transcript"]}

# Define the metrics and models to evaluate
metrics_to_evaluate = ["wer", "cer"]
models_to_evaluate = ["base.en", "small.en", "medium.en"]

# Initialize a dictionary to store the results
results = {f"{model}_{metric}": [] for model in models_to_evaluate for metric in metrics_to_evaluate}

# Define the batch size and sampling rate to take from each dataset
batch_size = 16
sampling_rate = 16_000

# Loop over each model
for model in models_to_evaluate:
    # Initialize the ASR pipeline with the current model
    whisper_asr = pipeline("automatic-speech-recognition", model=f"openai/whisper-{model}", device=0)

    # Loop over each dataset
    for dataset_name, dataset in esb_datasets.items():
        # Take a subset of the dataset
        if num_sample is not None:
            dataset = dataset.take(num_sample)
        # Cast the audio column to the desired sampling rate
        dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
        # Normalize the transcripts
        dataset = dataset.map(normalizer)
        predictions, references = [], []

        # Run the ASR pipeline on the dataset and store the predictions and references
        for out in whisper_asr(data(dataset), batch_size=batch_size):
            predictions.append(whisper_normalizer(out["text"]))
            references.append(out["reference"][0])

        # Compute each metric for the current model and dataset
        for metric in metrics_to_evaluate:
            compute_metric = evaluate.load(metric)
            result = compute_metric.compute(references=references, predictions=predictions)
            results[f"{model}_{metric}"].append(round(100 * result, 2))

Downloading builder script:   0%|          | 0.00/11.5k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/8.13k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/14.4k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/60.9k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.94k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/290M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/805 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.41M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.83k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.60k [00:00<?, ?B/s]

Reading metadata...: 16354it [00:01, 13945.61it/s]


config.json:   0%|          | 0.00/1.94k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/805 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.41M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.83k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

Reading metadata...: 16354it [00:00, 36538.26it/s]


config.json:   0%|          | 0.00/1.95k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/805 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.41M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.83k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

Reading metadata...: 16354it [00:00, 36993.58it/s]


In [15]:
# @title Display results

formatted_results = {}
for model in models_to_evaluate:
    for metric in metrics_to_evaluate:
        formatted_results[(model, metric.upper())] = results[f"{model}_{metric}"]

df = pd.DataFrame(formatted_results, index=esb_datasets.keys())
df.index.name = "Dataset"
df.columns.names = [None, None]
df

Unnamed: 0_level_0,base.en,base.en,small.en,small.en,medium.en,medium.en
Unnamed: 0_level_1,WER,CER,WER,CER,WER,CER
Dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
google/fleurs,7.03,3.02,5.67,2.94,3.63,1.49
librispeech_asr,3.39,0.96,2.93,0.67,2.93,0.71
mozilla-foundation/common_voice_11_0,32.78,16.41,20.0,10.22,13.89,6.4


In [16]:
# @title CleanUP
free_memory(whisper_asr)

Model deleted.
CUDA cache emptied.
Garbage collector has run.


# **Use whisper wrapper to generate subtitles**

In [17]:
import os
import subprocess
from urllib.parse import urlparse, unquote

# Repository URL
repo_url = "https://github.com/abdalrohman/Video-Transcriber"

# Extract repository name from URL
repo_name = unquote(urlparse(repo_url).path.split('/')[-1])

# Clone the repository if it doesn't exist
if not os.path.isdir(repo_name):
    subprocess.run(["git", "clone", repo_url])
    print(f"Cloned the repository: {repo_url}")
else:
    print(f"The repository {repo_name} is already cloned.")

# Change to the repository directory if not already in it
if os.getcwd().split('/')[-1] != repo_name:
    os.chdir(repo_name)
    print(f"Changed current dir to {os.getcwd()}")
else:
    print(f"You are already in the repository directory: {repo_name}")

Cloned the repository: https://github.com/abdalrohman/Video-Transcriber
Changed current dir to /content/Video-Transcriber


In [21]:
!export LC_ALL=UTF-8
!export LANG=UTF-8



In [22]:
!pip install -q -r ./requirements.txt

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m53.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m670.2/670.2 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.4/194.4 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m88.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [23]:
!python ./video_transcriber.py -h

error: XDG_RUNTIME_DIR not set in the environment.
ALSA lib confmisc.c:855:(parse_card) cannot find card '0'
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_card_inum returned error: No such file or directory
ALSA lib confmisc.c:422:(snd_func_concat) error evaluating strings
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_concat returned error: No such file or directory
ALSA lib confmisc.c:1334:(snd_func_refer) error evaluating name
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_refer returned error: No such file or directory
ALSA lib conf.c:5701:(snd_config_expand) Evaluate error: No such file or directory
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM default
ALSA lib confmisc.c:855:(parse_card) cannot find card '0'
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_card_inum returned error: No such file or directory
ALSA lib confmisc.c:422:(snd_func_concat) error evaluating strings
ALSA lib conf.c:5178:(_snd_config_evalu

In [24]:
!python ./video_transcriber.py --video_file "https://www.youtube.com/watch?v=OBNNoEpietw" --model "medium.en" --task "transcribe" --output_format all --font_type "Roboto" --font_color "red"

error: XDG_RUNTIME_DIR not set in the environment.
ALSA lib confmisc.c:855:(parse_card) cannot find card '0'
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_card_inum returned error: No such file or directory
ALSA lib confmisc.c:422:(snd_func_concat) error evaluating strings
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_concat returned error: No such file or directory
ALSA lib confmisc.c:1334:(snd_func_refer) error evaluating name
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_refer returned error: No such file or directory
ALSA lib conf.c:5701:(snd_config_expand) Evaluate error: No such file or directory
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM default
ALSA lib confmisc.c:855:(parse_card) cannot find card '0'
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_card_inum returned error: No such file or directory
ALSA lib confmisc.c:422:(snd_func_concat) error evaluating strings
ALSA lib conf.c:5178:(_snd_config_evalu