In [1]:
import os

import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset

import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Print the kernel's current working directory; the relative paths used in
# later cells ("data/small", "world/...") resolve against it.
print(os.getcwd())

/mnt/beegfs/home/supriyatno/codes/transcribe_whisper


In [3]:
# Directory (relative to the working directory) holding the sample videos.
audio_dir = "data/small"

# List the .mp4 files. os.listdir() returns entries in arbitrary order, so the
# result is sorted: otherwise mp4_files[0] could name a different file from one
# run (or one filesystem) to the next.
mp4_files = sorted(f for f in os.listdir(audio_dir) if f.endswith(".mp4"))

In [4]:
# Pick the execution target once: first CUDA GPU in half precision when CUDA
# is available, otherwise CPU in single precision.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

# Checkpoint identifier shared by the model and its processor.
model_id = "openai/whisper-large-v3"

# Load the weights in the chosen dtype, then move the model to the target device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

# The processor supplies the tokenizer and feature extractor used by the pipeline cell.
processor = AutoProcessor.from_pretrained(model_id)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Shell out to nvidia-smi to show the GPU's current memory use and utilisation.
!nvidia-smi

Mon Jun 24 12:59:02 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.147.05   Driver Version: 525.147.05   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro RTX 5000     Off  | 00000000:3B:00.0 Off |                  Off |
| 33%   32C    P2    49W / 230W |   3791MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [6]:

# Assemble the speech-recognition pipeline from the already-loaded model and processor.
pipe = pipeline(
    "automatic-speech-recognition",
    # Model plus the two components taken from its processor.
    model=model,
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    # Placement and precision, reusing the values chosen when the model was loaded.
    device=device,
    torch_dtype=torch_dtype,
    # Long-form settings: 25-second chunks, in batches of 32.
    chunk_length_s=25,
    batch_size=32,
    # NOTE(review): presumably a cap on generated tokens per chunk; confirm 128
    # is enough for 25 s of dense speech.
    max_new_tokens=128,
)

In [11]:
# Time one end-to-end transcription of the first listed video.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() is wall-clock time and can jump if the system clock
# is adjusted while the cell runs.
start_time = time.perf_counter()

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    transcription = pipe(os.path.join(audio_dir, mp4_files[0]))

print(f"Time taken: {time.perf_counter() - start_time:.2f} seconds")

Time taken: 29.00 seconds


In [8]:
# Show the first file in the listing, i.e. the one passed to the pipeline.
mp4_files[0]

'v=_bhrRP5SElA.mp4'

In [9]:
# Path (relative to the working directory) of the CSV listing the snapshot's videos.
csv_file = "world/snapshot.20240606153519/video_list_eng_title.csv"

In [10]:
import pandas as pd

# Load the video list into a DataFrame.
# NOTE(review): the displayed columns include "Unnamed: 0.1" and "Unnamed: 0",
# which look like index columns written by earlier to_csv calls — confirm
# before dropping them.
file_df = pd.read_csv(csv_file)

In [13]:
# Preview the first rows to check the columns and their contents.
file_df.head()

Unnamed: 0.1,Unnamed: 0,institution_id,institution_name,institution_alias,institution_established,institution_country,institution_city,channel_id,channel_title,channel_description,channel_url,video_id,video_title,video_description,video_published_at,video_duration,video_view_count,video_url,video_duration_seconds,is_english
0,0,https://ror.org/01kpzv902,Flinders University,Flinders University of South Australia,1966.0,Australia,Adelaide,UCykVMY_tZbXuZfCQTEYyvPg,Flinders University,Flinders is a leading international university...,https://youtube.com/channel/UCykVMY_tZbXuZfCQT...,KxQvUMYep4Y,Diploma in Sport Business with Adelaide United...,The Diploma in Sport Business is a one-year co...,5 months ago,23:08,73 views,https://www.youtube.com/watch?v=KxQvUMYep4Y,1388.0,True
1,1,https://ror.org/01kpzv902,Flinders University,Flinders University of South Australia,1966.0,Australia,Adelaide,UCykVMY_tZbXuZfCQTEYyvPg,Flinders University,Flinders is a leading international university...,https://youtube.com/channel/UCykVMY_tZbXuZfCQT...,4tNjwmxpuLM,2023 Chalmers Oration,The College of Medicine and Public Health host...,8 months ago,1:33:26,216 views,https://www.youtube.com/watch?v=4tNjwmxpuLM,5606.0,True
2,2,https://ror.org/01kpzv902,Flinders University,Flinders University of South Australia,1966.0,Australia,Adelaide,UCykVMY_tZbXuZfCQTEYyvPg,Flinders University,Flinders is a leading international university...,https://youtube.com/channel/UCykVMY_tZbXuZfCQT...,W-VhxLXIP2U,Fearless Conversations: Voice to Parliament,The Voice to Parliament Lecture focuses on the...,9 months ago,1:18:17,510 views,https://www.youtube.com/watch?v=W-VhxLXIP2U,4697.0,True
3,3,https://ror.org/01kpzv902,Flinders University,Flinders University of South Australia,1966.0,Australia,Adelaide,UCykVMY_tZbXuZfCQTEYyvPg,Flinders University,Flinders is a leading international university...,https://youtube.com/channel/UCykVMY_tZbXuZfCQT...,qGtXc8LHgVQ,Diploma in Sports Management - Port Adelaide A...,Hear about what it is like to study a Diploma ...,1 year ago,42:36,375 views,https://www.youtube.com/watch?v=qGtXc8LHgVQ,2556.0,True
4,4,https://ror.org/01kpzv902,Flinders University,Flinders University of South Australia,1966.0,Australia,Adelaide,UCykVMY_tZbXuZfCQTEYyvPg,Flinders University,Flinders is a leading international university...,https://youtube.com/channel/UCykVMY_tZbXuZfCQT...,3xqILHLwZxs,Fearless Conversations | Authenticity & Identi...,Watch our very special final Fearless Conversa...,1 year ago,51:08,114 views,https://www.youtube.com/watch?v=3xqILHLwZxs,3068.0,True


In [14]:
# Draw a 100-row sample of the video list and save it next to the source CSV.
# The fixed seed makes the sample reproducible: without random_state, every
# re-run of this cell would overwrite sample_100.csv with different rows.
SAMPLE_SIZE = 100
SAMPLE_SEED = 42

file_df.sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED).to_csv(
    "world/snapshot.20240606153519/sample_100.csv", index=False
)

In [1]:
from s3utils import get_list_of_files_s3

In [2]:
# Fetch the listing for the snapshot's audio_files prefix via the project helper.
# NOTE(review): later cells index the result as files[0] and files[1]; presumably
# these are parallel sequences (e.g. keys and sizes) — confirm against s3utils.
files = get_list_of_files_s3("world/snapshot.20240606153519/audio_files")

In [4]:
# Number of entries in the first element of the listing.
len(files[0])

190851

In [8]:
# Second entry of the listing's second element; presumably a size in bytes — TODO confirm.
files[1][1]

102141374