# Download and Transcribe Playlist

## Set up Environment

In [None]:
#@title Specify Requirements
%%writefile requirements.txt
accelerate
camel_converter
optimum
polars >=0.19
pandas>=2.1.3
pytube
transformers

Writing requirements.txt


In [None]:
#@title Install Requirements
# Install/upgrade (-U) every package listed in requirements.txt (-r).
# Both stdout and stderr are redirected (&>) to /content/logs.txt, so this
# cell prints nothing; check that log file if a later import fails.
!pip install -Ur requirements.txt &>/content/logs.txt

In [None]:
#@title Import Modules
from pytube import Playlist, YouTube
from camel_converter import to_snake
from tqdm.notebook import tqdm
from rich.pretty import pprint
import pandas as pd
import os

In [None]:
#@title Load Pipeline
import torch
from transformers import pipeline
from transformers.utils import is_flash_attn_2_available

# Checkpoint to load from the Hugging Face Hub. Whisper checkpoints are listed at
# https://huggingface.co/openai/whisper-large-v3#model-details
model_id = "distil-whisper/large-v2"

# Pick the best available accelerator instead of assuming a CUDA GPU, so the
# cell also runs on Apple-silicon (mps) and CPU-only runtimes.
if torch.cuda.is_available():
    device = "cuda:0"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Half precision is kept for accelerators; on CPU fall back to float32.
torch_dtype = torch.float32 if device == "cpu" else torch.float16

pipe = pipeline(
    "automatic-speech-recognition",
    model=model_id,
    torch_dtype=torch_dtype,
    device=device,
    # Flash Attention 2 is only requested when the helper reports it as available.
    model_kwargs={"use_flash_attention_2": is_flash_attn_2_available()},
)
pprint(pipe.model)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
#@title Define Utility Functions
def download_audio(video_url):
  """Download the audio track of a YouTube video and return its videoDetails.

  The audio-only mp4 stream that sorts first by descending "abr" is saved as
  ``videos/<video_id>.mp4`` relative to the current working directory. The
  ``videos`` directory is created when it does not exist yet.

  Args:
    video_url: URL of the YouTube video to download.

  Returns:
    The ``videoDetails`` entry of the video's ``vid_info``.

  Raises:
    ValueError: If the video has no audio-only mp4 stream.
  """
  yt = YouTube(video_url)
  audio_stream = (yt.streams
    .filter(only_audio=True, file_extension = "mp4")
    .order_by("abr")
    .desc()
    .first())
  # first() gives None when the filter matched nothing; fail with a clear
  # message instead of an AttributeError on None.
  if audio_stream is None:
    raise ValueError(f"No audio-only mp4 stream found for {video_url}")
  # Do not rely on another cell having created the target directory.
  os.makedirs("videos", exist_ok=True)
  audio_stream.download(filename = f"videos/{yt.video_id}.mp4")
  return yt.vid_info['videoDetails']

def transcribe_audio(video_id, chunk_length_s=30, batch_size=24, return_timestamps=True):
  """Transcribe the downloaded audio of a video with the global ``pipe``.

  Args:
    video_id: Id of the video; the audio is read from ``videos/<video_id>.mp4``.
    chunk_length_s: Forwarded to the pipeline as ``chunk_length_s``.
    batch_size: Forwarded to the pipeline as ``batch_size``; lower it if the
      default of 24 does not fit in memory.
    return_timestamps: Forwarded to the pipeline as ``return_timestamps``.

  Returns:
    Whatever the pipeline call returns, unchanged.
  """
  return pipe(
      f"videos/{video_id}.mp4",
      chunk_length_s=chunk_length_s,
      batch_size=batch_size,
      return_timestamps=return_timestamps,
  )

## Download and Transcribe

In [None]:
#@title List Videos in Playlist
import os
# Create the folder that the download/transcribe helpers read and write.
os.makedirs("videos", exist_ok=True)
playlist_url = "https://www.youtube.com/playlist?list=PL8qcvQ7Byc3OJ02hbWJbHWePh4XEg3cvo"
# pytube Playlist object; it is iterated and indexed below to obtain video URLs.
video_urls = Playlist(playlist_url)
# Preview the URLs, abbreviating the printed list after three entries.
pprint(list(video_urls), max_length=3)

In [None]:
#@title Download Audio
# Download the audio of the first playlist entry only.
# NOTE(review): the notebook title refers to the whole playlist; presumably a
# loop over video_urls is intended eventually — confirm.
video_info = download_audio(video_urls[0])
pprint(video_info)

In [None]:
#@title Transcribe Audio
# Transcribe the file saved by the download step; 'videoId' is read from the
# details dict returned by download_audio.
transcript = transcribe_audio(video_info['videoId'])
# Show only the first two entries of the transcript's 'chunks' list.
pprint(transcript['chunks'], max_length=2)

In [None]:
#@title Save Info
from pathlib import Path
import json
import os
# Combine the video metadata and the transcript chunks into a single record.
episode_info = {**video_info, "transcript": transcript["chunks"]}

# Write the record to data/<videoId>.json, creating the folder when needed.
Path('data').mkdir(parents=True, exist_ok=True)
episode_path = Path(f"data/{video_info['videoId']}.json")
episode_path.write_text(json.dumps(episode_info), encoding="UTF-8")