# Download and Transcribe Playlist

In [1]:
# !pip install pytube rich tqdm camel-converter

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
#Import Modules
from data_ingest import DataIngest
from tqdm.notebook import tqdm
from rich.pretty import pprint
import pandas as pd
import os
import json

### Create Playlist
---

In [10]:
# URL of the YouTube playlist to ingest.
huberman = "https://www.youtube.com/playlist?list=PLPNW_gerXa4Pc8S2qoUQc5e8Ir97RLuVW"
# Build the ingest helper from that URL; the rest of the notebook reuses it.
ingest = DataIngest(huberman)
# `playlist` is sliced and measured with len() below, so it is a sequence;
# presumably one entry per video — confirm against DataIngest.get_playlist.
playlist = ingest.get_playlist()

In [11]:
# Display the `show_url` attribute of the ingest helper (the recorded output
# is a YouTube channel URL).
ingest.show_url

'https://www.youtube.com/channel/UC2D2CMWXMOVWx7giW1n3LIg'

In [12]:
# Pretty-print the first five playlist entries, then show the total count.
pprint(playlist[:5])
len(playlist)

193

### Download Audio and Get Metadata Dictionary
---

In [13]:
%%time
# Call download_audio for the first playlist entry only and keep the result
# as `video_meta`.
# NOTE(review): `video_meta` is what gets written to huberman_meta.json, yet
# that file is later read back with 193 entries — presumably the full-playlist
# run is not captured in this cell; confirm before relying on Run All.
video_meta = ingest.download_audio(playlist[0], index=1)

CPU times: user 8.95 s, sys: 35.1 ms, total: 8.98 s
Wall time: 9.69 s


### Save Results to Disk

In [80]:
# Persist the metadata to disk as indented JSON.
with open('../../data/raw/huberman_meta.json', 'w') as meta_file:
    json.dump(video_meta, meta_file, indent=2)

In [82]:
# Read the metadata JSON back from disk into `data`.
with open('../../data/raw/huberman_meta.json') as meta_file:
    data = json.load(meta_file)

### Set Constants
---

In [27]:
# Load Pipeline
import torch
import transformers
from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
from transformers.utils import is_flash_attn_2_available

# Use the first CUDA GPU in half precision when CUDA is available;
# otherwise fall back to the CPU in full precision.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

model_id = "openai/whisper-large-v3"

# Load the checkpoint named by `model_id`; Flash Attention 2 is requested
# only when is_flash_attn_2_available() reports that it can be used.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
    use_flash_attention_2=is_flash_attn_2_available(),
)
model.to(device)

# Processor for the same checkpoint.
processor = AutoProcessor.from_pretrained(model_id)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Instantiate Pipeline
---

In [28]:
# Instantiate the speech-recognition pipeline.
# Reuse the `model` and `processor` objects loaded in the previous cell rather
# than passing the checkpoint id, which would load the checkpoint a second
# time. Reuse `device` / `torch_dtype` as well so this cell also runs on
# machines without CUDA (previously "cuda:0" and float16 were hard-coded here).
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


2 minutes 40 seconds for 189 episodes of Huberman (Threaded)

### Create Transcript/Write to Disk/Save to meta as "text"

In [64]:
# Base names (extension stripped) of the files in `videos/` and `transcripts/`.
video_ids = {os.path.splitext(name)[0] for name in os.listdir('videos/')}
transcript_ids = [os.path.splitext(name)[0] for name in os.listdir('transcripts/')]

# Ids present under `videos/` with no matching entry under `transcripts/`.
untranscribed = list(video_ids.difference(transcript_ids))
untranscribed

[]

In [30]:
%%time
# Transcribe every pending id. A failure is reported and the loop moves on to
# the next id rather than aborting the whole run.
for video in tqdm(untranscribed):
    try:
        ingest.transcribe_audio(pipe, video)
    except Exception as exc:
        print(f'Error transcribing {video}, due to {exc}')

  0%|          | 0/4 [00:00<?, ?it/s]

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


CPU times: user 8min 54s, sys: 7.09 s, total: 9min 1s
Wall time: 9min


# Join Metadata with raw text

### Get Transcript Paths

In [83]:
# Collect transcript paths from ./transcripts/ and show how many were found.
transcripts = ingest.get_transcript_paths('./transcripts/')
len(transcripts)

193

### Get Metadata file

In [84]:
# Load the saved metadata JSON and show its number of entries.
# `.items()` is called on the result further down, so it is dict-like.
meta_data = ingest.read_json('../../data/raw/huberman_meta.json')
len(meta_data)

193

### Join Meta with Text

In [86]:
# Drop the keys and keep only the per-entry metadata values, in dict order.
meta_data_list = list(meta_data.values())
len(meta_data_list)

193

In [15]:
# Build `final_meta`, which the following cells write to disk and compare.
# This call had been commented out, leaving `final_meta` undefined on a fresh
# kernel (Restart & Run All would fail with NameError at the write step).
# NOTE(review): `meta_data_list` built in the previous cell is not used here —
# confirm whether join_all_transcripts_to_meta expects the dict (`meta_data`)
# or the list of values.
final_meta = ingest.join_all_transcripts_to_meta(transcripts, meta_data)

In [61]:
# Save the joined metadata via ingest.write_json to ../../data/huberman_labs.json.
ingest.write_json(final_meta, '../../data/huberman_labs.json')

In [62]:
# Read the file that was just written back into `test` for a round-trip check.
test = ingest.read_json('../../data/huberman_labs.json')

In [63]:
# Spot check of the round trip: compares only the first element of the
# in-memory data with the first element of the reloaded copy.
final_meta[0] == test[0]

True