# Download and Transcribe Playlist

In [8]:
# One-time setup: uncomment the next line to install the third-party packages
# this notebook relies on (left commented so a full re-run does not reinstall).
# !pip install pytube rich tqdm

In [72]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [73]:
#Import Modules
from data_ingest import DataIngest
from tqdm.notebook import tqdm
from rich.pretty import pprint
import pandas as pd
import os
import json

### Create Playlist
---

In [74]:
# URL of the YouTube playlist that this notebook ingests.
huberman = "https://www.youtube.com/playlist?list=PLPNW_gerXa4Pc8S2qoUQc5e8Ir97RLuVW"
# `DataIngest` is the project-local helper imported above; the same `ingest`
# object is reused by every later section (download, transcribe, join).
ingest = DataIngest(huberman)
# NOTE(review): presumably a list with one entry per video (it is sliced and
# measured with len() below; 193 entries in the recorded run) — confirm the
# element type against DataIngest.get_playlist.
playlist = ingest.get_playlist()

In [75]:
# Display the URL reported by the ingest object. In the recorded output this is
# a channel URL rather than the playlist URL passed to DataIngest.
ingest.show_url

'https://www.youtube.com/channel/UC2D2CMWXMOVWx7giW1n3LIg'

In [67]:
# Pretty-print the first five playlist entries as a sanity check ...
pprint(playlist[:5])
# ... and show how many entries the playlist contains (cell output).
len(playlist)

193

### Download Audio and Get Metadata Dictionary
---

In [76]:
%%time
# Long-running step: the recorded run took about 28 minutes of wall time for
# 193 playlist entries. The result is persisted to JSON in the next section.
# NOTE(review): per the section heading this downloads the audio and returns a
# metadata dictionary; the exact schema is defined in DataIngest — confirm there.
video_meta = ingest.get_audio_files_threaded(playlist)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 193/193 [28:12<00:00,  8.77s/it]

CPU times: user 28min 45s, sys: 22.7 s, total: 29min 8s
Wall time: 28min 13s





### Save Results to Disk

In [80]:
# Serialize BEFORE opening the file: opening with mode 'w' truncates it
# immediately, so if `video_meta` held a value json cannot encode, the original
# ordering would wipe the metadata saved by a previous (slow) download run.
meta_json = json.dumps(video_meta, indent=2)

# Explicit encoding keeps the file independent of the platform's default locale.
with open('../../data/raw/huberman_meta.json', 'w', encoding='utf-8') as f:
    f.write(meta_json)

In [82]:
# Read the metadata file back from disk and parse it, confirming that what was
# just written is valid JSON.
with open('../../data/raw/huberman_meta.json') as f:
    data = json.load(f)

### Set Constants and Load Model
---

In [27]:
# Load the Whisper model and its processor
import torch
import transformers
from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
from transformers.utils import is_flash_attn_2_available

model_id = "openai/whisper-large-v3"

# Prefer the first CUDA GPU with half precision; otherwise fall back to the CPU
# with full precision.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

# Flash Attention 2 is switched on only when the installed environment reports
# that it is available.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
    use_flash_attention_2=is_flash_attn_2_available(),
)
model.to(device)

# The processor is loaded from the same checkpoint as the model.
processor = AutoProcessor.from_pretrained(model_id)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Instantiate Pipeline
---

In [28]:
# Instantiate the speech-recognition pipeline.
#
# Reuse the `model` and `processor` loaded in the previous cell: passing
# `model_id` here would make `pipeline` download/load a second copy of
# whisper-large-v3 and leave the first one unused in memory. Flash Attention 2
# was already configured when that model was loaded, so no `model_kwargs` are
# needed. `device` and `torch_dtype` also come from the previous cell, so the
# CPU fallback chosen there is honoured instead of hard-coding CUDA/float16.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,  # "cuda:0" or "cpu" as detected above; use "mps" on Mac devices
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


2 minutes 40 seconds for 189 episodes of Huberman (Threaded)

### Create Transcript/Write to Disk/Save to meta as "text"

In [64]:
def _visible_stems(directory):
    """Return the extension-less names of the non-hidden entries in `directory`.

    Dot-prefixed entries (e.g. `.ipynb_checkpoints`, `.DS_Store`) are skipped so
    they are never mistaken for a video/transcript id. The result keeps the
    order returned by `os.listdir`.
    """
    return [
        os.path.splitext(entry)[0]
        for entry in os.listdir(directory)
        if not entry.startswith('.')
    ]


video_ids = set(_visible_stems('videos/'))
transcript_ids = _visible_stems('transcripts/')

# Ids present in videos/ with no same-named file in transcripts/. Sorted because
# plain set iteration order can change between kernel sessions, which would make
# the transcription order (and the displayed list) non-reproducible.
untranscribed = sorted(video_ids.difference(transcript_ids))
untranscribed

[]

In [30]:
%%time
for video in tqdm(untranscribed):
    try:
        ingest.transcribe_audio(pipe, video)
    except Exception as e:
        print(f'Error transcribing {video}, due to {e}')
        continue
# pprint(transcript['chunks'], max_length=2)

  0%|          | 0/4 [00:00<?, ?it/s]

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


CPU times: user 8min 54s, sys: 7.09 s, total: 9min 1s
Wall time: 9min


# Join Metadata with raw text

### Get Transcript Paths

In [83]:
# Collect the transcript paths found under ./transcripts/ (helper defined on
# DataIngest). The recorded run found 193, matching the playlist size.
transcripts = ingest.get_transcript_paths('./transcripts/')
len(transcripts)

193

### Get Metadata file

In [84]:
# Load the metadata JSON written in the "Save Results to Disk" section.
# It is used as a mapping below (`.items()`); 193 entries in the recorded run.
meta_data = ingest.read_json('../../data/raw/huberman_meta.json')
len(meta_data)

193

### Join Meta with Text

In [86]:
# Keep only the per-entry metadata values, dropping the mapping's keys; the
# values stay in the mapping's iteration order.
meta_data_list = list(meta_data.values())
len(meta_data_list)

193

In [92]:
# NOTE(review): this join is commented out, yet the cells below read
# `final_meta`. On a fresh kernel (Restart & Run All) no active cell defines it,
# so those cells would fail with NameError unless this line is re-enabled.
# It is also unclear whether the call should receive `meta_data` (the mapping)
# or the `meta_data_list` built just above — confirm against
# DataIngest.join_all_transcripts_to_meta before re-running.
# final_meta = ingest.join_all_transcripts_to_meta(transcripts, meta_data)

Bad pipe message: %s [b'\xc3\xba\x14*\xcb\x9b\xc4\xa9\x8c\x86\x99ot\xc2*\xea&\xba #\xdc\xcb\xfa\xce-\xb7\xbb\xcc\x1d\xb9\x0f\xb7\xb4\xb2?\x87\xa9\xb8\xdf\xf4\x16\xbcvE\x1e\xf1\xfeF\xe8,\x84\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06\x01\x00+\x00\x03\x02\x03\x04\x00-\x00\x02']
Bad pipe message: %s [b"\x8fc\x81\xc5Qj\xbc\x8e\x01\x9f\x89\x01\xebxt\xc4\x987\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x002\x00\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc

In [61]:
# Persist the joined metadata + transcript text.
# NOTE(review): depends on `final_meta`, which no active cell in this notebook
# currently defines (the join call above is commented out).
ingest.write_json(final_meta, '../../data/huberman_labs.json')

In [62]:
# Read the file back so the write can be checked against the in-memory data.
test = ingest.read_json('../../data/huberman_labs.json')

In [63]:
# Round-trip spot check: only the FIRST record is compared with what was read
# back from disk; the remaining records are not verified by this cell.
final_meta[0] == test[0]

True