# Download and Transcribe Playlist

In [8]:
# !pip install pytube rich tqdm

In [1]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
#Import Modules
from data_ingest import DataIngest
from tqdm.notebook import tqdm
from rich.pretty import pprint
import pandas as pd
import os
import json

### Create Playlist
---

In [3]:
# YouTube playlist URL to ingest.
huberman = "https://www.youtube.com/playlist?list=PLPNW_gerXa4Pc8S2qoUQc5e8Ir97RLuVW"
# DataIngest is imported from the local data_ingest module; its internals are not shown in this notebook.
ingest = DataIngest(huberman)
# Presumably a list of the playlist's video entries -- confirm against data_ingest.
playlist = ingest.get_playlist()

In [6]:
# Pretty-print the first five playlist entries, then display the total number of entries.
pprint(playlist[:5])
len(playlist)

193

### Download Audio and Get Metadata Dictionary

In [5]:
%%time
# Calls DataIngest.get_audio_files_threaded on the full playlist and keeps its return value.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so `video_meta`
# was not assigned in that session -- re-run to completion before relying on it.
video_meta = ingest.get_audio_files_threaded(playlist)

  3%|██▊                                                                                                           | 5/193 [01:51<1:09:38, 22.23s/it]

KeyboardInterrupt



In [18]:
# Display the fifth metadata record.
# NOTE(review): `meta_data` is not assigned in any visible cell of this notebook -- confirm
# where it comes from (possibly `video_meta`) so this cell runs on a fresh kernel.
meta_data[4]

{'video_id': 'oL3SkPV1_Ik',
 'title': 'Dr. E.J. Chichilnisky: How the Brain Works, Curing Blindness & How to Navigate a Career Path',
 'length_seconds': '7206',
 'channel_id': 'UC2D2CMWXMOVWx7giW1n3LIg',
 'is_owner_viewing': False,
 'is_crawlable': True,
 'thumbnail': {'thumbnails': [{'url': 'https://i.ytimg.com/vi/oL3SkPV1_Ik/mqdefault.jpg',
    'width': 320,
    'height': 180},
   {'url': 'https://i.ytimg.com/vi/oL3SkPV1_Ik/hqdefault.jpg?sqp=-oaymwEXCJADEOABSFryq4qpAwkIARUAAIhCGAE=&rs=AOn4CLBn00CZgpK0KSZzateJIkWNh5R9Cw',
    'width': 400,
    'height': 224},
   {'url': 'https://i.ytimg.com/vi/oL3SkPV1_Ik/hq720.jpg?sqp=-oaymwEXCKAGEMIDSFryq4qpAwkIARUAAIhCGAE=&rs=AOn4CLBMCLU8setf33v3utMq0s-b9juzYA',
    'width': 800,
    'height': 450},
   {'url': 'https://i.ytimg.com/vi/oL3SkPV1_Ik/hq720.jpg',
    'width': 1280,
    'height': 720}]},
 'allow_ratings': True,
 'view_count': '111831',
 'author': 'Huberman Lab',
 'is_private': False,
 'is_unplugged_corpus': False,
 'music_video_type': 'MU

In [6]:
#@title Download Audio
def get_audio_files(video_urls: list[str], 
                    video_dir: str='videos/'
                    ) -> list[dict]:
    """Download audio for each URL in turn and collect the returned metadata.

    Args:
        video_urls: URLs handed one at a time to ``download_audio``.
        video_dir: Directory forwarded to ``download_audio`` as ``video_dir``.

    Returns:
        One single-key dict per successful download, mapping the record's
        ``'videoId'`` value to the full record returned by ``download_audio``.
        A URL whose download (or ``'videoId'`` lookup) raises is printed and
        left out of the result.
    """
    collected = []
    for link in tqdm(video_urls):
        try:
            info = download_audio(link, video_dir=video_dir)
            entry = {info['videoId']: info}
        except Exception as err:
            # Best effort: report the failure and carry on with the next URL.
            print(err)
        else:
            collected.append(entry)
    return collected

In [7]:
def get_audio_files_threaded(video_urls: list[dict[int, str]], 
                             video_dir: str='videos/'
                             ) -> list[dict]:
    """Download audio for every entry concurrently and collect the metadata.

    Each element of ``video_urls`` is submitted to ``download_audio`` together
    with ``video_dir`` on a thread pool sized at twice the CPU count.

    Args:
        video_urls: Entries passed unchanged, one per task, to ``download_audio``.
        video_dir: Directory forwarded as the second positional argument of
            ``download_audio``.

    Returns:
        One single-key dict per successful download, mapping the record's
        ``'videoId'`` value to the full record, in completion order (not
        submission order). Failed downloads are printed and skipped.
    """
    # Imported locally: the notebook's visible import cell does not bring these
    # names into scope, so the function would otherwise fail on a fresh kernel.
    from concurrent.futures import ThreadPoolExecutor, as_completed

    # os.cpu_count() returns None when the count cannot be determined.
    max_workers = (os.cpu_count() or 1) * 2

    meta = []
    with tqdm(total=len(video_urls)) as progress:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(download_audio, url, video_dir) for url in video_urls]
            for future in as_completed(futures):
                # Count the task as done whether it succeeded or failed.
                progress.update(1)
                try:
                    video_info = future.result()
                    meta.append({video_info['videoId']: video_info})
                    print(f'Completed episode: {video_info["episode_num"]}')
                except Exception as e:
                    # Best effort: report the failure and keep draining the pool.
                    print(f'Error: {e}')
    return meta

### Set Constants
---

In [None]:
# Load Pipeline
import torch
import transformers
from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
from transformers.utils import is_flash_attn_2_available

# Use the first CUDA device in half precision when a GPU is available; otherwise
# fall back to the CPU in full precision.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model_id = "openai/whisper-large-v3"

# `use_flash_attention_2=` is deprecated in transformers in favour of
# `attn_implementation=`. Only request Flash Attention 2 when it is installed;
# otherwise pass nothing so transformers picks its default attention backend.
attn_kwargs = {"attn_implementation": "flash_attention_2"} if is_flash_attn_2_available() else {}

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype,
    low_cpu_mem_usage=True, use_safetensors=True,
    **attn_kwargs,
)
model.to(device)
processor = AutoProcessor.from_pretrained(model_id)

### Instantiate Pipeline
---

In [None]:
# Instantiate the speech-recognition pipeline.
# Reuse the `model` and `processor` loaded in the previous cell instead of passing
# `model_id`, which would load a second copy of the checkpoint. Use the computed
# `device` / `torch_dtype` so the cell also runs on machines without CUDA.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,  # "cuda:0" or "cpu" as selected above; set device to "mps" on Mac devices
)

### Download Audio (Threaded)
---

Downloading 189 Huberman episodes took 2 minutes 40 seconds with the threaded downloader.

In [15]:
# Download audio for every entry in `videos` and keep the returned metadata records.
# NOTE(review): `videos` is not assigned in any visible cell of this notebook -- confirm
# its source (possibly `playlist`) so this cell runs on a fresh kernel.
meta = get_audio_files_threaded(videos)

  0%|          | 0/189 [00:00<?, ?it/s]

Completed episode: 4
Completed episode: 8
Completed episode: 7
Completed episode: 12
Completed episode: 5
Completed episode: 1
Completed episode: 9
Completed episode: 3
Completed episode: 10
Completed episode: 2
Completed episode: 6
Completed episode: 14
Completed episode: 13
Completed episode: 16
Completed episode: 15
Completed episode: 17
Completed episode: 11
Completed episode: 18
Completed episode: 20
Completed episode: 19
Completed episode: 21
Completed episode: 26
Completed episode: 23
Completed episode: 22
Completed episode: 24
Completed episode: 25
Completed episode: 27
Completed episode: 28
Completed episode: 30
Completed episode: 29
Completed episode: 32
Completed episode: 31
Completed episode: 33
Completed episode: 34
Completed episode: 39
Completed episode: 38
Completed episode: 35
Completed episode: 40
Completed episode: 36
Completed episode: 37
Completed episode: 41
Completed episode: 42
Completed episode: 44
Completed episode: 43
Completed episode: 46
Completed episode: 

In [16]:
# Order the records by the 'episode_num' stored in each record's first (only) value.
meta = sorted(meta, key=lambda record: list(record.values())[0]['episode_num'])

In [17]:
# First key of every record in `meta`, in list order.
keys = [list(record)[0] for record in meta]

### Save JSON to disk

In [19]:
# Serialize the records straight to the file, indented by two spaces.
with open('../data/huberman_meta.json', 'w') as f:
    json.dump(meta, f, indent=2)

In [55]:
# Read the saved records back from disk.
with open('../data/huberman_meta.json') as f:
    data = json.load(f)

### Create Transcript/Write to Disk/Save to meta as "text"

In [9]:
%%time
for video in tqdm(data[36:]):
    try:
        transcribe_audio(video)
    except Exception as e:
        print(e)
        continue
# pprint(transcript['chunks'], max_length=2)

  0%|          | 0/153 [00:00<?, ?it/s]

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


CPU times: user 6h 59min 16s, sys: 5min 34s, total: 7h 4min 51s
Wall time: 7h 1min 53s


Bad pipe message: %s [b'\x9f\x14`f\x96\xebBB\x07\n\xa0\x0b\xe0\xdbhO:L ?\x91!\xf0\xa9\x19\x92*fv\xc6\x14\x07i\xcb\x16\xbc\xcfV!\xc9\x9d\xdb|\xe0I\x83\xed(\x96\x8c\x05\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08', b'\x0b\x08\x04\x08\x05\x08\x06\x04\x01']
Bad pipe message: %s [b'b\xc2\x8c\x03\xa6\x8at\xbc\xa6f\xab\xb1\xf5\xdb\xe0g\xcc\x0b #\xbeE?s2\xc6\xca\x9dT\x13\xb0\x0e\xe9\xf0\x0c\x02\xc4\x0f\xff\x1b\x10\xddJ+\x16\xec\xa8\xae\xf3\xaa\xb3\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19']
Bad pipe message: %s [b"H\xfa\x95\x02\x9d\xe0\xcd\xb1\xf7\x06\x1f.\xf5\xf8\x91y\xa7Z\x00\x00|\xc0,\xc00\x00

In [10]:
# First key of every record in `data`, in list order.
keys = [list(record)[0] for record in data]

In [25]:
# Keep only the records whose first (only) value already has a non-None 'text' entry.
# The value is read directly from each record rather than through the separate `keys`
# list, so the result cannot go stale if `keys` was last built from a different list.
finished = [d for d in data if list(d.values())[0].get('text') is not None]

In [20]:
# Write the filtered records to disk, indented by two spaces.
with open('../data/huberman_subset.json', 'w') as f:
    json.dump(finished, f, indent=2)

### Join Metadata with raw text

In [13]:
from pathlib import Path

In [56]:
# Collect every entry in the transcripts directory whose name ends with '.txt'.
# NOTE(review): absolute, machine-specific path -- consider a path relative to the notebook.
transcripts = list(filter(
    lambda path: path.name.endswith('.txt'),
    Path('/home/elastic/notebooks/vectorsearch-applications/notebooks/transcripts/').iterdir(),
))

In [57]:
# Merge the list of per-record dicts into one flat mapping; on a repeated key
# the later record replaces the earlier one.
temp_dict = {}
for d in data:
    temp_dict.update(d)

In [59]:
# Attach each transcript file's text to the matching record under 'content'.
for path in transcripts:
    # The file name up to its first '.' is used as the lookup key.
    video_id = path.name.split('.')[0]
    if video_id not in temp_dict:
        continue
    text = path.read_text()
    temp_dict[video_id]['content'] = text
        

In [64]:
# Every record must have received a transcript. Compare against None by identity
# and name the offending key so a failure is actionable.
for k in temp_dict:
    assert temp_dict[k].get('content') is not None, f'no transcript content for key {k!r}'

In [61]:
# Whitespace-separated word count of each record's 'content'.
lens = [len(record['content'].split()) for record in temp_dict.values()]

In [62]:
import pandas as pd

In [63]:
# Sum of all values in `lens`, scaled by 1.3.
# NOTE(review): 1.3 is presumably a words-to-tokens ratio -- confirm and name the constant.
pd.DataFrame(lens).sum()[0] * 1.3

5713204.9

In [65]:
# Write the combined mapping to disk in compact form.
with open('../data/hubermanlabs.json', 'w') as f:
    json.dump(temp_dict, f)

In [66]:
# Read the Impact Theory records from disk.
with open('../data/impact_theory_data.json') as f:
    impact = json.load(f)

In [68]:
# Whitespace-separated word count of each Impact Theory record's 'content'.
impact_lens = [len(episode['content'].split()) for episode in impact]

In [70]:
# Summary statistics (count, mean, std, quartiles, min/max) of the values in `impact_lens`.
pd.DataFrame(impact_lens).describe()

Unnamed: 0,0
count,384.0
mean,12821.268229
std,7650.847177
min,1819.0
25%,7888.0
50%,9894.5
75%,16857.0
max,48502.0


Bad pipe message: %s [b"\x895;S\xa4N\xfc\x9e\xff\x0c\xea=n\xfd\xda0&\xfb\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x002\x00\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0"]
Bad pipe message: %s [b'=\x00<\x005\x00/\x00\x9a\x00\x99\xc0\x07\xc0\x11\x00\x96\x00\x05\x00\xff\x01\x00\x00j\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00']
Bad pipe message: %s [b"\xc3\x84\xb6\x89\x87\xc7EG\xaeaC\xa1\x83\xc6\x98n\x8c\x91\x00\x00\xa6\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0