In [2]:
import json
import os
import re
import time
from typing import List, Union

import pandas as pd
from faster_whisper import WhisperModel
from tqdm import tqdm

### Load cache of metadata

In [3]:
# Read the metadata file into `data`. JSON text is UTF-8 by specification, so
# decode it explicitly instead of relying on the platform's default encoding.
with open('./data/huberman_meta.json', encoding='utf-8') as f:
    data = json.load(f)

In [5]:
# Index the metadata records by their 'videoId' value so that a record can be
# fetched directly from a video id.
cache = dict((record['videoId'], record) for record in data)

### Load model

In [7]:
# Instantiate the 'large-v2' Whisper model on CUDA device index 0 using
# float16 compute.
model = WhisperModel(
    model_size_or_path='large-v2',
    device='cuda',
    device_index=0,
    compute_type='float16',
)

In [8]:
# Collect the full paths of the mp4 files in the video directory, in sorted order.
video_dir = './data/videos/huberman/'
files = [os.path.join(video_dir, name) for name in os.listdir(video_dir) if name.endswith('mp4')]
files.sort()
# NOTE(review): this path uses 'video' (singular) while video_dir uses
# 'videos' -- confirm the transcripts directory is spelled as intended.
transcript_dir = './data/video/huberman/transcripts/'
files

['./data/videos/huberman/GpgqXCkRO-w.mp4',
 './data/videos/huberman/ccrbE0QHy94.mp4',
 './data/videos/huberman/n28W4AmvMDE.mp4']

In [9]:
def transcribe_podcast(whisper_model: 'WhisperModel',
                       file_path: str,
                       outpath_dir: str,
                       cache: dict,
                       combine_metadata: bool=True
                       ) -> Union[dict, str]:
    '''
    Transcribes a single podcast media file and, optionally, saves the
    transcript merged with the episode metadata as a JSON file.

    Args:
        whisper_model: Model object whose `transcribe` method is called with
            the file path.
        file_path: Path to the media file. The file name without directory
            and extension is used as the video id.
        outpath_dir: Directory where `<video_id>.json` is written when
            `combine_metadata` is True. It is created if it does not exist.
        cache: Mapping of video id -> metadata dict. The entry for this
            video must contain a 'title' key.
        combine_metadata: If True, merge the transcript into a copy of the
            metadata under the 'text' key, write it to disk and return it.
            If False, return only the transcript string and write nothing.

    Returns:
        The metadata dict extended with a 'text' key when `combine_metadata`
        is True, otherwise the transcript as a single string.

    Raises:
        KeyError: If the video id derived from `file_path` is not in `cache`.
    '''
    start = time.perf_counter()
    video_id = os.path.splitext(os.path.basename(file_path))[0]
    metadata = cache[video_id]
    title = metadata['title']
    print(f'Processing Title: {title}')

    segments, _ = whisper_model.transcribe(file_path, beam_size=5, word_timestamps=False)
    # Iterating the segments is what drives the transcription, so the timer
    # is stopped only after the transcript has been fully assembled.
    transcript = ' '.join(seg.text.strip() for seg in segments)
    elapsed = time.perf_counter() - start
    print(f'Transcription completed in {elapsed:0.2f} seconds.')

    if not combine_metadata:
        return transcript

    # Build a new dict so the caller's cache entry is not modified as a
    # hidden side effect of transcribing.
    record = {**metadata, 'text': transcript}
    if outpath_dir:
        os.makedirs(outpath_dir, exist_ok=True)
    save_path = os.path.join(outpath_dir, video_id + '.json')
    with open(save_path, 'w', encoding='utf-8') as f:
        json.dump(record, f)
    return record

Bad pipe message: %s [b"\x06R\x82l\x82\xad\xb0G\xb3_\xb5\xf2\x9dp\x17\x13\xdf\xb6\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x002\x00\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0P\x00=\x00<\x005\x00/\x00\x9a\x00\x99\xc0\x07\xc0\x11\x00\x96\x00\x05\x00\xff\x01\x00\x00j\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17"]
Bad pipe message: %s [b"\xb6p\xf0\xeco\xc1\xc6k.\x8d\x18\x82%\xc5U\xaa\xf8\xc9\x00\x00\xa6\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0s\xc0w\x00\xc4\x00\xc3\xc0#\xc0'\x00g\x00@\xc0r\xc0v\x00\xbe\x00\xbd\xc0\n\xc0\x14\x0

In [None]:
# Transcribe only the first video; the returned value is shown as the cell output.
transcribe_podcast(
    whisper_model=model,
    file_path=files[0],
    outpath_dir='./data/videos/huberman/',
    cache=cache,
)

Processing Title: Rick Rubin: Protocols to Access Creative Energy and Process


In [16]:
def transcribe_podcast(whisper_model: 'WhisperModel',
                       file_path: str,
                       outpath_dir: str,
                       cache: dict,
                       combine_metadata: bool=True
                       ) -> Union[dict, str]:
    '''
    Transcribes a single podcast media file and, optionally, saves the
    transcript merged with the episode metadata as a JSON file.

    Args:
        whisper_model: Model object whose `transcribe` method is called with
            the file path.
        file_path: Path to the media file. The file name without directory
            and extension is used as the video id.
        outpath_dir: Directory where `<video_id>.json` is written when
            `combine_metadata` is True. It is created if it does not exist.
        cache: Mapping of video id -> metadata dict. The entry for this
            video must contain a 'title' key.
        combine_metadata: If True, merge the transcript into a copy of the
            metadata under the 'text' key, write it to disk and return it.
            If False, return only the transcript string and write nothing.

    Returns:
        The metadata dict extended with a 'text' key when `combine_metadata`
        is True, otherwise the transcript as a single string.

    Raises:
        KeyError: If the video id derived from `file_path` is not in `cache`.
    '''
    start = time.perf_counter()
    video_id = os.path.splitext(os.path.basename(file_path))[0]
    # Look the record up by the id derived from the file name (not the
    # builtin `id`, which is never a key of the cache).
    metadata = cache[video_id]
    title = metadata['title']

    # An episode number is the first run of digits within the first three
    # characters of the title. Not every title carries one, so fall back to
    # a message without it instead of failing.
    episode_match = re.search('[0-9]+', title.strip()[:3])
    if episode_match:
        print(f'Processing episode #{episode_match.group()}\t Title: {title}')
    else:
        print(f'Processing Title: {title}')

    segments, _ = whisper_model.transcribe(file_path, beam_size=5, word_timestamps=False)
    # Iterating the segments is what drives the transcription, so the timer
    # is stopped only after the transcript has been fully assembled.
    transcript = ' '.join(seg.text.strip() for seg in segments)
    elapsed = time.perf_counter() - start
    print(f'Transcription completed in {elapsed:0.2f} seconds.')

    if not combine_metadata:
        return transcript

    # Build a new dict so the caller's cache entry is not modified as a
    # hidden side effect of transcribing.
    record = {**metadata, 'text': transcript}
    if outpath_dir:
        os.makedirs(outpath_dir, exist_ok=True)
    # The output name comes from this call's own file_path rather than from
    # whatever global loop variable happens to exist in the kernel.
    save_path = os.path.join(outpath_dir, video_id + '.json')
    with open(save_path, 'w', encoding='utf-8') as f:
        json.dump(record, f)
    return record

### Get Data Paths

In [7]:
# Sorted full paths of every mp4 file found in the video directory.
video_dir = './data/videos/huberman/'
files = sorted(
    os.path.join(video_dir, entry)
    for entry in os.listdir(video_dir)
    if entry.endswith('mp4')
)
# NOTE(review): this path uses 'video' (singular) while video_dir uses
# 'videos' -- confirm the transcripts directory is spelled as intended.
transcript_dir = './data/video/huberman/transcripts/'
files

['./data/videos/huberman/GpgqXCkRO-w.mp4',
 './data/videos/huberman/ccrbE0QHy94.mp4',
 './data/videos/huberman/n28W4AmvMDE.mp4']

In [15]:
# Video id of the first file: its name without directory or extension.
os.path.splitext(os.path.basename(files[0]))[0]

'GpgqXCkRO-w'

In [35]:
# Transcribe every path in `subset`. A failure on one file is printed and the
# loop moves on, so a single bad input does not stop the remaining files.
# NOTE(review): `subset` is not defined in the cells visible here -- confirm
# it is created earlier in the notebook.
for path in tqdm(subset):
    try:
        meta = transcribe_podcast(model, path, transcript_dir, cache)
    except Exception as err:
        print(f'Error with {path}: {err}')


  0%|                                                                                        | 0/64 [00:00<?, ?it/s][A

Processing episode #100	 Title: 100- My Journey Healing From Depression



  2%|█▏                                                                           | 1/64 [02:57<3:06:47, 177.89s/it][A

Transcription completed in 177.89 seconds.
Processing episode #101	 Title: 101- Are Your Adrenals The Cause of Your Weight Gain and Exhaustion w/ Kristen Files



  3%|██▍                                                                          | 2/64 [06:59<3:42:25, 215.26s/it][A

Transcription completed in 241.41 seconds.
Processing episode #102	 Title: 102- Are Trauma, Emotions, or Ancestral Health Hurting Your Gut w/ Dr. Will Cole



  5%|███▌                                                                         | 3/64 [10:32<3:38:02, 214.47s/it][A

Transcription completed in 213.53 seconds.
Processing episode #103	 Title: 103- Is It Your Mindset That’s Making You Sick? w/ Juanique Roney



  6%|████▊                                                                        | 4/64 [14:35<3:45:46, 225.77s/it][A

Transcription completed in 243.09 seconds.
Processing episode #104	 Title: 104- How Functional Medicine Helped To Save My Life w/ Taylor Dukes



  8%|██████                                                                       | 5/64 [18:05<3:36:17, 219.95s/it][A

Transcription completed in 209.64 seconds.
Processing episode #105	 Title: 105 - Do You Know Which Foods Help Each Stage of Your Cycle? w/ Candace Burch & Jess Suchan



  9%|███████▏                                                                     | 6/64 [22:44<3:51:57, 239.95s/it][A

Transcription completed in 278.76 seconds.
Processing episode #106	 Title: 106 - 7 Reasons to Choose a Natural Health Approach w/ Shane and Lizz Watt



 11%|████████▍                                                                    | 7/64 [25:59<3:34:03, 225.32s/it][A

Transcription completed in 195.19 seconds.
Processing episode #107	 Title: 107 - If trapped emotions are making us sick, how do we release them? with Dr. G



 12%|█████████▋                                                                   | 8/64 [30:46<3:48:32, 244.86s/it][A

Transcription completed in 286.72 seconds.
Processing episode #108	 Title: 108 - Is Glyphosate Okay To Eat? Is It Doing Anything To Our Health? w/ Andrew Kimbrell



 14%|██████████▊                                                                  | 9/64 [34:41<3:41:44, 241.90s/it][A

Transcription completed in 235.39 seconds.
Processing episode #109	 Title: 109 - Sunscreen vs. Sunblock, Melasma Help, And Summer First Aid Tips w/ Phil Cowley



 16%|███████████▉                                                                | 10/64 [38:02<3:26:15, 229.18s/it][A

Transcription completed in 200.69 seconds.
Processing episode #10	 Title: 10 - Sterling Jones, CEO of JOJO's Chocolates



 17%|█████████████                                                               | 11/64 [40:10<2:55:13, 198.38s/it][A

Transcription completed in 128.53 seconds.
Processing episode #110	 Title: 110 - Could The Air In Your Home Be The Cause Of Your Health Issues? w/ Peter Spiegel



 19%|██████████████▎                                                             | 12/64 [43:03<2:45:11, 190.60s/it][A

Transcription completed in 172.83 seconds.
Processing episode #111	 Title: 111- Tips & Tricks You’ve Never Heard of For Bug Bites, Sunburns, Stomachaches, & More w/ Sara-Chana



 20%|███████████████▍                                                            | 13/64 [46:39<2:48:33, 198.31s/it][A

Transcription completed in 216.04 seconds.
Processing episode #112	 Title: 112 - What is Applied Kinesiology and Who Can It Benefit? w/ Dr. Calvin Ng



 22%|████████████████▋                                                           | 14/64 [50:01<2:46:00, 199.21s/it][A

Transcription completed in 201.29 seconds.
Processing episode #113	 Title: 113 - Using Your Stem Cells for Injury Healing and Anti-Aging w/ Dr. Harry Adelson



 23%|█████████████████▊                                                          | 15/64 [52:37<2:32:08, 186.29s/it][A

Transcription completed in 156.34 seconds.
Processing episode #114	 Title: 114 - Why Is Insulin Resistance Such a Health Problem Now? w/ Dr. Benjamin Bikman



 25%|███████████████████                                                         | 16/64 [57:05<2:48:48, 211.02s/it][A

Transcription completed in 268.44 seconds.
Processing episode #115	 Title: 115 - How Brain Inflammation is Impacting Kids' Health w/ Dr. Josh Redd



 27%|███████████████████▋                                                      | 17/64 [1:00:55<2:49:36, 216.52s/it][A

Transcription completed in 229.33 seconds.
Processing episode #116	 Title: 116 - What is Your Mouth Telling You About Your Health? w/ Dr. Michelle Jorgensen



 28%|████████████████████▊                                                     | 18/64 [1:04:51<2:50:39, 222.61s/it][A

Transcription completed in 236.77 seconds.
Processing episode #117	 Title: 117 - Is It Your Water That’s Making You Sick? w/ Trent Spafford



 30%|█████████████████████▉                                                    | 19/64 [1:08:16<2:42:53, 217.19s/it][A

Transcription completed in 204.56 seconds.
Processing episode #118	 Title: 118 - Debunking Fitness Myths and Training Smarter, Not Harder w/ Dr. Shannon Ritchey



 31%|███████████████████████▏                                                  | 20/64 [1:11:58<2:40:19, 218.62s/it][A

Transcription completed in 221.96 seconds.
Processing episode #119	 Title: 119 - Conquering Your Sleep Struggles w/ Dr. Michael Breus



 33%|████████████████████████▎                                                 | 21/64 [1:16:13<2:44:30, 229.55s/it][A

Transcription completed in 255.03 seconds.
Processing episode #11	 Title: 11 - Sarah Grace Meckelberg, tips on balancing hormones



 34%|█████████████████████████▍                                                | 22/64 [1:19:48<2:37:40, 225.25s/it][A

Transcription completed in 215.22 seconds.
Processing episode #120	 Title: 120 -Tips For Clearing Acne Naturally w/ Maria Marlowe



 36%|██████████████████████████▌                                               | 23/64 [1:22:24<2:19:40, 204.41s/it][A

Transcription completed in 155.80 seconds.
Processing episode #121	 Title: 121 - Why a Breast Cancer Diagnosis Led to Finding Better Makeup w/ Jaleh Bisharat



 38%|███████████████████████████▊                                              | 24/64 [1:24:31<2:00:50, 181.27s/it][A

Transcription completed in 127.30 seconds.
Processing episode #122	 Title: 122 - Unlocking the Secrets of Fat Loss w/ Dr. Ben Bikman



 39%|████████████████████████████▉                                             | 25/64 [1:28:38<2:10:39, 201.00s/it][A

Transcription completed in 247.03 seconds.
Processing episode #123	 Title: 123 - The Hidden Dangers of Mold in your Home w/ Brian Karr



 41%|██████████████████████████████                                            | 26/64 [1:33:03<2:19:19, 219.99s/it][A

Transcription completed in 264.29 seconds.
Processing episode #124	 Title: 124 - Are you Treating Back Pain and other Physical Issues Correctly? w/ Dr. Drew Morcos



 42%|███████████████████████████████▏                                          | 27/64 [1:36:40<2:15:15, 219.34s/it][A

Transcription completed in 217.81 seconds.
Processing episode #125	 Title: 125 - Tips for Choosing Better at Restaurants w/ Michelle Walrath



 44%|████████████████████████████████▍                                         | 28/64 [1:39:17<2:00:19, 200.53s/it][A

Transcription completed in 156.65 seconds.
Processing episode #126	 Title: 126 - Can Infertility Have Hidden Causes? w/ Dr. Kalea Wattles



 45%|█████████████████████████████████▌                                        | 29/64 [1:42:41<1:57:35, 201.59s/it][A

Transcription completed in 204.05 seconds.
Processing episode #127	 Title: 127 - Managing Stress through Cold Plunge and Breathing w/ Danny Massa



 47%|██████████████████████████████████▋                                       | 30/64 [1:45:51<1:52:10, 197.95s/it][A

Transcription completed in 189.44 seconds.
Processing episode #128	 Title: 128 - Finding Root Causes through your Eyes w/ Jordan Gundersen



 48%|███████████████████████████████████▊                                      | 31/64 [1:48:42<1:44:26, 189.90s/it][A

Transcription completed in 171.12 seconds.
Processing episode #12	 Title: 12- Dr. Barb Woegerer and how magnesium can help us in our health journey



 50%|█████████████████████████████████████                                     | 32/64 [1:52:05<1:43:21, 193.80s/it][A

Transcription completed in 202.91 seconds.
Processing episode #13	 Title: 13- Dr James DiNicoloantonio and how real salt can benefit our health



 52%|██████████████████████████████████████▏                                   | 33/64 [1:54:19<1:30:57, 176.06s/it][A

Transcription completed in 134.66 seconds.
Processing episode #14	 Title: 14 - Dr. Benjamin Bikman and why insulin resistance is such a health problem now



 53%|███████████████████████████████████████▎                                  | 34/64 [1:58:48<1:41:56, 203.87s/it][A

Transcription completed in 268.76 seconds.
Processing episode #15	 Title: 15- Scott Schwab and the health benefits of CBD oil



 55%|████████████████████████████████████████▍                                 | 35/64 [2:01:37<1:33:32, 193.52s/it][A

Transcription completed in 169.37 seconds.
Processing episode #16	 Title: 16 - Dr. Wil Bulsiewicz, gastroenterologist, talks all things gut health



 56%|█████████████████████████████████████████▋                                | 36/64 [2:05:54<1:39:04, 212.31s/it][A

Transcription completed in 256.14 seconds.
Processing episode #17	 Title: 17 - Jimmer Fredette, professional basketball player,  nutrition plays a role in an athlete's career



 58%|██████████████████████████████████████████▊                               | 37/64 [2:10:09<1:41:19, 225.16s/it][A

Transcription completed in 255.15 seconds.
Processing episode #18	 Title: 18 - Dr. Maren Locke, dermatologist, all things skin related



 59%|███████████████████████████████████████████▉                              | 38/64 [2:13:16<1:32:37, 213.77s/it][A

Transcription completed in 187.17 seconds.
Processing episode #19	 Title: 19 - Dr. Jen Pfleghaar - the role of the immune system and how it's affected by nutrition



 61%|█████████████████████████████████████████████                             | 39/64 [2:16:13<1:24:32, 202.90s/it][A

Transcription completed in 177.55 seconds.
Processing episode #1	 Title: 1 - Juanique Roney- The Order of Healing



 62%|██████████████████████████████████████████████▎                           | 40/64 [2:19:33<1:20:43, 201.82s/it][A

Transcription completed in 199.31 seconds.
Processing episode #20	 Title: 20 - Shannon Tripp, ER nurse, empowering parents to help children through illness and injury.



 64%|███████████████████████████████████████████████▍                          | 41/64 [2:22:48<1:16:35, 199.81s/it][A

Transcription completed in 195.10 seconds.
Processing episode #21	 Title: 21 - Dr. Kathy Yeo, detoxing and the role of EMF's on our health



 66%|████████████████████████████████████████████████▌                         | 42/64 [2:25:36<1:09:49, 190.42s/it][A

Transcription completed in 168.52 seconds.
Processing episode #22	 Title: 22 - Dr. Josh Axe, Ancient Remedies and how they help us heal today



 67%|█████████████████████████████████████████████████▋                        | 43/64 [2:28:48<1:06:46, 190.79s/it][A

Transcription completed in 191.64 seconds.
Processing episode #23	 Title: 23 - Dr. Natasha Beck a pediatric neuropsychologist, toxins and children's development



 69%|██████████████████████████████████████████████████▉                       | 44/64 [2:31:30<1:00:41, 182.07s/it][A

Transcription completed in 161.72 seconds.
Processing episode #24	 Title: 24 - Dr. Jess, MD - parasites and mold: how they can contribute to many common health issue



 70%|█████████████████████████████████████████████████████▍                      | 45/64 [2:34:01<54:43, 172.83s/it][A

Transcription completed in 151.28 seconds.
Processing episode #25	 Title: 25- Eden Lee- Mom of 4 Young Children Trying to Live a Nontoxic Lifestyle



 72%|██████████████████████████████████████████████████████▋                     | 46/64 [2:37:36<55:37, 185.41s/it][A

Transcription completed in 214.78 seconds.
Processing episode #26	 Title: 26-Dr. Sheila Kilbane, Pediatrician, common illnesses found in children and how to help them



 73%|███████████████████████████████████████████████████████▊                    | 47/64 [2:40:38<52:16, 184.49s/it][A

Transcription completed in 182.34 seconds.
Processing episode #27	 Title: 27- Dr. Amy Shah- Intermittent Fasting, All Things You Want To Know About It



 75%|█████████████████████████████████████████████████████████                   | 48/64 [2:43:14<46:53, 175.87s/it][A

Transcription completed in 155.75 seconds.
Processing episode #28	 Title: 28 - Dr. Anna Cabeca, how to best help yourself during menopause and peri menopause



 77%|██████████████████████████████████████████████████████████▏                 | 49/64 [2:47:01<47:50, 191.38s/it][A

Transcription completed in 227.58 seconds.
Processing episode #29	 Title: 29 - Danielle Walker, author and chef, teaches how to manage auto immune and gut issues.



 78%|███████████████████████████████████████████████████████████▍                | 50/64 [2:50:20<45:09, 193.56s/it][A

Transcription completed in 198.63 seconds.
Processing episode #2	 Title: 2 - Michelle Jorgensen - The Importance of Dentistry in Healing Your Body



 80%|████████████████████████████████████████████████████████████▌               | 51/64 [2:53:50<43:00, 198.52s/it][A

Transcription completed in 210.08 seconds.
Processing episode #30	 Title: 30 - Dr. Uma Naidoo, Harvard nutritional psychiatrist, discusses how food affects the brain.



 81%|█████████████████████████████████████████████████████████████▊              | 52/64 [2:56:43<38:10, 190.85s/it][A

Transcription completed in 172.96 seconds.
Processing episode #31	 Title: 31 Sara-Chana Silverstein, Master Herbalist, how herbs can help improve your health



 83%|██████████████████████████████████████████████████████████████▉             | 53/64 [2:59:55<35:02, 191.12s/it][A

Transcription completed in 191.74 seconds.
Processing episode #32	 Title: 32- Ashleigh DiLello, Mind coach and Creator of Bio Emotional healing.



 84%|████████████████████████████████████████████████████████████████▏           | 54/64 [3:03:01<31:36, 189.61s/it][A

Transcription completed in 186.08 seconds.
Processing episode #33	 Title: 33- Dr. Tabatha Barber, OBGYN, What Affects Our Hormones



 86%|█████████████████████████████████████████████████████████████████▎          | 55/64 [3:06:11<28:28, 189.86s/it][A

Transcription completed in 190.44 seconds.
Processing episode #34	 Title: 34 - Lillie Biesinger, counting macros and eating nutritiously



 88%|██████████████████████████████████████████████████████████████████▌         | 56/64 [3:09:49<26:24, 198.11s/it][A

Transcription completed in 217.35 seconds.
Processing episode #35	 Title: 35 - Jody Moore, Life Coach, Navigating Negative Body Image and Feelings of Inadequacy in Life



 89%|███████████████████████████████████████████████████████████████████▋        | 57/64 [3:12:45<22:21, 191.61s/it][A

Transcription completed in 176.44 seconds.
Processing episode #36	 Title: 36 - Marco and Aubrey Niccoli, tips on healthy cooking from famous professional chefs



 91%|████████████████████████████████████████████████████████████████████▉       | 58/64 [3:15:57<19:09, 191.62s/it][A

Transcription completed in 191.64 seconds.
Processing episode #37	 Title: 37- Dr. Lyndi Jones and Dr. Michelle Jorgensen: how pediatric health issues related to dental health



 92%|██████████████████████████████████████████████████████████████████████      | 59/64 [3:19:40<16:45, 201.00s/it][A

Transcription completed in 222.89 seconds.
Processing episode #38	 Title: 38- Dr. Heather McKee - Health behavior change psychologist



 94%|███████████████████████████████████████████████████████████████████████▎    | 60/64 [3:23:17<13:43, 205.93s/it][A

Transcription completed in 217.44 seconds.
Processing episode #39	 Title: 39-Dr. Ana Maria Temple, pediatrician, how to best help your child with winter illness



 95%|████████████████████████████████████████████████████████████████████████▍   | 61/64 [3:27:02<10:34, 211.52s/it][A

Transcription completed in 224.54 seconds.
Processing episode #3	 Title: 3 - Dr. Carrie Jones -- Helping and Healing Your Hormones



 97%|█████████████████████████████████████████████████████████████████████████▋  | 62/64 [3:30:28<07:00, 210.07s/it][A

Transcription completed in 206.68 seconds.
Processing episode #40	 Title: 40 - Surgeon Dr. Cazzell, what diabetics need to know and the importance of foot and wound care



 98%|██████████████████████████████████████████████████████████████████████████▊ | 63/64 [3:33:08<03:15, 195.01s/it][A

Transcription completed in 159.88 seconds.
Processing episode #41	 Title: 41- Shayna Terese Taylor - famous chef talks about cooking with whole nutritious foods



100%|████████████████████████████████████████████████████████████████████████████| 64/64 [3:36:35<00:00, 203.05s/it][A

Transcription completed in 206.26 seconds.





In [8]:
def create_video_url(video_id: str, playlist_id: str='PL8qcvQ7Byc3OJ02hbWJbHWePh4XEg3cvo'):
    '''
    Builds the YouTube watch URL for a video inside a playlist.

    Args:
        video_id: Value placed in the `v` query parameter.
        playlist_id: Value placed in the `list` query parameter.

    Returns:
        The URL as a string. The ids are inserted as given, without
        URL-encoding.
    '''
    watch_endpoint = 'https://www.youtube.com/watch'
    return '{}?v={}&list={}'.format(watch_endpoint, video_id, playlist_id)