In [1]:
import json
import os
import re
import time
from typing import List, Union

import pandas as pd
from faster_whisper import WhisperModel
from tqdm import tqdm

### Load cache of metadata

In [2]:
# Read the saved video metadata. The encoding is pinned to UTF-8 so the result
# does not depend on the platform's default locale encoding.
with open('./data/huberman_meta.json', encoding='utf-8') as meta_file:
    data = json.load(meta_file)

In [3]:
# Index the metadata records by their 'videoId' so they can be looked up directly.
cache = dict((record['videoId'], record) for record in data)

### Load model

In [4]:
# Instantiate the 'large-v2' Whisper model on CUDA device 0 using float16 compute.
model = WhisperModel(
    model_size_or_path='large-v2',
    device='cuda',
    device_index=0,
    compute_type='float16',
)

In [5]:
video_dir = './data/videos/huberman/'
# Match on the full '.mp4' extension so that names which merely end in the
# letters "mp4" (without the dot) are not picked up.
files = sorted(
    os.path.join(video_dir, name)
    for name in os.listdir(video_dir)
    if name.endswith('.mp4')
)
# Derive the transcript directory from video_dir. The previous literal pointed
# at './data/video/...' (singular), which does not match video_dir.
# NOTE(review): assumed to be a typo -- confirm the intended location.
transcript_dir = os.path.join(video_dir, 'transcripts', '')
files

['./data/videos/huberman/GpgqXCkRO-w.mp4',
 './data/videos/huberman/ccrbE0QHy94.mp4',
 './data/videos/huberman/n28W4AmvMDE.mp4']

In [7]:
def transcribe_podcast(whisper_model: 'WhisperModel',
                       file_path: str,
                       outpath_dir: str,
                       cache: dict,
                       combine_metadata: bool=True
                       ) -> Union[dict, str]:
    '''
    Transcribes a single podcast media file and optionally saves the result.

    The video id is taken from the file name (directory and extension removed)
    and is used to look up the episode's metadata in `cache`.

    Args:
        whisper_model: Model whose `transcribe` method returns the segments
            (each exposing a `text` attribute) as the first item of a pair.
        file_path: Path to the media file; its base name without extension
            must be a key of `cache`.
        outpath_dir: Directory that receives `<video_id>.json` when
            `combine_metadata` is True. It is created if it does not exist.
        cache: Mapping of video id -> metadata dict (must contain 'title').
            The mapping and its entries are not modified.
        combine_metadata: If True, the transcript is merged into a copy of the
            metadata under the 'text' key, written to disk as JSON and
            returned. If False, only the transcript string is returned and
            nothing is written.

    Returns:
        The metadata dict including 'text' when `combine_metadata` is True,
        otherwise the transcript string.

    Raises:
        KeyError: If the video id derived from `file_path` is not in `cache`.
    '''
    start = time.perf_counter()
    video_id = os.path.splitext(os.path.basename(file_path))[0]
    try:
        metadata = cache[video_id]
    except KeyError:
        raise KeyError(f'No metadata found in cache for video id {video_id!r}') from None
    title = metadata['title']
    print(f'Processing Title: {title}')

    # faster-whisper hands the segments back as a generator, so the decoding
    # happens while they are consumed; stop the timer only after the join.
    segments, _ = whisper_model.transcribe(file_path, beam_size=5, word_timestamps=False)
    transcript = ' '.join(seg.text.strip() for seg in segments)
    elapsed = time.perf_counter() - start
    print(f'Transcription completed in {elapsed:0.2f} seconds.')

    if not combine_metadata:
        return transcript

    # Build a new dict instead of calling metadata.update(), which would write
    # the transcript into the caller's cache entry as a hidden side effect.
    record = {**metadata, 'text': transcript}
    if outpath_dir:
        os.makedirs(outpath_dir, exist_ok=True)
    save_path = os.path.join(outpath_dir, video_id) + '.json'
    with open(save_path, 'w', encoding='utf-8') as f:
        json.dump(record, f)
    return record

In [None]:
# Transcribe the first listed video; the combined metadata/transcript JSON is
# written into the directory given as outpath_dir.
transcribe_podcast(
    whisper_model=model,
    file_path=files[0],
    outpath_dir='./data/videos/huberman/',
    cache=cache,
)

Processing Title: Rick Rubin: Protocols to Access Creative Energy and Process


In [14]:
def func(a: str, b: int, c: dict, **kwargs):
    '''
    Prints `a`, then returns `c` merged with any extra keyword arguments.

    When no extra keyword arguments are passed, `b` is returned instead.
    On key collisions the keyword arguments take precedence over `c`.
    '''
    print(a)
    # Test the mapping itself: any(kwargs) inspects the truthiness of the keys,
    # so a call such as func(..., **{'': 1}) was treated as having no kwargs.
    if kwargs:
        data = {**c, **kwargs}
        print('Kwargs present')
        return data
    return b

In [15]:
# Example call: the extra keyword argument triggers the merged-dict branch.
func(a='something', b=5, c={'cat': 'dog'}, dog='cat')

something
Kwargs present


{'cat': 'dog', 'dog': 'cat'}

Bad pipe message: %s [b'\x8apR\x02G\xe4\'\x13\x12ePK\x93\xdd9\xcbD\xad f\x881\xf0\xa6\x96\xe8dz\x13\xd5#\xa8L"\xb6JJ\xabP\x013\xd2o\x8e{!\x07=\x1ddG\x00\x08\x13\x02\x13\x03\x13']
Bad pipe message: %s [b'']
Bad pipe message: %s [b'\xb4\x99\xb0\xdc\x02\x9d\x98[\x98\xc2\x06\xa1m\x84\x0e2\x11P\x00\x00>\xc0\x14\xc0\n\x009\x008\x007\x006\xc0\x0f\xc0\x05\x005\xc0\x13\xc0\t\x003\x002\x001\x000\xc0\x0e\xc0\x04\x00/\x00\x9a\x00\x99\x00\x98\x00\x97\x00\x96\x00\x07\xc0\x11\xc0\x07\xc0\x0c\xc0\x02\x00\x05\x00\x04\x00\xff\x02\x01\x00\x00C\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01']
Bad pipe message: %s [b'\n']
Bad pipe message: %s [b'J\r\x8f\xd5+\x8c\xfcmn\x08\xc9\xa5\xbd^w\xb6\x8dW\x00\x00\xa2\xc0\x14\xc0\n\x009\x008\x007\x006\x00\x88\x00\x87\x00\x86\x00\x85\xc0\x19\x00:\x00\x89\xc0\x0f\xc0\x05\x005\x00\x84\xc0\x13\xc0\t\x003\x002\x001\x000\x00\x9a\x00\x99\x00\x98\x00\x97\x00E\x00D\x00C\x00B\xc0\x18\x004\x00\x9b\x00F\xc0\x0e\xc0\x04\x00/\x00\x96\x00A\x00\x07\xc0\x11\xc0