In [1]:
import json
import os
import re
import time
from typing import List, Union

import pandas as pd
from faster_whisper import WhisperModel
from tqdm import tqdm

### Load cache of metadata

In [2]:
# Read the metadata JSON file straight from the open file handle.
with open('./data/huberman_meta.json') as meta_file:
    data = json.load(meta_file)

In [3]:
# Index the metadata records by their 'videoId' so they can be looked up per video.
cache = {entry['videoId']: entry for entry in data}

### Load model

In [4]:
# Instantiate the 'large-v2' Whisper model on CUDA device 0 with float16 compute.
model = WhisperModel(
    model_size_or_path='large-v2',
    device='cuda',
    device_index=0,
    compute_type='float16',
)

In [5]:
# Directory holding the episode video files.
video_dir = './data/videos/huberman/'

# Sorted full paths of every .mp4 file in video_dir. Matching on '.mp4'
# (with the dot) avoids picking up names that merely end in the letters "mp4".
files = sorted(
    os.path.join(video_dir, name)
    for name in os.listdir(video_dir)
    if name.endswith('.mp4')
)

# NOTE(review): this used to be the literal './data/video/huberman/transcripts/'
# (singular "video"), which matches no other path in this notebook. It is now
# derived from video_dir so the two cannot drift apart -- confirm this is the
# intended location.
transcript_dir = os.path.join(video_dir, 'transcripts/')
files

['./data/videos/huberman/GpgqXCkRO-w.mp4',
 './data/videos/huberman/ccrbE0QHy94.mp4',
 './data/videos/huberman/n28W4AmvMDE.mp4']

In [7]:
def transcribe_podcast(whisper_model: WhisperModel,
                       file_path: str,
                       outpath_dir: str,
                       cache: dict,
                       combine_metadata: bool=True
                       ) -> Union[dict, str]:
    '''
    Transcribes a single podcast media file and, optionally, saves the
    transcript merged with the episode metadata to disk as JSON.

    Args:
        whisper_model: Model whose `transcribe` method is called on `file_path`
            (with beam_size=5 and word_timestamps=False).
        file_path: Path to the media file. Its base name without the extension
            is used as the video id for the `cache` lookup and the output name.
        outpath_dir: Directory that receives `<video_id>.json` when
            `combine_metadata` is True. It is created if it does not exist.
        cache: Mapping of video id -> metadata dict. Each metadata dict must
            contain a 'title' key. The mapping and its entries are not modified.
        combine_metadata: If True, return (and write to disk) a copy of the
            metadata with the transcript stored under the 'text' key. If False,
            return only the transcript string and write nothing.

    Returns:
        A new metadata dict including 'text' when `combine_metadata` is True,
        otherwise the transcript as a single string.

    Raises:
        KeyError: If the video id derived from `file_path` is not in `cache`,
            or the metadata has no 'title' key.
    '''
    start = time.perf_counter()

    video_id = os.path.splitext(os.path.basename(file_path))[0]
    if video_id not in cache:
        raise KeyError(f'No metadata found in cache for video id: {video_id}')
    metadata = cache[video_id]
    title = metadata['title']
    print(f'Processing Title: {title}')

    if combine_metadata and outpath_dir:
        # Create the output directory up front so a missing folder fails (or is
        # fixed) before the expensive transcription, not after it.
        os.makedirs(outpath_dir, exist_ok=True)

    segments, _ = whisper_model.transcribe(file_path, beam_size=5, word_timestamps=False)
    # Consuming `segments` happens inside the timed region, so the reported
    # duration covers the iteration over all segments.
    transcript = ' '.join(seg.text.strip() for seg in segments)
    elapsed = time.perf_counter() - start
    print(f'Transcription completed in {elapsed:0.2f} seconds.')

    if not combine_metadata:
        return transcript

    # TODO (from original author): create method to write transcript to disk.
    # Work on a shallow copy so the caller's cache entry is not mutated with
    # the (potentially very large) transcript text.
    record = dict(metadata)
    record['text'] = transcript
    save_path = os.path.join(outpath_dir, f'{video_id}.json')
    with open(save_path, 'w', encoding='utf-8') as f:
        json.dump(record, f)
    return record

In [None]:
# Transcribe the first listed video; the returned value is displayed as the cell output.
transcribe_podcast(
    whisper_model=model,
    file_path=files[0],
    outpath_dir='./data/videos/huberman/',
    cache=cache,
)

Processing Title: Rick Rubin: Protocols to Access Creative Energy and Process
