Init the WhisperX custom model on your machine with `nos serve up -c serve.yaml --env-file ./.env`.

Copy `.env.template` into `.env` and populate fields (should just be `HUGGINGFACE_HUB_TOKEN`)

In [None]:
# Connect NOS client to the running server
import nos
from nos.client import Client
from nos.logging import logger

client = Client()

logger.debug("Waiting for server to start...")
client.WaitForServer()
 
logger.debug("Confirming server is healthy...")
if not client.IsHealthy():
    raise RuntimeError("NOS server is not healthy")

In [None]:
from pathlib import Path
import tempfile

def trim_audio(audio_path: Path, duration_s: int = 600) -> Path:
    import ffmpeg
    with tempfile.NamedTemporaryFile(suffix=Path(audio_path).suffix, delete=False) as tmp:
        audio_trimmed = ffmpeg.input(str(audio_path)).audio.filter("atrim", duration=duration_s)
        audio_output = ffmpeg.output(audio_trimmed, tmp.name)
        ffmpeg.run(audio_output, overwrite_output=True)
        return Path(tmp.name)

def download_youtube_url_and_transcribe(url):
    from yt_dlp import YoutubeDL

    ydl_opts = {
        "format": "bestaudio/best", 
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "wav",
                "preferredquality": "192",
            }
        ],
    }

    with YoutubeDL(ydl_opts) as ydl:
        # set download location to current directory
        info_dict = ydl.extract_info(url, download=False)
        output_filename = ydl.prepare_filename(info_dict)
        audio_filename = output_filename.replace(".webm", ".wav")
        error_code = ydl.download([url]) 
        assert error_code == 0

    # run transcription
    whisperx = client.Module("m-bain/whisperx-large-v2")
    assert whisperx is not None
    assert whisperx.GetModelInfo() is not None

    with client.UploadFile(trim_audio(audio_filename)) as remote_path:
        response = whisperx.transcribe(path=remote_path, batch_size=96)
    
        return response

In [None]:
# Youtube: Conversation with Ray Dalio
transcription = download_youtube_url_and_transcribe("https://www.youtube.com/watch?v=Tfrrubw7pcE")

In [None]:
# WhisperX breaks transcriptions into segments and segments into words tagged with a speaker ID
transcription['segments'][0]

In [None]:
# Join words to each speaker ID for summarization through the OpenAI chat completion API
def join_segments(segments):
    user_texts = {}
    for segment in segments:
        for word_dict in segment['words']:
            if not 'speaker' in word_dict:
                continue
            speaker = word_dict['speaker']
            if speaker not in user_texts:
                user_texts[speaker] = []
            user_texts[speaker].append(word_dict['word'])
    for speaker in user_texts:
        user_texts[speaker] = ' '.join(user_texts[speaker])
    return user_texts

In [None]:
joined = join_segments(transcription['segments'])
# double check that the audio was well transcribed
joined

In [None]:
# Install openai if not already installed
!pip install openai

In [1]:
import openai
import requests
import json
import os

url = "https://api.openai.com/v1/chat/completions"

api_key = os.getenv("OPENAI_API_KEY")

headers = {
    "Content-Type": "application/json",
    "Authorization": "Bearer " + api_key,
}

speakers = list(joined.keys())
first_speaker_data = joined[speakers[0]]

data = {
    "model": "gpt-3.5-turbo",
    "messages": [
      {
        "role": "system",
        "content": "You are a summarization bot for youtube videos. you provide two sentence descriptions."
      },
      {
        "role": "user",
        "content": "Summarize the following transcript: " + first_speaker_data
      }
    ],
    "max_tokens": 100,
    "temperature": 0.3,
}

response = requests.post(url, headers=headers, data=json.dumps(data))    
summary = response.json()['choices'][0]['message']['content']
print(f"Summary: {summary}")


TypeError: can only concatenate str (not "NoneType") to str