In [None]:
import operator
import subprocess
import textwrap
from pathlib import Path
from typing import TypedDict

from dotenv import load_dotenv
from langchain.chat_models import init_chat_model
from langgraph.graph import StateGraph, START, END
from langgraph.types import Send
from openai import OpenAI
from typing_extensions import Annotated

# load environment variables from a local .env file
# (presumably this is where the OpenAI API key comes from — confirm)
load_dotenv()

# chat model shared by the per-chunk summaries and the final summary
llm = init_chat_model("openai:gpt-4o-mini")

class State(TypedDict):
  """Shared state of the video summarization graph.

  The caller supplies 'video_file'; every other key is produced by one of
  the graph nodes as the pipeline runs.
  """
  # path of the input video (provided when the graph is invoked)
  video_file: str
  # path of the audio file written by 'extract_audio'
  audio_file: str
  # full transcript text produced by 'transcribe_audio'
  transcription: str
  # per-chunk summaries; the operator.add annotation lets the one-element
  # lists returned by the parallel 'summarize_chunk' runs be concatenated
  # instead of overwriting each other
  summaries: Annotated[list[str], operator.add]
  # combined summary produced by 'mega_summary'
  final_summary: str

In [None]:
# extract the audio of mp4 file using ffmpeg
def extract_audio(state: State):
  """Extract a sped-up MP3 audio track from the input video with ffmpeg.

  Reads state["video_file"], writes the audio next to it under the same
  name with the extension swapped to ".mp3", and returns the update for
  the 'audio_file' state key.

  Raises:
    subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
    FileNotFoundError: if the ffmpeg executable cannot be found.
  """
  video_file = state["video_file"]

  # swap only the file extension: a plain str.replace("mp4", "mp3") would
  # also rewrite "mp4" inside directory/file names and would leave inputs
  # such as "clip.MP4" or "clip.mov" unchanged (output == input)
  output_file = str(Path(video_file).with_suffix(".mp3"))

  # ffmpeg command to extract audio from video
  # e.g. $ ffmpeg -i input.mp4 -filter:a atempo=2.0 -y input.mp3
  command = [
    "ffmpeg",
    "-i",
    video_file,
    # apply a filter to the audio stream
    "-filter:a",
    # play the audio at double speed so the file sent for transcription is
    # half as long (the original author notes this keeps transcription
    # quality while being cheaper)
    "atempo=2.0",
    # answer yes to all prompts (overwrite the output file if it exists)
    "-y",
    output_file,
  ]

  # check=True surfaces an ffmpeg failure here instead of letting the next
  # node fail later on a missing or stale audio file
  subprocess.run(command, check=True)

  # update the 'audio_file' state
  return {
    "audio_file": output_file
  }

# transcribe the audio file using whisper
def transcribe_audio(state: State):
  """Transcribe the extracted audio file with the OpenAI 'whisper-1' model.

  Opens state["audio_file"] in binary mode, sends it to the transcription
  endpoint and returns the result as the update for the 'transcription'
  state key.
  """
  whisper_client = OpenAI()
  audio_path = state["audio_file"]

  # the API expects a file-like object, hence the binary read handle
  with open(audio_path, "rb") as audio_stream:
    transcript = whisper_client.audio.transcriptions.create(
      # 'file' and 'model' are required, the rest is optional
      model="whisper-1",
      file=audio_stream,
      # ISO-639-1 code of the spoken language; per the original note,
      # supplying it improves accuracy and latency
      language="en",
      # ask for the transcript as plain text
      response_format="text",
      # context hint for the model (place names expected in the audio)
      prompt="Netherlands, Rotterdam, Amsterdam, The Hague",
    )

  # update the 'transcription' state
  return {"transcription": transcript}

# dispatch the transcription into chunks to 'summarize_chunk' node
def dispatch_summarizers(state: State):
  """Split the transcript into chunks and fan them out to 'summarize_chunk'.

  The transcript is wrapped into pieces of at most 500 characters; each
  piece is paired with a 1-based id and wrapped in a Send targeting the
  'summarize_chunk' node. Returns the list of Send objects.
  """
  # textwrap.wrap breaks the text into lines no longer than the given width
  pieces = textwrap.wrap(state["transcription"], 500)

  # one Send per chunk so the chunks are summarized in parallel
  return [
    Send("summarize_chunk", {"id": number, "text": piece})
    for number, piece in enumerate(pieces, start=1)
  ]

# summarize each chunk of transcription
def summarize_chunk(chunk):
  """Summarize a single transcript chunk with the LLM.

  `chunk` is a dict with the chunk's "id" and its "text". Returns a
  one-element 'summaries' update whose entry is the LLM answer prefixed
  with "[Chunk <id>]".
  """
  number = chunk["id"]
  passage = chunk["text"]

  # prompt asking the LLM for a summary of this chunk
  request = f"""
    Please summarize the following text.

    Text: {passage}
    """
  reply = llm.invoke(request)

  # tag the summary with its chunk id and wrap it in a list for the
  # 'summaries' state update
  return {"summaries": [f"[Chunk {number}] {reply.content}"]}

# create the final summary of the transcription
def mega_summary(state: State):
  """Merge the per-chunk summaries into one final summary via the LLM.

  Joins every entry of state["summaries"] with newlines, asks the LLM for
  a comprehensive summary of them and returns the answer as the update
  for the 'final_summary' state key.
  """
  # one chunk summary per line
  joined = "\n".join(state["summaries"])

  reply = llm.invoke(f"""
    You are given multiple summaries of different chunks from a video transcription.

    Please create a comprehensive final summary that combines all the key points.

    Individual summaries: {joined} 
  """)

  # update the 'final_summary' state
  return {"final_summary": reply.content}

In [None]:

# create the state graph; State declares the keys shared by all nodes
graph_builder = StateGraph(State)

# register the nodes using the functions defined above
graph_builder.add_node("extract_audio", extract_audio)
graph_builder.add_node("transcribe_audio", transcribe_audio)
graph_builder.add_node("summarize_chunk", summarize_chunk)
graph_builder.add_node("mega_summary", mega_summary)

# wire the pipeline: START -> extract_audio -> transcribe_audio,
# then fan out via dispatch_summarizers (one Send per transcript chunk, each
# targeting 'summarize_chunk'), then on to mega_summary and END
graph_builder.add_edge(START, "extract_audio")
graph_builder.add_edge("extract_audio", "transcribe_audio")
graph_builder.add_conditional_edges("transcribe_audio", dispatch_summarizers, ["summarize_chunk"])
graph_builder.add_edge("summarize_chunk", "mega_summary")
graph_builder.add_edge("mega_summary", END)


# compile the builder into the graph object that is invoked below
graph = graph_builder.compile()

# last expression of the cell: shows the compiled graph as the cell output
graph

In [None]:
# run the whole pipeline on a local video; only 'video_file' is supplied here,
# the remaining state keys are filled in by the nodes
graph.invoke({"video_file": "video.mp4"})