## **Import required libraries**

In [None]:
import os
import cv2
from moviepy.editor import VideoFileClip
import time
import base64

from langchain_openai import AzureChatOpenAI
from langchain_core.messages import SystemMessage, HumanMessage

## **Load video and process it into frames**

In [None]:
def process_video(video_path, seconds_per_frame=2):
    """Sample frames from a video and return them as base64-encoded JPEG strings.

    Args:
        video_path: Path of the video file handed to ``cv2.VideoCapture``.
        seconds_per_frame: Spacing, in seconds of footage, between two sampled
            frames. Intervals that work out to less than one frame are treated
            as "sample every frame".

    Returns:
        list[str]: One base64 string per sampled frame, in playback order.
        The list is empty when no frame could be read.
    """
    base64Frames = []

    video = cv2.VideoCapture(video_path)
    try:
        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = video.get(cv2.CAP_PROP_FPS)
        # int() truncates to 0 when fps * seconds_per_frame < 1 (or when the
        # reported fps is 0); a zero step would never advance and loop forever,
        # so always move forward by at least one frame.
        frames_to_skip = max(1, int(fps * seconds_per_frame))

        # Walk the video at the sampling step. The upper bound deliberately
        # stays at total_frames - 1, so the very last frame index is never
        # requested (unchanged from the previous behaviour).
        for curr_frame in range(0, total_frames - 1, frames_to_skip):
            video.set(cv2.CAP_PROP_POS_FRAMES, curr_frame)
            success, frame = video.read()
            if not success:
                break
            _, buffer = cv2.imencode(".jpg", frame)
            base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
    finally:
        # Release the capture handle even if seeking/encoding raised.
        video.release()

    print(f"Extracted {len(base64Frames)} frames")
    return base64Frames

## **Process videos**

In [None]:
# Location of the clip to summarize; replace the placeholder with a real file.
video_path = "path/to/video.mp4"

# Sample one frame per second of footage and keep the base64-encoded JPEGs.
base64Frames = process_video(video_path=video_path, seconds_per_frame=1)

Extracted 4 frames


## **Use AzureChatOpenAI for summarizing videos**

The visual summary is generated by sending the model only the frames sampled from the video. Because no audio track or transcript is included, the model is likely to capture the visual aspects but will miss anything that is only said by the speaker.

In [None]:
# AzureChatOpenAI needs the resource endpoint. Fail fast with a readable
# message instead of the TypeError that assigning None into os.environ raises.
if not os.getenv("AZURE_OPENAI_ENDPOINT"):
    raise EnvironmentError(
        "Set the AZURE_OPENAI_ENDPOINT environment variable before running this cell."
    )

# Without frames the request would carry no images at all.
if not base64Frames:
    raise ValueError("No frames were extracted from the video; nothing to summarize.")

model = AzureChatOpenAI(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    openai_api_version=os.getenv("AZURE_OAI_API_VERSION"),
    azure_deployment=os.getenv("AZURE_OAI_DEPLOYMENT"),
    openai_api_key=os.getenv("AZURE_OAI_KEY"),
    openai_api_type="azure",  # this client talks to Azure OpenAI, not api.openai.com
    temperature=0.0,
    streaming=False,
)

# One image part per sampled frame. The frames were encoded with ".jpg", so
# the data URL uses the registered JPEG MIME type (image/jpeg).
frame_parts = [
    {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{frame}", "detail": "low"},
    }
    for frame in base64Frames
]

messages = [
    SystemMessage(
        content="You are generating a video summary. Please provide a summary of the video. Respond in Markdown."
    ),
    HumanMessage(
        content=[
            {"type": "text", "text": "These are the frames from the video."},
            *frame_parts,
        ]
    ),
]

ai_message = model.invoke(messages)
video_summary = ai_message.content

## **Output video summary**

In [None]:
# Bare last expression: the notebook displays the summary text returned by the model.
video_summary

"The frames from the video appear to be from an ultrasound scan. Ultrasound imaging is commonly used in medical settings to visualize internal organs, tissues, and, in some cases, developing fetuses. The images show various cross-sectional views, likely of an abdominal region or a developing fetus, though it's not entirely clear without more specific context.\n\n### Summary\n- **Type of Video**: Medical ultrasound imaging\n- **Content**: The video consists of a series of ultrasound frames showing internal anatomy.\n- **Purpose**: Likely diagnostic, monitoring, or examination of internal structures.\n\nThese types of images are typically used by healthcare professionals for diagnosis or to monitor the health and development of organs or fetuses."