# Introduction to GPT-4o and GPT-4o mini

https://cookbook.openai.com/examples/gpt4o/introduction_to_gpt4o

In [1]:
from openai import OpenAI
import os

# Model name passed as `model=` to every chat completion request in this notebook.
MODEL = "gpt-4o-mini"
# Client built with no explicit arguments; credentials are presumably read from
# the environment (e.g. OPENAI_API_KEY) — confirm against the SDK documentation.
client = OpenAI()

In [None]:
# Minimal text-only request: a system prompt that sets the assistant's role
# followed by a single user question.
completion = client.chat.completions.create(
    model=MODEL,
    messages=[
        {"role": "system", "content": "You are a helpful assistant. Help me with my math homework!"},
        {"role": "user", "content": "Hello! Could you solve 2+2?"},
    ],
)

# Use an f-string rather than `+`: if the returned message content is None,
# string concatenation would raise TypeError instead of printing anything.
print(f"Assistant: {completion.choices[0].message.content}")

## 画像の入力

In [None]:
from IPython.display import Image, display, Audio, Markdown
import base64

# Local sample image; the path is relative to the notebook's working directory.
IMAGE_PATH = "../data/triangle.png"

# Render the image inline so the input can be checked before it is sent to the model.
display(Image(IMAGE_PATH))

In [None]:
def encode_image(image_path):
    """Return the contents of the file at *image_path* as a base64 string.

    The file is read in binary mode in one go, base64-encoded, and the
    resulting bytes are decoded as UTF-8 so the value can be embedded in text
    (for example inside a ``data:`` URL).
    """
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


# Encode the sample image once; the string is embedded in a data URL in the request below.
base64_image = encode_image(IMAGE_PATH)
# Cell output: confirms the encoded image is a `str` rather than `bytes`.
type(base64_image)

In [None]:
# Ask the model about the local image by embedding it as a base64 data URL in a
# multi-part user message (one text part followed by one image part).
response = client.chat.completions.create(
    model=MODEL,
    temperature=0.0,
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant that responds in Markdown. Help me with my math homework!",
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What's the area of the triangle?"},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{base64_image}"},
                },
            ],
        },
    ],
)

print(response.choices[0].message.content)

In [None]:
# Same multi-part request shape, but the image part points at a public URL
# instead of carrying inline base64 data.
response = client.chat.completions.create(
    model=MODEL,
    temperature=0.0,
    messages=[
        {
            "role": "system",
            "content": "あなたは有能なアシスタントです。",
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "この絵のタイトルと作者は誰ですか？"},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://upload.wikimedia.org/wikipedia/commons/7/74/Monet_dejeunersurlherbe.jpg",
                    },
                },
            ],
        },
    ],
)

print(response.choices[0].message.content)

## 動画の入力

In [8]:
import cv2
from moviepy.editor import VideoFileClip
import time
import base64

# Sample video used by the frame- and audio-extraction cells below.
VIDEO_PATH = "../data/keynote_recap.mp4"

In [None]:
# Accumulates the sampled video frames as base64-encoded JPEG strings.
base64Frames = []
# Video path without its extension; reused later to name the extracted audio file.
base_video_path, _ = os.path.splitext(VIDEO_PATH)
base_video_path

In [None]:
# Open the video with OpenCV so individual frames can be read from it.
video = cv2.VideoCapture(VIDEO_PATH)
video

In [None]:
# Frame count reported by OpenCV for the video, converted to an int.
total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
total_frames

In [None]:
# Frames per second reported by OpenCV for the video.
fps = video.get(cv2.CAP_PROP_FPS)
fps

In [None]:
# Sampling stride in frames: fps * 2, i.e. one sampled frame per two seconds of video.
frames_to_skip = int(fps * 2)
frames_to_skip

In [16]:
# Sample one frame every `frames_to_skip` frames and store each one as a
# base64-encoded JPEG string in `base64Frames`.
#
# Guard the stride: when the reported FPS is below 0.5 (OpenCV can report 0 for
# properties the backend does not support), `frames_to_skip` is 0 and the loop
# would re-read frame 0 forever. Always advance by at least one frame.
frame_step = max(1, frames_to_skip)
curr_frame = 0
while curr_frame < total_frames - 1:
    # Seek to the target frame, then decode it.
    video.set(cv2.CAP_PROP_POS_FRAMES, curr_frame)
    success, frame = video.read()
    if not success:
        break
    encoded_ok, buffer = cv2.imencode(".jpg", frame)
    # Keep only frames that were actually encoded; skip the rest.
    if encoded_ok:
        base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
    curr_frame += frame_step

In [17]:
# Release the OpenCV capture now that frame sampling is finished.
video.release()

In [None]:
# Number of frames collected in base64Frames.
len(base64Frames)

In [None]:
# Extract the audio track from the video into an MP3 that shares the video's
# base path (same location, ".mp3" extension), written with bitrate="32k".
audio_path = f"{base_video_path}.mp3"
clip = VideoFileClip(VIDEO_PATH)
try:
    clip.audio.write_audiofile(audio_path, bitrate="32k")
finally:
    # Always close the clip, even if the export fails part-way, so the
    # underlying file handles are not left open in the kernel.
    if clip.audio is not None:
        clip.audio.close()
    clip.close()

In [None]:
# Frame count again; the audio extraction cell does not modify base64Frames.
len(base64Frames)

In [None]:
# Path of the MP3 file produced by the audio-extraction cell.
audio_path

In [None]:
# Play the sampled frames back as a quick slideshow. The first line is an
# IPython magic, so this cell only runs inside IPython/Jupyter.
%matplotlib inline
# A single display handle is created once and then updated in place per frame.
display_handle = display(None, display_id=True)
for img in base64Frames:
    # Frames are stored as base64 text; decode back to raw image bytes for display.
    display_handle.update(Image(data=base64.b64decode(img.encode("utf-8")), width=600))
    # Short pause between frames.
    time.sleep(0.025)

In [None]:
# Show the extracted MP3 through IPython's Audio display object.
Audio(audio_path)

In [30]:
# Scratch cell: builds the per-frame `image_url` payload dict for every sampled
# frame but discards each one; nothing is stored or sent.
for x in map(lambda x: {"type": "image_url", "image_url": {"url": f"data:image/jpg;base64,{x}", "detail": "low"}}, base64Frames):
    pass

In [None]:
# Summarize the video from its sampled frames only (no audio information).
response = client.chat.completions.create(
    model=MODEL,
    messages=[
        {"role": "system", "content": "You are generating a video summary. Please provide a summary of the video. Respond in Markdown."},
        {"role": "user", "content": [
            # Every content part is a typed object, matching the other
            # multi-part requests in this notebook; a bare string inside the
            # list is not one of the documented part shapes.
            {"type": "text", "text": "These are the frames from the video."},
            # One image part per sampled frame, each sent with "detail": "low".
            *[
                {"type": "image_url",
                 "image_url": {"url": f"data:image/jpg;base64,{frame}", "detail": "low"}}
                for frame in base64Frames
            ],
        ]},
    ],
    temperature=0,
)

print(response.choices[0].message.content)

## 動画の音声の処理

現状、audioは入力できないため書き起こしてテキストを入力する

In [32]:
# Transcribe the extracted audio with the "whisper-1" model.
# The file is opened in a `with` block so the handle is closed as soon as the
# request returns instead of staying open for the rest of the kernel session.
with open(audio_path, "rb") as audio_file:
    transcription = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file,
    )

In [None]:
# Print the whole transcription response object; its text is read via
# `transcription.text` in the summary requests below.
print(transcription)

In [34]:
# Summarize the audio alone: only the transcription text is sent to the model.
response = client.chat.completions.create(
    model=MODEL,
    temperature=0.0,
    messages=[
        {
            "role": "system",
            "content": "You are generating a transcript summary. Create a summary of the provided transcription. Respond in Markdown.",
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": f"The audio transcription is: {transcription.text}"},
            ],
        },
    ],
)

In [None]:
# Show the transcript-based summary returned by the model.
print(response.choices[0].message.content)

## 動画の画像と音声の書き起こしの両方を入力

In [36]:
# Generate a summary from both the visual frames and the audio transcript.
response = client.chat.completions.create(
    model=MODEL,
    messages=[
        {"role": "system", "content": """You are generating a video summary. Create a summary of the provided video and its transcript. Respond in Markdown"""},
        {"role": "user", "content": [
            # Every content part is a typed object, matching the transcript
            # part below; a bare string inside the list is not one of the
            # documented part shapes.
            {"type": "text", "text": "These are the frames from the video."},
            # One image part per sampled frame, each sent with "detail": "low".
            *[
                {"type": "image_url",
                 "image_url": {"url": f"data:image/jpg;base64,{frame}", "detail": "low"}}
                for frame in base64Frames
            ],
            {"type": "text", "text": f"The audio transcription is: {transcription.text}"},
        ]},
    ],
    temperature=0,
)

In [None]:
# Show the summary built from both the frames and the transcript.
print(response.choices[0].message.content)