Skip to content

Commit

Permalink
feat: add video description into agent func (#585)
Browse files Browse the repository at this point in the history
Co-authored-by: Wendong <w3ndong.fan@gmail.com>
  • Loading branch information
raywhoelse and Wendong-Fan authored Jun 14, 2024
1 parent a170ce1 commit 849896c
Show file tree
Hide file tree
Showing 18 changed files with 391 additions and 119 deletions.
181 changes: 135 additions & 46 deletions camel/messages/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from dataclasses import dataclass
from typing import Any, Dict, List, Literal, Optional, Tuple, Union

import numpy as np
from PIL import Image

from camel.messages import (
Expand All @@ -27,10 +28,11 @@
from camel.prompts import CodePrompt, TextPrompt
from camel.types import (
OpenAIBackendRole,
OpenAIImageDetailType,
OpenAIImageType,
OpenAIVisionDetailType,
RoleType,
)
from camel.utils import Constants


@dataclass
Expand All @@ -39,36 +41,54 @@ class BaseMessage:
Args:
role_name (str): The name of the user or assistant role.
role_type (RoleType): The type of role, either
:obj:`RoleType.ASSISTANT` or :obj:`RoleType.USER`.
role_type (RoleType): The type of role, either :obj:`RoleType.
ASSISTANT` or :obj:`RoleType.USER`.
meta_dict (Optional[Dict[str, str]]): Additional metadata dictionary
for the message.
content (str): The content of the message.
video_bytes (Optional[bytes]): Optional bytes of a video associated
with the message. Default is None.
image_list (Optional[List[Image.Image]]): Optional list of PIL Image
objects associated with the message. Default is None.
image_detail (Literal["auto", "low", "high"]): Detail level of the
images associated with the message. Default is "auto".
video_detail (Literal["auto", "low", "high"]): Detail level of the
videos associated with the message. Default is "low".
"""

role_name: str
role_type: RoleType
meta_dict: Optional[Dict[str, str]]
content: str
image: Optional[Image.Image] = None
video_bytes: Optional[bytes] = None
image_list: Optional[List[Image.Image]] = None
image_detail: Literal["auto", "low", "high"] = "auto"
video_detail: Literal["auto", "low", "high"] = "low"

@classmethod
def make_user_message(
cls,
role_name: str,
content: str,
meta_dict: Optional[Dict[str, str]] = None,
image: Optional[Image.Image] = None,
image_detail: Union[OpenAIImageDetailType, str] = "auto",
) -> 'BaseMessage':
video_bytes: Optional[bytes] = None,
image_list: Optional[List[Image.Image]] = None,
image_detail: Union[
OpenAIVisionDetailType, str
] = OpenAIVisionDetailType.AUTO,
video_detail: Union[
OpenAIVisionDetailType, str
] = OpenAIVisionDetailType.LOW,
) -> "BaseMessage":
return cls(
role_name,
RoleType.USER,
meta_dict,
content,
image,
OpenAIImageDetailType(image_detail).value,
video_bytes,
image_list,
OpenAIVisionDetailType(image_detail).value,
OpenAIVisionDetailType(video_detail).value,
)

@classmethod
Expand All @@ -77,16 +97,24 @@ def make_assistant_message(
role_name: str,
content: str,
meta_dict: Optional[Dict[str, str]] = None,
image: Optional[Image.Image] = None,
image_detail: Union[OpenAIImageDetailType, str] = "auto",
) -> 'BaseMessage':
video_bytes: Optional[bytes] = None,
image_list: Optional[List[Image.Image]] = None,
image_detail: Union[
OpenAIVisionDetailType, str
] = OpenAIVisionDetailType.AUTO,
video_detail: Union[
OpenAIVisionDetailType, str
] = OpenAIVisionDetailType.LOW,
) -> "BaseMessage":
return cls(
role_name,
RoleType.ASSISTANT,
meta_dict,
content,
image,
OpenAIImageDetailType(image_detail).value,
video_bytes,
image_list,
OpenAIVisionDetailType(image_detail).value,
OpenAIVisionDetailType(video_detail).value,
)

def create_new_instance(self, content: str) -> "BaseMessage":
Expand Down Expand Up @@ -241,46 +269,107 @@ def to_openai_user_message(self) -> OpenAIUserMessage:
Returns:
OpenAIUserMessage: The converted :obj:`OpenAIUserMessage` object.
"""
if self.image is None:
return {"role": "user", "content": self.content}
else:
#
if self.image.format is None:
raise ValueError(
f"Image's `format` is `None`, please "
f"transform the `PIL.Image.Image` to one of "
f"following supported formats, such as "
f"{list(OpenAIImageType)}"
)

image_type: str = self.image.format.lower()
if image_type not in OpenAIImageType:
raise ValueError(
f"Image type {self.image.format} "
f"is not supported by OpenAI vision model"
)
with io.BytesIO() as buffer:
self.image.save(fp=buffer, format=self.image.format)
encoded_image = base64.b64encode(buffer.getvalue()).decode(
"utf-8"
)
image_prefix = f"data:image/{image_type};base64,"
hybird_content: List[Any] = []
hybird_content.append(
{
"type": "text",
"text": self.content,
}
)

return {
"role": "user",
"content": [
{
"type": "text",
"text": self.content,
},
if self.image_list and len(self.image_list) > 0:
for image in self.image_list:
if image.format is None:
raise ValueError(
f"Image's `format` is `None`, please "
f"transform the `PIL.Image.Image` to one of "
f"following supported formats, such as "
f"{list(OpenAIImageType)}"
)

image_type: str = image.format.lower()
if image_type not in OpenAIImageType:
raise ValueError(
f"Image type {image.format} "
f"is not supported by OpenAI vision model"
)
with io.BytesIO() as buffer:
image.save(fp=buffer, format=image.format)
encoded_image = base64.b64encode(buffer.getvalue()).decode(
"utf-8"
)
image_prefix = f"data:image/{image_type};base64,"
hybird_content.append(
{
"type": "image_url",
"image_url": {
"url": f"{image_prefix}{encoded_image}",
"detail": self.image_detail,
},
}
)

if self.video_bytes:
import imageio.v3 as iio

base64Frames: List[str] = []
frame_count = 0
# read video bytes
video = iio.imiter(
self.video_bytes, plugin=Constants.VIDEO_DEFAULT_PLUG_PYAV
)

for frame in video:
frame_count += 1
if (
frame_count % Constants.VIDEO_IMAGE_EXTRACTION_INTERVAL
== 0
):
# convert frame to numpy array
frame_array = np.asarray(frame)
frame_image = Image.fromarray(frame_array)

# Get the dimensions of the frame
width, height = frame_image.size

# resize the frame to the default image size
new_width = Constants.VIDEO_DEFAULT_IMAGE_SIZE
aspect_ratio = width / height
new_height = int(new_width / aspect_ratio)
resized_img = frame_image.resize((new_width, new_height))

# encode the image to base64
with io.BytesIO() as buffer:
image_format = OpenAIImageType.JPEG.value
image_format = image_format.upper()
resized_img.save(fp=buffer, format=image_format)
encoded_image = base64.b64encode(
buffer.getvalue()
).decode("utf-8")

base64Frames.append(encoded_image)

for encoded_image in base64Frames:
item = {
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{encoded_image}",
"detail": self.video_detail,
},
],
}

hybird_content.append(item)

if len(hybird_content) > 1:
return {
"role": "user",
"content": hybird_content,
}
# This return just for str message
else:
return {
"role": "user",
"content": self.content,
}

def to_openai_assistant_message(self) -> OpenAIAssistantMessage:
Expand Down
2 changes: 2 additions & 0 deletions camel/prompts/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from .ai_society import AISocietyPromptTemplateDict
from .base import CodePrompt, TextPrompt, TextPromptDict
from .code import CodePromptTemplateDict
from .descripte_video_prompt import DescriptionVideoPromptTemplateDict
from .evaluation import EvaluationPromptTemplateDict
from .misalignment import MisalignmentPromptTemplateDict
from .object_recognition import ObjectRecognitionPromptTemplateDict
Expand All @@ -37,4 +38,5 @@
'PromptTemplateGenerator',
'SolutionExtractionPromptTemplateDict',
'ObjectRecognitionPromptTemplateDict',
'DescriptionVideoPromptTemplateDict',
]
33 changes: 33 additions & 0 deletions camel/prompts/descripte_video_prompt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
from typing import Any

from camel.prompts.base import TextPrompt, TextPromptDict
from camel.types import RoleType


# flake8: noqa: E501
class DescriptionVideoPromptTemplateDict(TextPromptDict):
    """Prompt templates for the video-description task.

    Maps each relevant :obj:`RoleType` to the :obj:`TextPrompt` that the
    corresponding agent should use. Currently only the assistant role is
    populated, with a prompt instructing the model to describe a video.
    """

    ASSISTANT_PROMPT = TextPrompt(
        """You are a master of video analysis.
Please provide a shot description of the content of the current video."""
    )

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        # Register the assistant prompt under its role key.
        self[RoleType.ASSISTANT] = self.ASSISTANT_PROMPT
4 changes: 4 additions & 0 deletions camel/prompts/task_prompt_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
TextPromptDict,
)
from camel.prompts.code import CodePromptTemplateDict
from camel.prompts.descripte_video_prompt import (
DescriptionVideoPromptTemplateDict,
)
from camel.prompts.evaluation import (
EvaluationPromptTemplateDict,
)
Expand Down Expand Up @@ -57,5 +60,6 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
TaskType.SOLUTION_EXTRACTION: SolutionExtractionPromptTemplateDict(), # noqa: E501
TaskType.ROLE_DESCRIPTION: RoleDescriptionPromptTemplateDict(),
TaskType.OBJECT_RECOGNITION: ObjectRecognitionPromptTemplateDict(), # noqa: E501
TaskType.DESCRIPTE_VIDEO: DescriptionVideoPromptTemplateDict(),
}
)
5 changes: 3 additions & 2 deletions camel/types/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
EmbeddingModelType,
ModelType,
OpenAIBackendRole,
OpenAIImageDetailType,
OpenAIImageType,
OpenAIVisionDetailType,
OpenAPIName,
RoleType,
StorageType,
Expand Down Expand Up @@ -58,8 +58,9 @@
'ChatCompletionAssistantMessageParam',
'ChatCompletionFunctionMessageParam',
'CompletionUsage',
'OpenAIVideoType',
'OpenAIImageType',
'OpenAIImageDetailType',
'OpenAIVisionDetailType',
'OpenAPIName',
'AudioModelType',
'VoiceType',
Expand Down
3 changes: 2 additions & 1 deletion camel/types/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ class TaskType(Enum):
ROLE_DESCRIPTION = "role_description"
OBJECT_RECOGNITION = "object_recognition"
DEFAULT = "default"
DESCRIPTE_VIDEO = "descripte_video"


class VectorDistance(Enum):
Expand Down Expand Up @@ -238,7 +239,7 @@ class OpenAIImageType(Enum, metaclass=OpenAIImageTypeMeta):
GIF = "gif"


class OpenAIImageDetailType(Enum):
class OpenAIVisionDetailType(Enum):
AUTO = "auto"
LOW = "low"
HIGH = "high"
Expand Down
2 changes: 2 additions & 0 deletions camel/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
text_extract_from_web,
to_pascal,
)
from .constants import Constants
from .token_counting import (
AnthropicTokenCounter,
BaseTokenCounter,
Expand All @@ -52,6 +53,7 @@
'BaseTokenCounter',
'OpenAITokenCounter',
'OpenSourceTokenCounter',
'Constants',
'text_extract_from_web',
'create_chunks',
'dependencies_required',
Expand Down
26 changes: 26 additions & 0 deletions camel/utils/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========


class Constants:
    """Project-wide constants for extracting still images from video."""

    # Default size (in pixels) of images extracted from a video; frames
    # are resized to this dimension before being encoded.
    VIDEO_DEFAULT_IMAGE_SIZE = 768

    # Sampling interval: one frame is kept out of every this many frames
    # when converting a video into a sequence of images.
    VIDEO_IMAGE_EXTRACTION_INTERVAL = 50

    # Name of the imageio plugin used by default to decode video bytes.
    VIDEO_DEFAULT_PLUG_PYAV = "pyav"
Loading

0 comments on commit 849896c

Please sign in to comment.