Skip to content

Commit

Permalink
feat: add video description into agent func (#585)
Browse files Browse the repository at this point in the history
Co-authored-by: Wendong <w3ndong.fan@gmail.com>
  • Loading branch information
raywhoelse and Wendong-Fan authored Jun 14, 2024
1 parent a170ce1 commit 849896c
Show file tree
Hide file tree
Showing 18 changed files with 391 additions and 119 deletions.
181 changes: 135 additions & 46 deletions camel/messages/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from dataclasses import dataclass
from typing import Any, Dict, List, Literal, Optional, Tuple, Union

import numpy as np
from PIL import Image

from camel.messages import (
Expand All @@ -27,10 +28,11 @@
from camel.prompts import CodePrompt, TextPrompt
from camel.types import (
OpenAIBackendRole,
OpenAIImageDetailType,
OpenAIImageType,
OpenAIVisionDetailType,
RoleType,
)
from camel.utils import Constants


@dataclass
Expand All @@ -39,36 +41,54 @@ class BaseMessage:
Args:
role_name (str): The name of the user or assistant role.
role_type (RoleType): The type of role, either
:obj:`RoleType.ASSISTANT` or :obj:`RoleType.USER`.
role_type (RoleType): The type of role, either :obj:`RoleType.
ASSISTANT` or :obj:`RoleType.USER`.
meta_dict (Optional[Dict[str, str]]): Additional metadata dictionary
for the message.
content (str): The content of the message.
video_bytes (Optional[bytes]): Optional bytes of a video associated
with the message. Default is None.
image_list (Optional[List[Image.Image]]): Optional list of PIL Image
objects associated with the message. Default is None.
image_detail (Literal["auto", "low", "high"]): Detail level of the
images associated with the message. Default is "auto".
video_detail (Literal["auto", "low", "high"]): Detail level of the
videos associated with the message. Default is "low".
"""

role_name: str
role_type: RoleType
meta_dict: Optional[Dict[str, str]]
content: str
image: Optional[Image.Image] = None
video_bytes: Optional[bytes] = None
image_list: Optional[List[Image.Image]] = None
image_detail: Literal["auto", "low", "high"] = "auto"
video_detail: Literal["auto", "low", "high"] = "low"

@classmethod
def make_user_message(
cls,
role_name: str,
content: str,
meta_dict: Optional[Dict[str, str]] = None,
image: Optional[Image.Image] = None,
image_detail: Union[OpenAIImageDetailType, str] = "auto",
) -> 'BaseMessage':
video_bytes: Optional[bytes] = None,
image_list: Optional[List[Image.Image]] = None,
image_detail: Union[
OpenAIVisionDetailType, str
] = OpenAIVisionDetailType.AUTO,
video_detail: Union[
OpenAIVisionDetailType, str
] = OpenAIVisionDetailType.LOW,
) -> "BaseMessage":
return cls(
role_name,
RoleType.USER,
meta_dict,
content,
image,
OpenAIImageDetailType(image_detail).value,
video_bytes,
image_list,
OpenAIVisionDetailType(image_detail).value,
OpenAIVisionDetailType(video_detail).value,
)

@classmethod
Expand All @@ -77,16 +97,24 @@ def make_assistant_message(
role_name: str,
content: str,
meta_dict: Optional[Dict[str, str]] = None,
image: Optional[Image.Image] = None,
image_detail: Union[OpenAIImageDetailType, str] = "auto",
) -> 'BaseMessage':
video_bytes: Optional[bytes] = None,
image_list: Optional[List[Image.Image]] = None,
image_detail: Union[
OpenAIVisionDetailType, str
] = OpenAIVisionDetailType.AUTO,
video_detail: Union[
OpenAIVisionDetailType, str
] = OpenAIVisionDetailType.LOW,
) -> "BaseMessage":
return cls(
role_name,
RoleType.ASSISTANT,
meta_dict,
content,
image,
OpenAIImageDetailType(image_detail).value,
video_bytes,
image_list,
OpenAIVisionDetailType(image_detail).value,
OpenAIVisionDetailType(video_detail).value,
)

def create_new_instance(self, content: str) -> "BaseMessage":
Expand Down Expand Up @@ -241,46 +269,107 @@ def to_openai_user_message(self) -> OpenAIUserMessage:
Returns:
OpenAIUserMessage: The converted :obj:`OpenAIUserMessage` object.
"""
if self.image is None:
return {"role": "user", "content": self.content}
else:
#
if self.image.format is None:
raise ValueError(
f"Image's `format` is `None`, please "
f"transform the `PIL.Image.Image` to one of "
f"following supported formats, such as "
f"{list(OpenAIImageType)}"
)

image_type: str = self.image.format.lower()
if image_type not in OpenAIImageType:
raise ValueError(
f"Image type {self.image.format} "
f"is not supported by OpenAI vision model"
)
with io.BytesIO() as buffer:
self.image.save(fp=buffer, format=self.image.format)
encoded_image = base64.b64encode(buffer.getvalue()).decode(
"utf-8"
)
image_prefix = f"data:image/{image_type};base64,"
hybird_content: List[Any] = []
hybird_content.append(
{
"type": "text",
"text": self.content,
}
)

return {
"role": "user",
"content": [
{
"type": "text",
"text": self.content,
},
if self.image_list and len(self.image_list) > 0:
for image in self.image_list:
if image.format is None:
raise ValueError(
f"Image's `format` is `None`, please "
f"transform the `PIL.Image.Image` to one of "
f"following supported formats, such as "
f"{list(OpenAIImageType)}"
)

image_type: str = image.format.lower()
if image_type not in OpenAIImageType:
raise ValueError(
f"Image type {image.format} "
f"is not supported by OpenAI vision model"
)
with io.BytesIO() as buffer:
image.save(fp=buffer, format=image.format)
encoded_image = base64.b64encode(buffer.getvalue()).decode(
"utf-8"
)
image_prefix = f"data:image/{image_type};base64,"
hybird_content.append(
{
"type": "image_url",
"image_url": {
"url": f"{image_prefix}{encoded_image}",
"detail": self.image_detail,
},
}
)

if self.video_bytes:
import imageio.v3 as iio

base64Frames: List[str] = []
frame_count = 0
# read video bytes
video = iio.imiter(
self.video_bytes, plugin=Constants.VIDEO_DEFAULT_PLUG_PYAV
)

for frame in video:
frame_count += 1
if (
frame_count % Constants.VIDEO_IMAGE_EXTRACTION_INTERVAL
== 0
):
# convert frame to numpy array
frame_array = np.asarray(frame)
frame_image = Image.fromarray(frame_array)

# Get the dimensions of the frame
width, height = frame_image.size

# resize the frame to the default image size
new_width = Constants.VIDEO_DEFAULT_IMAGE_SIZE
aspect_ratio = width / height
new_height = int(new_width / aspect_ratio)
resized_img = frame_image.resize((new_width, new_height))

# encode the image to base64
with io.BytesIO() as buffer:
image_format = OpenAIImageType.JPEG.value
image_format = image_format.upper()
resized_img.save(fp=buffer, format=image_format)
encoded_image = base64.b64encode(
buffer.getvalue()
).decode("utf-8")

base64Frames.append(encoded_image)

for encoded_image in base64Frames:
item = {
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{encoded_image}",
"detail": self.video_detail,
},
],
}

hybird_content.append(item)

if len(hybird_content) > 1:
return {
"role": "user",
"content": hybird_content,
}
# This return just for str message
else:
return {
"role": "user",
"content": self.content,
}

def to_openai_assistant_message(self) -> OpenAIAssistantMessage:
Expand Down
2 changes: 2 additions & 0 deletions camel/prompts/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from .ai_society import AISocietyPromptTemplateDict
from .base import CodePrompt, TextPrompt, TextPromptDict
from .code import CodePromptTemplateDict
from .descripte_video_prompt import DescriptionVideoPromptTemplateDict
from .evaluation import EvaluationPromptTemplateDict
from .misalignment import MisalignmentPromptTemplateDict
from .object_recognition import ObjectRecognitionPromptTemplateDict
Expand All @@ -37,4 +38,5 @@
'PromptTemplateGenerator',
'SolutionExtractionPromptTemplateDict',
'ObjectRecognitionPromptTemplateDict',
'DescriptionVideoPromptTemplateDict',
]
33 changes: 33 additions & 0 deletions camel/prompts/descripte_video_prompt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
from typing import Any

from camel.prompts.base import TextPrompt, TextPromptDict
from camel.types import RoleType


# flake8: noqa: E501
class DescriptionVideoPromptTemplateDict(TextPromptDict):
    """Prompt templates for the video-description task.

    Maps each relevant :obj:`RoleType` to the :obj:`TextPrompt` that the
    corresponding agent should use. Currently only the assistant role is
    populated, with a prompt instructing the model to describe a video.
    """

    ASSISTANT_PROMPT = TextPrompt(
        """You are a master of video analysis.
Please provide a shot description of the content of the current video."""
    )

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        # Register the assistant prompt under its role key.
        self[RoleType.ASSISTANT] = self.ASSISTANT_PROMPT
4 changes: 4 additions & 0 deletions camel/prompts/task_prompt_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
TextPromptDict,
)
from camel.prompts.code import CodePromptTemplateDict
from camel.prompts.descripte_video_prompt import (
DescriptionVideoPromptTemplateDict,
)
from camel.prompts.evaluation import (
EvaluationPromptTemplateDict,
)
Expand Down Expand Up @@ -57,5 +60,6 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
TaskType.SOLUTION_EXTRACTION: SolutionExtractionPromptTemplateDict(), # noqa: E501
TaskType.ROLE_DESCRIPTION: RoleDescriptionPromptTemplateDict(),
TaskType.OBJECT_RECOGNITION: ObjectRecognitionPromptTemplateDict(), # noqa: E501
TaskType.DESCRIPTE_VIDEO: DescriptionVideoPromptTemplateDict(),
}
)
5 changes: 3 additions & 2 deletions camel/types/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
EmbeddingModelType,
ModelType,
OpenAIBackendRole,
OpenAIImageDetailType,
OpenAIImageType,
OpenAIVisionDetailType,
OpenAPIName,
RoleType,
StorageType,
Expand Down Expand Up @@ -58,8 +58,9 @@
'ChatCompletionAssistantMessageParam',
'ChatCompletionFunctionMessageParam',
'CompletionUsage',
'OpenAIVideoType',
'OpenAIImageType',
'OpenAIImageDetailType',
'OpenAIVisionDetailType',
'OpenAPIName',
'AudioModelType',
'VoiceType',
Expand Down
3 changes: 2 additions & 1 deletion camel/types/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ class TaskType(Enum):
ROLE_DESCRIPTION = "role_description"
OBJECT_RECOGNITION = "object_recognition"
DEFAULT = "default"
DESCRIPTE_VIDEO = "descripte_video"


class VectorDistance(Enum):
Expand Down Expand Up @@ -238,7 +239,7 @@ class OpenAIImageType(Enum, metaclass=OpenAIImageTypeMeta):
GIF = "gif"


class OpenAIImageDetailType(Enum):
class OpenAIVisionDetailType(Enum):
AUTO = "auto"
LOW = "low"
HIGH = "high"
Expand Down
2 changes: 2 additions & 0 deletions camel/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
text_extract_from_web,
to_pascal,
)
from .constants import Constants
from .token_counting import (
AnthropicTokenCounter,
BaseTokenCounter,
Expand All @@ -52,6 +53,7 @@
'BaseTokenCounter',
'OpenAITokenCounter',
'OpenSourceTokenCounter',
'Constants',
'text_extract_from_web',
'create_chunks',
'dependencies_required',
Expand Down
26 changes: 26 additions & 0 deletions camel/utils/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========


class Constants:
    """Project-wide constants for extracting still images from video."""

    # Default size (in pixels) of images extracted from a video; frames
    # are resized to this dimension before being encoded.
    VIDEO_DEFAULT_IMAGE_SIZE = 768

    # Sampling interval: one frame is kept out of every this many frames
    # when converting a video into a sequence of images.
    VIDEO_IMAGE_EXTRACTION_INTERVAL = 50

    # Name of the imageio plugin used by default to decode video bytes.
    VIDEO_DEFAULT_PLUG_PYAV = "pyav"
Loading

0 comments on commit 849896c

Please sign in to comment.