feat: Integration of VLM embedding model #446

Merged 57 commits on Jun 5, 2024.

Commits (57):
d300035
add e5 embedding
Wendong-Fan Nov 23, 2023
524bfd4
fix typo in toml file
Wendong-Fan Nov 25, 2023
0f13021
allow user to switch embeeding model from SentenceTransformer
Wendong-Fan Nov 30, 2023
9ddc871
Move the import to __init__
Wendong-Fan Nov 30, 2023
e9c3135
polish docstring
Wendong-Fan Nov 30, 2023
aeae92d
remove # type: ignore
Wendong-Fan Nov 30, 2023
b431e67
change embed_list return type and polish docstring
Wendong-Fan Nov 30, 2023
884f190
use Union[List[List[float]], ndarray] instead of List[List[float]] | …
Wendong-Fan Nov 30, 2023
9cce263
change return of embed_list from ndarray to list
Wendong-Fan Dec 3, 2023
4c7b67c
change name from SentenceTransformerEmbedding into SentenceTransforme…
Wendong-Fan Dec 3, 2023
939808e
update poetry
lightaime Dec 3, 2023
e8ce692
update poetry
lightaime Dec 3, 2023
1bf7320
update poetry
lightaime Dec 3, 2023
653b381
update poetry
lightaime Dec 3, 2023
93e795e
remove ndarry and union in embedding base file
Wendong-Fan Dec 8, 2023
a50b478
Merge branch 'master' into feature/open_source_embedding_model
Wendong-Fan Dec 8, 2023
4d5ba2d
sentence-transformer
FUYICC Jan 30, 2024
692a670
integration of clip embedding and update of license
FUYICC Feb 1, 2024
20654fd
Limit embed_list input type
FUYICC Feb 3, 2024
9e0de62
revert changes of sentence embedding
FUYICC Feb 5, 2024
b3ea26c
poetry change of pillow
FUYICC Feb 5, 2024
f1adf18
change of docstring of functions
FUYICC Feb 5, 2024
8ca7195
change of get_output_dim function
FUYICC Feb 24, 2024
c0f2b85
fix of bugs of embedding dim
FUYICC Feb 24, 2024
955bf11
allow the clip embedding accept both texts and images
FUYICC Feb 25, 2024
afb46bf
fix the bug for pytest
FUYICC Feb 28, 2024
9f98e8a
fix the bug for poetry.lock
FUYICC Feb 29, 2024
79e6d8d
refactor: refactor CLIPEmbedding class to improve readability and doc…
Appointat Mar 8, 2024
2e16ed6
chore: remove empty line in pyproject.toml
Appointat Mar 8, 2024
d5e10fb
chore: add specific test cases for image and text embeddings
Appointat Mar 8, 2024
f41f3c2
fix: fix error handling in CLIPEmbedding class
Appointat Mar 8, 2024
ddf78af
typo: fix default value capitalization in CLIPEmbedding class
Appointat Mar 8, 2024
8fe17cb
Use generics to support the type system
FUYICC Mar 11, 2024
1afd27b
store dimension into a variable
FUYICC Mar 11, 2024
e8d073d
Update update_license.py for windows compatibility
FUYICC Mar 12, 2024
f0a1573
Change to general visual language model class and use lazy initializa…
FUYICC Apr 9, 2024
0fc220d
Merge branch 'master' into CLIP_model
FUYICC Apr 12, 2024
1fa0c0f
test for inconsistancy of inputs with different types
FUYICC Apr 12, 2024
71d48a2
update of poetry
FUYICC Apr 12, 2024
4de4fad
usage of **kwargs
FUYICC Apr 12, 2024
1517d52
debug for pytest
FUYICC May 2, 2024
2105510
Merge branch 'master' into CLIP_model
FUYICC May 3, 2024
ed54edf
poetry dependency
FUYICC May 3, 2024
a667614
ruff
FUYICC May 3, 2024
8aab43d
poetry
FUYICC May 3, 2024
8c1f086
return list of float
FUYICC May 5, 2024
b8bd94e
change of tests
FUYICC May 5, 2024
de718ce
Update camel/embeddings/vlm_embedding.py
FUYICC May 27, 2024
6ebf5cd
Update camel/embeddings/vlm_embedding.py
FUYICC May 27, 2024
c969597
Update camel/embeddings/vlm_embedding.py
FUYICC May 27, 2024
6b2c48e
Update camel/embeddings/vlm_embedding.py
FUYICC May 27, 2024
b0cadb0
Update camel/embeddings/vlm_embedding.py
FUYICC May 27, 2024
e2c7824
one method for **kwargs
FUYICC May 27, 2024
1c23c64
split of kwargs
FUYICC Jun 2, 2024
487dfca
add pillow into tool.poetry.extras
FUYICC Jun 2, 2024
908bb91
Merge branch 'master' into CLIP_model
FUYICC Jun 2, 2024
6b5db36
poetry lock
FUYICC Jun 2, 2024
2 changes: 2 additions & 0 deletions camel/embeddings/__init__.py
@@ -14,9 +14,11 @@
from .base import BaseEmbedding
from .openai_embedding import OpenAIEmbedding
from .sentence_transformers_embeddings import SentenceTransformerEncoder
from .vlm_embedding import VisionLanguageEmbedding

__all__ = [
    "BaseEmbedding",
    "OpenAIEmbedding",
    "SentenceTransformerEncoder",
    "VisionLanguageEmbedding",
]
105 changes: 105 additions & 0 deletions camel/embeddings/vlm_embedding.py
@@ -0,0 +1,105 @@
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
# Licensed under the Apache License, Version 2.0 (the “License”);
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an “AS IS” BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
from typing import Any, List, Optional, Union

from PIL import Image

from camel.embeddings import BaseEmbedding


class VisionLanguageEmbedding(BaseEmbedding[Union[str, Image.Image]]):
    r"""Provides image and text embedding functionalities using a
    multimodal model.

    Args:
        model_name (str, optional): The model to be used for generating
            embeddings. (default: :obj:`openai/clip-vit-base-patch32`)

    Raises:
        RuntimeError: If an unsupported model type is specified.
    """

    def __init__(
        self, model_name: str = "openai/clip-vit-base-patch32"
    ) -> None:
        r"""Initializes the :obj:`VisionLanguageEmbedding` class with a
        specified model.

        Args:
            model_name (str, optional): The version name of the model to use.
                (default: :obj:`openai/clip-vit-base-patch32`)
        """
        from transformers import AutoModel, AutoProcessor

        self.model = AutoModel.from_pretrained(model_name)
        self.processor = AutoProcessor.from_pretrained(model_name)
        self.dim: Optional[int] = None
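Importing `transformers` inside `__init__` (rather than at module top level) keeps it an optional dependency: `camel.embeddings` stays importable without it, and the `ImportError` surfaces only when the encoder is actually constructed. A minimal sketch of that pattern, with `json` standing in for the heavy dependency (names here are illustrative, not from the PR):

```python
class LazyDepEncoder:
    def __init__(self) -> None:
        # Import inside __init__ so the module defining this class can be
        # imported even when the optional dependency is missing; the
        # ImportError is raised only on first construction.
        try:
            import json as heavy_dep  # stand-in for `transformers` in this sketch
        except ImportError as e:
            raise ImportError(
                "Install the optional dependency to use this encoder."
            ) from e
        self.dep = heavy_dep
```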

    def embed_list(
        self,
        objs: List[Union[Image.Image, str]],
        **kwargs: Any,
    ) -> List[List[float]]:
        r"""Generates embeddings for the given images or texts.

        Args:
            objs (List[Image.Image|str]): The list of images or texts for
                which to generate the embeddings.
            **kwargs (Any): Extra kwargs passed to the embedding API.

        Returns:
            List[List[float]]: A list of generated embeddings, each
                represented as a list of floating-point numbers.
        """
        if not objs:
            raise ValueError("Input objs list is empty.")
        result_list = []
        for obj in objs:
            if isinstance(obj, Image.Image):
                # Named image_inputs to avoid shadowing the built-in input().
                image_inputs = self.processor(
                    images=obj, return_tensors="pt", padding=True, **kwargs
                )
                image_feature = (
                    self.model.get_image_features(**image_inputs, **kwargs)
                    .squeeze(dim=0)
                    .tolist()
                )
                result_list.append(image_feature)
            elif isinstance(obj, str):
                text_inputs = self.processor(
                    text=obj, return_tensors="pt", padding=True, **kwargs
                )
                text_feature = (
                    self.model.get_text_features(**text_inputs, **kwargs)
                    .squeeze(dim=0)
                    .tolist()
                )
                result_list.append(text_feature)
            else:
                raise ValueError("Input type is neither image nor text.")
        self.dim = len(result_list[0])
        return result_list

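The per-item dispatch in `embed_list` (image to `get_image_features`, string to `get_text_features`, anything else to `ValueError`) can be sketched without the model itself. The dummy 4-dimensional features below are placeholders, not CLIP output, and `FakeImage` is a hypothetical stand-in for `PIL.Image.Image`:

```python
from typing import List, Union


class FakeImage:
    """Stand-in for PIL.Image.Image in this sketch."""


def embed_list_sketch(objs: List[Union[FakeImage, str]]) -> List[List[float]]:
    # Mirrors the PR's control flow: validate, dispatch on each element's
    # type, and reject anything that is neither an image nor a string.
    if not objs:
        raise ValueError("Input objs list is empty.")
    result: List[List[float]] = []
    for obj in objs:
        if isinstance(obj, FakeImage):
            result.append([0.0, 0.0, 0.0, 1.0])  # dummy "image feature"
        elif isinstance(obj, str):
            result.append([float(len(obj)), 0.0, 0.0, 0.0])  # dummy "text feature"
        else:
            raise ValueError("Input type is neither image nor text.")
    return result
```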
    def get_output_dim(self) -> int:
        r"""Returns the output dimension of the embeddings.

        Returns:
            int: The dimensionality of the embedding for the current model.
        """
        if self.dim is None:
            text = 'dimension'
            inputs = self.processor(text=[text], return_tensors="pt")
            self.dim = self.model.get_text_features(**inputs).shape[1]
        return self.dim
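`get_output_dim` probes the model at most once (by embedding a dummy string) and caches the result in `self.dim`. The caching shape can be shown with a fixed stand-in value instead of a real forward pass; `DimCache` and `_probe_model` are illustrative names, and 512 is only an example value (the projection width of CLIP ViT-B/32):

```python
from typing import Optional


class DimCache:
    def __init__(self) -> None:
        self.dim: Optional[int] = None
        self.probes = 0  # counts how often the "model" is actually queried

    def _probe_model(self) -> int:
        # Stand-in for running a dummy text through the model.
        self.probes += 1
        return 512

    def get_output_dim(self) -> int:
        # Lazy: probe on first call only, then serve the cached value.
        if self.dim is None:
            self.dim = self._probe_model()
        return self.dim
```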
6 changes: 4 additions & 2 deletions licenses/update_license.py
@@ -39,10 +39,12 @@ def update_license_in_file(
    start_line_start_with: str,
    end_line_start_with: str,
) -> bool:
-    with open(file_path, 'r') as f:
+    with open(
+        file_path, 'r', encoding='utf-8'
+    ) as f:  # for windows compatibility
        content = f.read()

-    with open(license_template_path, 'r') as f:
+    with open(license_template_path, 'r', encoding='utf-8') as f:
        new_license = f.read().strip()

    maybe_existing_licenses = re.findall(
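Passing `encoding='utf-8'` matters because `open()` without it falls back to the platform's locale encoding (often cp1252 on Windows), which can mis-decode or fail on files containing non-ASCII characters such as the curly quotes in the license header. A self-contained check of the behavior the fix relies on:

```python
import os
import tempfile

text = 'Licensed under the Apache License (the \u201cLicense\u201d)'

fd, path = tempfile.mkstemp()
os.close(fd)
try:
    # Writing and reading with an explicit utf-8 encoding round-trips
    # exactly, regardless of the platform's default locale encoding.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)
    with open(path, 'r', encoding='utf-8') as f:
        assert f.read() == text

    # Decoding the same bytes as cp1252 either raises (some byte values
    # are undefined in cp1252) or silently produces mojibake.
    with open(path, 'rb') as f:
        raw = f.read()
    try:
        assert raw.decode('cp1252') != text
    except UnicodeDecodeError:
        pass
finally:
    os.remove(path)
```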
2 changes: 1 addition & 1 deletion poetry.lock


2 changes: 1 addition & 1 deletion pyproject.toml
@@ -59,7 +59,7 @@ pyowm = { version = "^3.3.0", optional = true }
googlemaps = { version = "^4.10.0", optional = true }
requests_oauthlib = { version = "^1.3.1", optional = true }
unstructured = { extras = ["all-docs"], version = "^0.10.30", optional = true }

pillow = { version = "^10.2.0", optional = true }
# encoders
sentence-transformers = { version = "^2.2.2", optional = true }

78 changes: 78 additions & 0 deletions test/embeddings/test_vlm_embeddings.py
@@ -0,0 +1,78 @@
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
# Licensed under the Apache License, Version 2.0 (the “License”);
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an “AS IS” BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
import pytest
import requests
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

from camel.embeddings import VisionLanguageEmbedding


@pytest.fixture
def VLM_instance() -> VisionLanguageEmbedding:
    return VisionLanguageEmbedding()


def test_CLIPEmbedding_initialization(VLM_instance):
    assert VLM_instance is not None
    assert isinstance(VLM_instance.model, CLIPModel)
    assert isinstance(VLM_instance.processor, CLIPProcessor)


def test_image_embed_list_with_valid_input(VLM_instance):
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)
    test_images = [image, image]
    embeddings = VLM_instance.embed_list(test_images)
    assert isinstance(embeddings, list)
    assert len(embeddings) == 2
    for e in embeddings:
        assert len(e) == VLM_instance.get_output_dim()


def test_image_embed_list_with_empty_input(VLM_instance):
    with pytest.raises(ValueError):
        VLM_instance.embed_list([])


def test_text_embed_list_with_valid_input(VLM_instance):
    test_texts = ['Hello world', 'Testing sentence embeddings']
    embeddings = VLM_instance.embed_list(test_texts)
    assert isinstance(embeddings, list)
    assert len(embeddings) == 2
    for e in embeddings:
        assert len(e) == VLM_instance.get_output_dim()


def test_text_embed_list_with_empty_input(VLM_instance):
    with pytest.raises(ValueError):
        VLM_instance.embed_list([])


def test_mixed_embed_list_with_valid_input(VLM_instance):
    test_list = ['Hello world', 'Testing sentence embeddings']
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)
    test_list.append(image)
    embeddings = VLM_instance.embed_list(test_list)
    assert isinstance(embeddings, list)
    assert len(embeddings) == 3
    for e in embeddings:
        assert len(e) == VLM_instance.get_output_dim()


def test_get_output_dim(VLM_instance):
    output_dim = VLM_instance.get_output_dim()
    assert isinstance(output_dim, int)
    assert output_dim > 0