In [2]:
# LLAVA_MODE selects where LLaVA is hosted: "local" (your own LLaVA
# controller/worker) or "remote" (the model hosted on Replicate).
# More details in the two setup options below.
LLAVA_MODE = "local"  # Either "local" or "remote"

assert LLAVA_MODE in ["local", "remote"]

import os

# Never paste the Replicate token into the notebook: it ends up in version
# control and in every shared copy. Export REPLICATE_API_TOKEN in the shell
# that starts the kernel instead; this cell only checks that it is present.
if not os.environ.get("REPLICATE_API_TOKEN"):
    print(
        "Warning: REPLICATE_API_TOKEN is not set; "
        "calls to Replicate will fail until you export it."
    )

In [3]:
from autogen import AssistantAgent, UserProxyAgent, config_list_from_json
import autogen
import replicate
import requests
from datetime import datetime
import http.client
import json
import base64

# config_list = config_list_from_json(env_or_file="OAI_CONFIG_LIST")
# llm_config = {"config_list": config_list, "request_timeout": 120}

# function to use llava model to review image

def img_review(image_path, prompt):
    """
    Ask the Replicate-hosted LLaVA model how well an image matches a text prompt.

    Parameters:
        image_path: Path of a local image file; it is opened in binary mode and
            passed to the model as the "image" input.
        prompt: The text the image is compared against; it is embedded in a
            fixed question asking for a description and a 1-10 similarity score.

    Returns:
        str: The pieces yielded by `replicate.run`, concatenated in order.
    """
    # The context manager guarantees the file handle is closed even when the
    # remote call raises; the original left the handle open.
    with open(image_path, "rb") as image_file:
        output = replicate.run(
            "yorickvp/llava-13b:6bc1c7bb0d2a34e413301fee8f7cc728d2d4e75bfab186aa995f63292bda92fc",
            input={
                "image": image_file,
                "prompt": f"What is happening in the image? From scale 1 to 10, decide how similar the image is to the text prompt {prompt}?",
            }
        )
        # Consume the output while the file is still open, in case the result
        # is produced lazily.
        return "".join(output)

# Example usage: score one downloaded picture against a text description.
sample_image = "/workspace/downloads/parrot/06qFTIVaZaY.jpg"
sample_prompt = "a parrot driving a car"
result = img_review(sample_image, sample_prompt)
result

'In the image, a green parrot is standing on a rocky surface, possibly a stone or a concrete slab. The parrot is not driving a car, and the scene does not involve any vehicles. The image is not similar to the text prompt of a parrot driving a car, as it only shows the parrot standing on a rocky surface. On a scale of 1 to 10, the similarity between the image and the text prompt is  1.'

In [5]:
# Run this code block only if you want to run LLaVA locally
import requests
import json
from llava.conversation import default_conversation as conv
from llava.conversation import Conversation

# Setup some global constants for convenience
# Note: make sure the addresses below are consistent with your setup in LLaVA
CONTROLLER_ADDR = "http://0.0.0.0:10000"
SEP = conv.sep

# Ask the controller which models it serves. A timeout and a status check make
# a missing or unhealthy controller fail here with a clear error instead of
# hanging or failing later while parsing the body.
ret = requests.post(CONTROLLER_ADDR + "/list_models", timeout=30)
ret.raise_for_status()
available_models = ret.json()  # parse the body once and reuse it
print(available_models)

if not available_models.get("models"):
    raise RuntimeError(
        f"The LLaVA controller at {CONTROLLER_ADDR} reports no models; "
        "start a model worker first."
    )

# Use the first model reported by the controller as the default.
MODEL_NAME = available_models["models"][0]
print("Model Name:", MODEL_NAME)

{'models': ['llava-v1.5-7b']}
Model Name: llava-v1.5-7b


In [6]:
import base64
import re
from io import BytesIO

from PIL import Image

import re
from typing import List, Tuple


def get_image_data(image_file, use_b64=True):
    """
    Load an image from a URL, a base64 data URI, or a local file.

    Parameters:
        image_file (str): One of
            - an ``http://`` or ``https://`` URL: the body is downloaded as-is;
            - a ``data:image/png;base64,`` or ``data:image/jpeg;base64,`` URI:
              the payload after the prefix is used;
            - anything else: treated as a path, opened with PIL, converted to
              RGB and re-encoded as PNG.
        use_b64 (bool, optional): Return a base64 string when True (default),
            raw bytes when False.

    Returns:
        str | bytes: The image as a base64 string or as raw bytes.

    Raises:
        requests.HTTPError: If the URL download returns an error status.
    """
    if image_file.startswith(("http://", "https://")):
        response = requests.get(image_file, timeout=30)
        # Fail loudly on 4xx/5xx instead of treating an error page as an image.
        response.raise_for_status()
        content = response.content
    else:
        data_uri_prefix = re.match(r"data:image/(?:png|jpeg);base64,", image_file)
        if data_uri_prefix:
            b64_payload = image_file[data_uri_prefix.end():]
            # The payload is already base64; honour use_b64=False by decoding
            # (the original returned the base64 string in both cases).
            return b64_payload if use_b64 else base64.b64decode(b64_payload)

        # Local file: close the handle deterministically once re-encoded.
        with Image.open(image_file) as source_image:
            buffered = BytesIO()
            source_image.convert('RGB').save(buffered, format="PNG")
            content = buffered.getvalue()

    if use_b64:
        return base64.b64encode(content).decode('utf-8')
    return content

def lmm_formater(prompt: str, order_image_tokens: bool = False) -> Tuple[str, List[str]]:
    """
    Replace ``<img ...>`` tags in a prompt with image tokens and load the images.

    Each tag's content (the text between ``<img `` and ``>``) is passed to
    `get_image_data`. On success the tag becomes ``<image>`` (or ``<image N>``
    when `order_image_tokens` is True, N counting loaded images from 0). When
    loading fails, a warning is printed and the tag is removed from the prompt.

    Parameters:
        - prompt (str): The input string that may contain image tags like <img ...>.
        - order_image_tokens (bool, optional): Whether to number the image tokens.
            It will be useful for GPT-4V. Defaults to False.

    Returns:
        - Tuple[str, List[str]]: The formatted prompt and the list of loaded
            images, in order of appearance (as returned by `get_image_data`).
    """
    # Regular expression pattern for matching <img ...> tags
    img_tag_pattern = re.compile(r'<img ([^>]+)>')
    images = []

    def _substitute(match):
        """Return the replacement text for one <img ...> tag."""
        image_location = match.group(1)
        try:
            img_data = get_image_data(image_location)
        except Exception:
            # Only ordinary errors are treated as "image unavailable";
            # KeyboardInterrupt/SystemExit still propagate.
            print(f"Warning! Unable to load image from {image_location}")
            return ""

        # The token index is the number of images loaded so far.
        new_token = f'<image {len(images)}>' if order_image_tokens else "<image>"
        images.append(img_data)
        return new_token

    # Substituting by match position (rather than by searching for the tag
    # text again) guarantees that exactly the matched tag is replaced.
    new_prompt = img_tag_pattern.sub(_substitute, prompt)

    return new_prompt, images




def extract_img_paths(paragraph: str) -> list:
    """
    Extract image paths (URLs or local paths) from a text paragraph.
    
    Parameters:
        paragraph (str): The input text paragraph.
        
    Returns:
        list: A list of extracted image paths.
    """
    # Regular expression to match image URLs and file paths
    img_path_pattern = re.compile(r'\b(?:http[s]?://\S+\.(?:jpg|jpeg|png|gif|bmp)|\S+\.(?:jpg|jpeg|png|gif|bmp))\b', 
                                  re.IGNORECASE)
    
    # Find all matches in the paragraph
    img_paths = re.findall(img_path_pattern, paragraph)
    return img_paths



    
def _to_pil(data):
    """Decode a base64-encoded image payload and open it as a PIL image."""
    raw_bytes = base64.b64decode(data)
    stream = BytesIO(raw_bytes)
    return Image.open(stream)


def llava_call(prompt: str, model_name: str = None, images: list = None, max_new_tokens: int = 1000, temperature: float = 0.5) -> str:
    """
    Makes a call to the LLaVA service to generate text based on a given prompt and optionally provided images.

    Args:
        - prompt (str): The input text for the model. Images are referenced with "<image>" tokens
            (or with "<img ...>" tags when `images` is not given).
        - model_name (str, optional): The model to use in "local" mode. Defaults to None, which means
            the global constant MODEL_NAME (looked up at call time, so this function can be defined
            even when the local setup cell was not run).
        - images (list, optional): A list of image paths, URLs or data URIs. If empty or None, images are
            extracted from "<img ...>" tags in the prompt via `lmm_formater`. If provided, each entry is
            loaded with `get_image_data`.
        - max_new_tokens (int, optional): Maximum number of new tokens to generate. Defaults to 1000.
        - temperature (float, optional): temperature for the model. Defaults to 0.5.

    Returns:
        - str: Generated text from the model, with the prompt text removed and whitespace stripped.

    Raises:
        - AssertionError: If the prompt contains more "<image>" tokens than images were provided.
        - RuntimeError: If any of the loaded images is empty.
        - ValueError: If "remote" mode is used without any image, or LLAVA_MODE is unknown.

    Notes:
    - The function uses global constants: LLAVA_MODE, and in "local" mode CONTROLLER_ADDR, SEP and MODEL_NAME.
    - If more images are provided than there are "<image>" tokens in the prompt, the missing tokens
      are appended to the end of the prompt.
    """
    if not images:
        prompt, images = lmm_formater(prompt, order_image_tokens=False)
    else:
        num_tokens = prompt.count("<image>")
        # Raised explicitly (not via `assert`) so the check survives `python -O`
        # and the full message is reported; the original message was split into
        # a separate no-op statement.
        if num_tokens > len(images):
            raise AssertionError(
                "the prompt contains more <image> tokens "
                f"({num_tokens}) than images were provided ({len(images)})!"
            )
        # Append the <image> token if missing
        prompt += " <image> " * (len(images) - num_tokens)
        images = [get_image_data(x) for x in images]

    for im in images:
        if len(im) == 0:
            raise RuntimeError("An image is empty!")

    if LLAVA_MODE == "local":
        if model_name is None:
            model_name = MODEL_NAME

        headers = {"User-Agent": "LLaVA Client"}
        pload = {
            "model": model_name,
            "prompt": prompt,
            "max_new_tokens": max_new_tokens,
            "temperature": temperature,
            "stop": SEP,
            "images": images,
        }

        response = requests.post(CONTROLLER_ADDR + "/worker_generate_stream", headers=headers,
                json=pload, stream=False)
        response.raise_for_status()

        # Chunks are NUL-delimited JSON objects; the last non-empty chunk wins.
        # Start from "" so an empty stream yields an empty reply rather than an
        # UnboundLocalError.
        output = ""
        for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"):
            if chunk:
                data = json.loads(chunk.decode("utf-8"))
                output = data["text"].split(SEP)[-1]
    elif LLAVA_MODE == "remote":
        if not images:
            raise ValueError("Remote mode requires an image, but none was provided or could be loaded.")
        # The Replicate version of the model only support 1 image for now.
        img = 'data:image/jpeg;base64,' + images[0]
        response = replicate.run(
            "yorickvp/llava-13b:2facb4a474a0462c15041b78b1ad70952ea46b5ec6ad29583c0b29dbd4249591",
            input={"image": img, "prompt": prompt.replace("<image>", " ")}
        )
        # The yorickvp/llava-13b model can stream output as it's running.
        # The predict method returns an iterator, and you can iterate over that output.
        # https://replicate.com/yorickvp/llava-13b/versions/2facb4a474a0462c15041b78b1ad70952ea46b5ec6ad29583c0b29dbd4249591/api#output-schema
        output = "".join(response)
    else:
        raise ValueError(f"Unknown LLAVA_MODE: {LLAVA_MODE!r}")

    # Remove the prompt and the surrounding whitespace.
    return output.replace(prompt, "").strip()

In [7]:
# Smoke test: describe the LLaVA logo fetched straight from GitHub.
logo_url = "https://github.com/haotian-liu/LLaVA/raw/main/images/llava_logo.png"
out = llava_call("Describe this image: <image>", images=[logo_url])
print(out)


background

The image shows a toy animal with a flaming mane, which resembles a horse. The animal is wearing glasses, which is a unique and amusing detail. The toy is positioned on a gray surface, which provides a contrast to the bright colors of the flaming mane. The toy is a fascinating and creative representation of a horse with a fiery mane.


In [9]:
import json

# Create a list of OpenAI configuration settings.
# The API key is read from the OPENAI_API_KEY environment variable; never
# hardcode it in the notebook, where it would leak through version control
# and shared copies.
import os

openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")

config_list = [
  {
    "model": "gpt-3.5-turbo",
    "api_key": openai_api_key,
  }
]

# Save the configuration list to a file.
# NOTE: the file stores the key in plain text; keep it out of version control.
with open("OAI_CONFIG_LIST.json", "w") as f:
    json.dump(config_list, f)

In [10]:
# Import AutoGen and setup the LLaVAAgent
import autogen
from autogen import AssistantAgent, Agent, UserProxyAgent
from termcolor import colored
import random

# Load the saved configurations, keeping only the gpt-3.5-turbo entries
# (despite its name, this variable does not select GPT-4).
config_list_gpt4 = autogen.config_list_from_json(
    env_or_file="OAI_CONFIG_LIST.json",
    filter_dict=dict(model=["gpt-3.5-turbo"]),
)

# Settings handed to agents that talk to the OpenAI model.
llm_config = dict(config_list=config_list_gpt4, seed=42)



class LLaVAAgent(AssistantAgent):
    """
    An AssistantAgent whose replies are generated by LLaVA through `llava_call`.

    The conversation is flattened into a single text prompt (system message
    followed by the messages, joined with the global SEP); image tags inside
    the messages are resolved by `llava_call`.
    """

    def __init__(self, img_str_format=None, **kwargs):
        """
        Parameters:
            img_str_format: Stored on the instance; not used yet (see TODO 2 in
                `_image_reply`).
            **kwargs: Forwarded unchanged to `AssistantAgent.__init__`.
        """
        super().__init__(**kwargs)
        # Register the LLaVA reply function at position 0 -- presumably so it is
        # consulted before the replies registered by the base class; confirm
        # against the AutoGen `register_reply` documentation.
        self.register_reply([Agent, None], reply_func=LLaVAAgent._image_reply, position=0)
        self._img_str_format = img_str_format

    def _image_reply(
        self,
        messages=None,
        sender=None, config=None
    ):
        """
        Build a LLaVA prompt from the conversation and return the model's answer.

        Parameters:
            messages: List of message dicts with "role" and "content" keys. When
                None, the messages stored for `sender` are used.
            sender: The agent the messages came from; required when `messages`
                is None.
            config: Unused.

        Returns:
            tuple: ``(True, reply_text)``. NOTE(review): True presumably marks
            the reply as final in AutoGen's reply-function protocol -- confirm.

        Raises:
            AssertionError: If both `messages` and `sender` are None, or if
                LLaVA still returns an empty reply after 10 attempts.
        """
        # Note: we did not use "llm_config" yet.
        # TODO 1: make the LLaVA API design compatible with llm_config
        # TODO 2: the `_image_reply` function should handle input prompt
        #         differently for different `img_str_format`, which will
        #         be valuable for different LMMs.
        # TODO 3: handle the memory and store image data in the memory.

        if messages is None and sender is None:
            error_msg = f"Either {messages=} or {sender=} must be provided."
            # The original referenced an undefined `logger`, which raised
            # NameError instead of the intended AssertionError.
            import logging
            logging.getLogger(__name__).error(error_msg)
            raise AssertionError(error_msg)

        if messages is None:
            messages = self._oai_messages[sender]

        # The formats for LLaVA and GPT are different. So, we manually handle them here.
        # TODO: format the images from the history accordingly.
        prompt = self.system_message
        for msg in messages:
            role = "Human" if msg["role"] == "user" else "Assistant"
            content = msg["content"]
            prompt += f"{SEP}{role}: {content}"
        prompt += SEP
        print(colored(prompt, "blue"))

        # Retry on empty replies, up to 10 calls in total.
        out = ""
        retry = 10
        while len(out) == 0 and retry > 0:
            # image names will be inferred automatically from llava_call
            out = llava_call(prompt=prompt, temperature=0, max_new_tokens=2000)
            retry -= 1

        assert out != "", "Empty response from LLaVA."

        return True, out

In [11]:
# The LLaVA-backed agent that looks at the picture and answers.
image_agent = LLaVAAgent(
    name="image-explainer",
    max_consecutive_auto_reply=0,
    system_message="You are a garden helper. You should describe the image and provide suggestions for the garden.",
)

# The agent that stands in for the human and sends the question.
user_proxy = autogen.UserProxyAgent(
    name="User_proxy",
    system_message="A human admin.",
    human_input_mode="NEVER",  # Try between ALWAYS or NEVER
    max_consecutive_auto_reply=0,
    llm_config=llm_config,
    code_execution_config=dict(last_n_messages=3, work_dir="groupchat"),
)

# Ask the question with an image
user_proxy.initiate_chat(
    image_agent,
    message="Here is my image: <img http://th.bing.com/th/id/R.105d684e5df7d540e61f6300d0bd374e?rik=PR8LCyvpe93DZA&pid=ImgRaw&r=0>. Can you give me some suggestions?",
)

[33mUser_proxy[0m (to image-explainer):

Here is my image: <img http://th.bing.com/th/id/R.105d684e5df7d540e61f6300d0bd374e?rik=PR8LCyvpe93DZA&pid=ImgRaw&r=0>. Can you give me some suggestions?

--------------------------------------------------------------------------------
[34mYou are a garden helper. You should describe the image and provide suggestions for the garden.###Human: Here is my image: <img http://th.bing.com/th/id/R.105d684e5df7d540e61f6300d0bd374e?rik=PR8LCyvpe93DZA&pid=ImgRaw&r=0>. Can you give me some suggestions?###[0m
[33mimage-explainer[0m (to User_proxy):

Human: Here is my image: <image>. Can you give me some suggestions?Human: Sure, I can give you some suggestions. The image shows a variety of plants, including strawberries, leaves, and other vegetation. Some suggestions for the garden could be:

1. Planting more strawberries to increase the yield and variety of the fruit.
2. Adding more leaves to the garden to provide shade and improve the overall aestheti