In [12]:
from langchain.llms import AzureOpenAI
from langchain.chat_models import AzureChatOpenAI
from dotenv import load_dotenv, find_dotenv
# load environment variables from the local .env file
_ = load_dotenv(find_dotenv())

# A davinci (completion) model cannot be used here; a chat model is required,
# so the LLM is built from the Azure chat deployment.
llm = AzureChatOpenAI(
    model_name="gpt-35-turbo",
    deployment_name="Gpt35",
    openai_api_version="2023-03-15-preview",
)

In [13]:
# !pip install transformers
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration


# checkpoint name used for both the processor and the captioning model
hf_model = "Salesforce/blip-image-captioning-large"

# run on the GPU when torch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# the processor prepares images for the model
processor = BlipProcessor.from_pretrained(hf_model)

# load the model weights, then move the model to the selected device
model = BlipForConditionalGeneration.from_pretrained(hf_model)
model = model.to(device)


Downloading (…)rocessor_config.json: 100%|██████████| 445/445 [00:00<00:00, 243kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 527/527 [00:00<00:00, 340kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.85MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 711k/711k [00:00<00:00, 5.89MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 125/125 [00:00<00:00, 93.4kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 4.60k/4.60k [00:00<00:00, 3.39MB/s]
Downloading pytorch_model.bin:   4%|▍         | 73.4M/1.88G [00:13<05:33, 5.41MB/s]

KeyboardInterrupt: 

Downloading pytorch_model.bin:   4%|▍         | 73.4M/1.88G [00:28<05:33, 5.41MB/s]

The downloaded model files are cached in the following folder:

```bash
cd ~/.cache/huggingface/
```

Now try to caption an image.

In [14]:
import requests
from PIL import Image

# sample picture used for the first captioning test
img_url = 'https://images.unsplash.com/photo-1616128417859-3a984dd35f02?ixlib=rb-4.0.3&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=2372&q=80'

# Stream the download inside a `with` block so the connection is released,
# fail fast on an HTTP error instead of handing an error page to PIL, and
# use a timeout so a stalled server cannot hang the cell forever.
with requests.get(img_url, stream=True, timeout=30) as response:
    response.raise_for_status()
    # convert() reads the full image data, so `image` stays usable after the
    # response is closed
    image = Image.open(response.raw).convert('RGB')

In [None]:
# Unconditional image captioning: only the image is passed to the processor.
inputs = processor(image, return_tensors="pt")
inputs = inputs.to(device)

# generate at most 20 new tokens, then decode the first sequence into text
out = model.generate(max_new_tokens=20, **inputs)
print(processor.decode(out[0], skip_special_tokens=True))

there is a monkey that is sitting in a tree


Now create a tool that the agent can use to caption an image.

In [None]:
from langchain.tools import BaseTool
# Text shown to the agent so it can decide when to call the tool.
desc = (
    "use this tool when given the URL of an image that you'd like to be "
    "described. It will return a simple caption describing the image."
)


class ImageCaptionTool(BaseTool):
    """LangChain tool that downloads an image from a URL and captions it.

    Relies on the notebook-level `processor`, `model` and `device` objects
    created when the BLIP checkpoint was loaded.
    """

    name = "Image captioner"
    description = desc

    def _run(self, url: str) -> str:
        """Download the image at `url` and return a generated caption.

        Args:
            url: Address of the image to describe, as supplied by the agent.

        Returns:
            The decoded caption text (special tokens removed).

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        # Use the `url` argument, not the notebook-global `img_url`; otherwise
        # the tool would caption whatever image the global currently points
        # to, regardless of what the agent asked for.
        with requests.get(url, stream=True, timeout=30) as response:
            # fail on 4xx/5xx instead of passing an error page to PIL
            response.raise_for_status()
            # convert() reads the full image data, so the PIL object stays
            # valid after the response is closed
            image = Image.open(response.raw).convert('RGB')
        # preprocess the image
        inputs = processor(image, return_tensors="pt").to(device)
        # generate the caption tokens (at most 20 new tokens)
        out = model.generate(**inputs, max_new_tokens=20)
        # decode the first generated sequence into text
        return processor.decode(out[0], skip_special_tokens=True)

    def _arun(self, query: str):
        """Async entry point; not implemented for this tool."""
        raise NotImplementedError("This tool does not support async")


tools = [ImageCaptionTool()]


In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains.conversation.memory import ConversationBufferWindowMemory


# Conversational memory for the agent: a buffer window with k=5, stored under
# the 'chat_history' key and returned as message objects.
conversational_memory = ConversationBufferWindowMemory(
    k=5,
    memory_key='chat_history',
    return_messages=True,
)

from langchain.agents import initialize_agent

# the only tool offered to the agent is the image captioner
tools = [ImageCaptionTool()]

# Build the chat-conversational ReAct agent around the chat LLM, the tool list
# and the conversational memory; it is limited to 3 iterations per request.
agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent='chat-conversational-react-description',
    memory=conversational_memory,
    max_iterations=3,
    early_stopping_method='generate',
    verbose=True,
)


We can ask the agent to explain an image by simply giving it the image URL. The chain will download the image and then caption it.


In [10]:
# ask the agent about the image by putting its URL on the line after the question
user_message = f"What does this image show?\n{img_url}"
agent(user_message)



[1m> Entering new  chain...[0m
[32;1m[1;3m{
    "action": "Image captioner",
    "action_input": "https://images.unsplash.com/photo-1616128417859-3a984dd35f02?ixlib=rb-4.0.3&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=2372&q=80"
}[0m
Observation: [36;1m[1;3mthere is a monkey that is sitting in a tree[0m
Thought:[32;1m[1;3m{
    "action": "Final Answer",
    "action_input": "The image shows a monkey sitting in a tree."
}[0m

[1m> Finished chain.[0m


{'input': 'What does this image show?\nhttps://images.unsplash.com/photo-1616128417859-3a984dd35f02?ixlib=rb-4.0.3&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=2372&q=80',
 'chat_history': [],
 'output': 'The image shows a monkey sitting in a tree.'}

In [11]:
# point img_url at a second picture and ask the agent about it
img_url = "https://images.unsplash.com/photo-1502680390469-be75c86b636f?ixlib=rb-4.0.3&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=2370&q=80"
user_message = f"what is in this image?\n{img_url}"
agent(user_message)



[1m> Entering new  chain...[0m
[32;1m[1;3m{
    "action": "Image captioner",
    "action_input": "https://images.unsplash.com/photo-1502680390469-be75c86b636f?ixlib=rb-4.0.3&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=2370&q=80"
}[0m
Observation: [36;1m[1;3msurfer riding a wave in the ocean on a clear day[0m
Thought:[32;1m[1;3m{
    "action": "Final Answer",
    "action_input": "The image shows a surfer riding a wave in the ocean on a clear day."
}[0m

[1m> Finished chain.[0m


{'input': 'what is in this image?\nhttps://images.unsplash.com/photo-1502680390469-be75c86b636f?ixlib=rb-4.0.3&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=2370&q=80',
 'chat_history': [HumanMessage(content='What does this image show?\nhttps://images.unsplash.com/photo-1616128417859-3a984dd35f02?ixlib=rb-4.0.3&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=2372&q=80', additional_kwargs={}, example=False),
  AIMessage(content='The image shows a monkey sitting in a tree.', additional_kwargs={}, example=False)],
 'output': 'The image shows a surfer riding a wave in the ocean on a clear day.'}