In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Visual captioning with Imagen on Vertex AI



## Overview

[Imagen on Vertex AI](https://cloud.google.com/vertex-ai/docs/generative-ai/image/overview) (image Generative AI) offers a variety of features:
- Image generation
- Image editing
- Visual captioning
- Visual question answering

This notebook focuses on **visual captioning**, and ends with a short section on **visual question answering**.

[Visual captioning with Imagen on Vertex AI](https://cloud.google.com/vertex-ai/docs/generative-ai/image/image-captioning) can generate text descriptions of images. The model takes in an image as input and produces one or more text descriptions of the image as output. The generated text descriptions can be used for a variety of use cases:
- getting detailed metadata about images for storing and searching
- generating automated captioning to support accessibility use cases
- producing descriptions of products and visual assets

More information about Visual captioning with Imagen on Vertex AI can be found in the [official documentation](https://cloud.google.com/vertex-ai/docs/generative-ai/image/image-captioning).

### Objectives

In this notebook, you will learn how to use the Vertex AI Python SDK to:

- Generate image captions using Imagen's visual captioning features

- Experiment with different parameters, such as:
    - number of captions to be generated
    - language of the generated captions
    - type and version of model that is used to generate the captions




### Costs

- This notebook uses billable components of Google Cloud:
  - Vertex AI (Imagen)

- Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Getting Started

### Install Vertex AI SDK, other packages and their dependencies

In [None]:
%pip install --upgrade --user google-cloud-aiplatform>=1.29.0

### Restart current runtime

To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which will restart the current kernel.

In [None]:
# The freshly installed packages only become importable after the kernel restarts
import time
import IPython

kernel_app = IPython.Application.instance()
# Passing True asks the kernel to come back up instead of staying shut down
kernel_app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Please wait until it is finished before continuing to the next step. ⚠️</b>
</div>

### Authenticate your notebook environment (Colab only)

Since you are running this notebook on Google Colab, you will need to authenticate your environment. To do this, run the new cell below. These steps are not required if you are using [Vertex AI Workbench](https://cloud.google.com/vertex-ai-workbench).

#### **IMPORTANT:**
If you are running this during the Accenture workshop on 24 August 2023, a billing account has been provided for you. After running the cell below, upload the JSON key that was sent to you on Teams.

In [None]:

# Run this cell and ADD THE JSON FILE sent to you on Teams by pressing the "Choose Files" button

import os
import shutil
import sys
from google.colab import files

# Location the credentials file is moved to and that GOOGLE_APPLICATION_CREDENTIALS points at
KEY_PATH = '/content/key.json'

# Opens the file picker; the result maps each uploaded file name to its content
uploaded = files.upload()

# Fail with a clear message instead of a NameError further down when nothing was selected
if not uploaded:
    raise ValueError('No file was uploaded. Re-run this cell and choose the JSON key file.')

for fn in uploaded.keys():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(uploaded[fn])))

# Then move the uploaded file to the desired location and set the environment variable.
# When several files were selected, the last one listed is used as the key.
# shutil.move is used instead of a shell `mv` so names with spaces or parentheses work.
shutil.move(fn, KEY_PATH)
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = KEY_PATH

If you did not receive the JSON key for this workshop, you'll need to activate your own GCP billing account (with Vertex AI enabled) and authenticate by uncommenting and running the cell below.

In [1]:
# import sys

# # Additional authentication is required for Google Colab
# if 'google.colab' in sys.modules:

#     # Authenticate user to Google Cloud
#     from google.colab import auth
#     auth.authenticate_user()

### Define Google Cloud project information (Colab only)

Since you are running this notebook on Google Colab, you need to define Google Cloud project information to be used. In the following cell, you will define the information, import Vertex AI package, and initialize it. This step is not required if you are using [Vertex AI Workbench](https://cloud.google.com/vertex-ai-workbench).

In [2]:
# Imported here so this cell runs on its own, whichever authentication cell was used before it
import sys

# Only Colab needs explicit project settings; elsewhere this cell does nothing
if 'google.colab' in sys.modules:

    # Define project information
    PROJECT_ID = "your-project-id" # @param {type:"string"}
    LOCATION = "us-central1" # @param {type:"string"}

    # Initialize Vertex AI
    import vertexai
    vertexai.init(project=PROJECT_ID, location=LOCATION)

### Load the image captioning model

The model names from Vertex AI Imagen have two components: model name and version number. The naming convention follows this format: `<model-name>@<version-number>`. For example, `imagetext@001` represents the version **001** of the **imagetext** model.



In [3]:
from vertexai.preview.vision_models import ImageCaptioningModel

# Model identifier in <model-name>@<version-number> form
CAPTIONING_MODEL_NAME = "imagetext@001"

image_captioning_model = ImageCaptioningModel.from_pretrained(CAPTIONING_MODEL_NAME)

### Load the image file

To use the visual captioning model, you first need to create an `Image` class using the image file. The model only accepts `Image` class objects, so this is a necessary step before you can generate captions.

Moreover, [Visual Captioning with Imagen](https://cloud.google.com/vertex-ai/docs/generative-ai/image/image-captioning) only accepts specific image file formats (e.g. PNG, JPEG), and may have file size limitations (e.g. 10 MB). You can find specific details in [the official documentation](https://cloud.google.com/vertex-ai/docs/generative-ai/image/image-captioning#img-cap-rest).



In [None]:
# Downloading an image from Google Cloud Storage
# Copies the public sample photo into the current working directory as shanghai.jpeg

! gsutil cp "gs://cloud-samples-data/vision/using_curl/shanghai.jpeg" .

In [None]:
from vertexai.preview.vision_models import Image

# Local file to wrap in an Image object
SHANGHAI_IMAGE_PATH = "shanghai.jpeg"

# Wrap the file in an Image object and preview it
shanghai_image = Image.load_from_file(SHANGHAI_IMAGE_PATH)
shanghai_image.show()

###  Generate captions from the image

In this section, you will use the visual captioning model to generate text descriptions of an image.

In [None]:
# Request captions for the Shanghai photo with default settings; the result is the cell output
image_captioning_model.get_captions(image=shanghai_image)

### Generating captions in non-English languages

Visual captioning with Imagen on Vertex AI can generate captions in multiple languages as well. To generate a caption in a specific language, set the `language` parameter to one of the following values:
- `no` - Norwegian
- `en` - English
- `fr` - French
- `de` - German
- `it` - Italian
- `es` - Spanish

For a list of supported languages, check out the [official documentation](https://cloud.google.com/vertex-ai/docs/generative-ai/image/image-captioning#languages).

In [None]:
# Get 3 image captions in French ("fr")
image_captioning_model.get_captions(
    image=shanghai_image,
    number_of_results=3,
    language="fr",
)

You can generate up to three separate captions from a single image by changing the `number_of_results` parameter from 1 to 3.

## Try it yourself

You can also try using the visual captioning model with images of your choice. If you need to download an image file, you can use the provided auxiliary function `download_image`.

Feel free to experiment with different images and model parameters to see how the results change.

In [9]:
import os
from urllib.parse import urlparse

import requests

def download_image(url, timeout=30):
    """Downloads a PNG or JPEG image from the specified URL into the current directory.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive host cannot hang the notebook indefinitely.

    Returns:
        The local file name the image was written to. It is the last path
        segment of the URL (query string excluded), or ``image.<type>`` when
        the URL path has no file name.

    Raises:
        Exception: If the response status is not 200, or the reported content
            type is not PNG or JPEG.
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, ...).
    """

    # Send a get request to the url
    response = requests.get(url, timeout=timeout)

    # Bail out early on any non-successful response
    if response.status_code != 200:
        raise Exception(f"Failed to download image from {url}")

    # The header may be absent, carry parameters ("image/jpeg; charset=binary")
    # or use upper case, so normalise it before extracting the subtype
    content_type = response.headers.get("Content-Type", "")
    media_type = content_type.split(";", 1)[0].strip().lower()
    image_type = media_type.partition("/")[2]

    # Check for image type, currently only PNG or JPEG format are supported
    if image_type not in ("png", "jpg", "jpeg"):
        raise Exception("Image can only be in PNG or JPEG format")

    # Name the file after the URL path only, so a query string never ends up in the file name
    image_path = os.path.basename(urlparse(url).path) or f"image.{image_type}"

    # Write image data to a file
    with open(image_path, "wb") as f:
        f.write(response.content)
    return image_path

In [10]:
# Fetch a sample image from the web and remember where it was saved
url = "https://storage.googleapis.com/gweb-cloudblog-publish/images/transfor_shared_fate.0999075519991154.max-2000x2000.jpg"
image_path = download_image(url=url)

In [None]:
# Load the newly downloaded image
# `image_path` comes from the download cell above; `Image` was imported when the first image was loaded
user_image = Image.load_from_file(image_path)
# Preview the image in the notebook
user_image.show()

In [None]:
# Generate three English captions for the downloaded image; the result is the cell output
image_captioning_model.get_captions(image=user_image, number_of_results=3, language="en")

## Ask questions about the image

Now ask questions about the image using the model:

### Load the image question answering model

The model names from Vertex AI Imagen have two components: model name and version number. The naming convention follows this format: `<model-name>@<version-number>`. For example, `imagetext@001` represents the version **001** of the **imagetext** model.



In [None]:
from vertexai.preview.vision_models import ImageQnAModel

# Model identifier in <model-name>@<version-number> form
QNA_MODEL_NAME = "imagetext@001"

image_qna_model = ImageQnAModel.from_pretrained(QNA_MODEL_NAME)

In [None]:
# Pose an open-ended question about the Shanghai photo; the answers are the cell output
question = "What is happening in this image?"
image_qna_model.ask_question(image=shanghai_image, question=question)

In [None]:
# Follow up with a more specific question about the same photo
question = "What are the people in the image doing?"
image_qna_model.ask_question(image=shanghai_image, question=question)

You can get up to three answers from a single image by changing the `number_of_results` parameter from 1 to 3.

In [None]:
# Same follow-up question, this time requesting three candidate answers
question = "What are the people in the image doing?"
image_qna_model.ask_question(image=shanghai_image, question=question, number_of_results=3)

## Try it yourself

You can also try using the visual question answering model with images of your choice. If you need to download an image file, you can use the provided auxiliary function `download_image`.

Feel free to experiment with different images and model parameters to see how the results change.

In [None]:
# Fetch a second sample image from the web
url = "https://storage.googleapis.com/gweb-cloudblog-publish/images/GettyImages-871168786.max-2600x2600.jpg"
image_path = download_image(url=url)

# Wrap the saved file in an Image object and preview it
user_image = Image.load_from_file(image_path)
user_image.show()

In [None]:
# Ask what the downloaded photo shows, requesting three candidate answers
question = "What is happening in this photo?"
image_qna_model.ask_question(image=user_image, question=question, number_of_results=3)

In [None]:
# Ask a marketing-oriented question about the downloaded photo
question = "What advertising channels would this image be suitable for?"
image_qna_model.ask_question(image=user_image, question=question, number_of_results=3)

In [None]:
# Ask a question that needs inference beyond what is directly visible in the photo
question = "What type of insects could live in this area?"
image_qna_model.ask_question(image=user_image, question=question, number_of_results=3)