In [1]:
!pip install -U -q keras-nlp --no-deps
# https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg?download=true
# https://colab.research.google.com/github/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/finetune_paligemma.ipynb

# Imports

Let's important Keras, KerasNLP and some helper. Remember to set the backend before you import!

In [2]:
import os
# Set the backbend before importing Keras
os.environ["KERAS_BACKEND"] = "jax"

In [3]:
import keras
import keras_nlp
import numpy as np
import PIL
import requests
import io
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import re

In [4]:
keras.__version__

'3.3.3'

In [5]:
keras_nlp.__version__

'0.15.0'

In [6]:
keras.config.backend()

'jax'

# Helper Functions

In [7]:
def crop_and_resize(image, target_size):
    width, height = image.size
    source_size = min(image.size)
    left = width // 2 - source_size // 2
    top = height // 2 - source_size // 2
    right, bottom = left + source_size, top + source_size
    return image.resize(target_size, box=(left, top, right, bottom))


def read_image(url, target_size):
    contents = io.BytesIO(requests.get(url).content)
    image = PIL.Image.open(contents)
    image = crop_and_resize(image, target_size)
    image = np.array(image)
    # Remove alpha channel if neccessary.
    if image.shape[2] == 4:
        image = image[:, :, :3]
    return image


def parse_bbox_and_labels(detokenized_output: str):
    matches = re.finditer(
        "<loc(?P<y0>\d\d\d\d)><loc(?P<x0>\d\d\d\d)><loc(?P<y1>\d\d\d\d)><loc(?P<x1>\d\d\d\d)>"
        " (?P<label>.+?)( ;|$)",
        detokenized_output,
    )
    labels, boxes = [], []
    fmt = lambda x: float(x) / 1024.0
    for m in matches:
        d = m.groupdict()
        boxes.append([fmt(d["y0"]), fmt(d["x0"]), fmt(d["y1"]), fmt(d["x1"])])
        labels.append(d["label"])
    return np.array(boxes), np.array(labels)


def display_boxes(image, boxes, labels, target_image_size):
    h, l = target_size
    fig, ax = plt.subplots()
    ax.imshow(image)
    for i in range(boxes.shape[0]):
        y, x, y2, x2 = boxes[i] * h
        width = x2 - x
        height = y2 - y
        # Create a Rectangle patch
        rect = patches.Rectangle(
            (x, y), width, height, linewidth=1, edgecolor="r", facecolor="none"
        )
        # Add label
        plt.text(x, y, labels[i], color="red", fontsize=12)
        # Add the patch to the Axes
        ax.add_patch(rect)

    plt.show()


# Load the model from Kaggle Models

Now we can load the PaliGemma "causal language model" from the Kaggle Models hub. A causal language model is just a LLM that is ready for generation, by training with a causal mask, and running generation a token at a time in a recurrent loop.

In [8]:
keras.config.set_floatx("bfloat16")

In [9]:
pali_gemma_lm = keras_nlp.models.PaliGemmaCausalLM.from_preset(
    "pali_gemma_3b_mix_224"
)
pali_gemma_lm.summary()

normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


# Test PaliGemma Inference

We can now load a test image and start query it with the model.

In [12]:
import cv2
import pandas as pd
df_test = pd.read_csv(r"/kaggle/input/ml-amazon/test.csv")
df_submit = pd.read_csv("/kaggle/input/empty20k/empty3000.csv")
# path = "/kaggle/input/ml-amazon/test_1000each/test_1000each"
# image_url = "/kaggle/input/ml-amazon/test_1000each/test_1000each/21-VzxP3BDL.jpg"
# image1 = cv2.imread(image_url)

In [12]:
# a = []
# for i in df_submit[~df_submit['prediction'].isna()]['prediction']:
#     a.append(i.split(' ')[-1])

In [42]:
# df_submit['index'].tolist()[:]

In [14]:
indx = df_submit['index'].tolist()

In [18]:
df = df_test[df_test['index'].isin(indx)]

In [14]:
# df.to_csv('null_df.csv',index=False)

In [19]:
df

Unnamed: 0,index,image_link,group_id,entity_name
15,15,https://m.media-amazon.com/images/I/216rjgJHAe...,279307,item_weight
94,94,https://m.media-amazon.com/images/I/31+zdbOuiT...,279307,item_weight
113,113,https://m.media-amazon.com/images/I/3101lgy28B...,219211,item_weight
114,114,https://m.media-amazon.com/images/I/3105qskWRc...,219211,item_weight
117,117,https://m.media-amazon.com/images/I/3106iDqsfQ...,267482,item_weight
...,...,...,...,...
131123,131224,https://m.media-amazon.com/images/I/91xm1-3iQs...,240413,voltage
131146,131247,https://m.media-amazon.com/images/I/91zGFUNJCb...,219211,item_weight
131158,131259,https://m.media-amazon.com/images/I/A1+vAtYHI3...,257505,item_volume
131176,131277,https://m.media-amazon.com/images/I/A1fMYCkFGp...,279307,item_weight


In [20]:
import os
# all_img = os.listdir('/kaggle/input/ml-amazon/test_1000each/test_1000each')

# indx = []
# from tqdm import tqdm
images_name = df['image_link'].apply(lambda x: x.split('/')[-1])
df['image_id'] = images_name 
# for i,img in tqdm(enumerate(images_name)):
#     if img in all_img:
#         indx.append(i)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['image_id'] = images_name


In [None]:
# df = df_train.iloc[indx]
# df.reset_index(drop=True,inplace=True)

In [21]:
d = {'width': "What is the width of this item?",
"item_weight": "What is the weight of the item?",
"height": "What is the height of the item?",
"depth": "What is the depth of this item?",
"maximum_weight_recommendation": "What is the maximum recommended weight for this item?",
"wattage": "What is the wattage of this item?",
"voltage": "What is the operating voltage of the item?",
"item_volume": "What is the volume of this item?"}

In [22]:
df['question'] = df['entity_name'].apply(lambda x:d[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['question'] = df['entity_name'].apply(lambda x:d[x])


In [23]:
df

Unnamed: 0,index,image_link,group_id,entity_name,image_id,question
15,15,https://m.media-amazon.com/images/I/216rjgJHAe...,279307,item_weight,216rjgJHAeL.jpg,What is the weight of the item?
94,94,https://m.media-amazon.com/images/I/31+zdbOuiT...,279307,item_weight,31+zdbOuiTL.jpg,What is the weight of the item?
113,113,https://m.media-amazon.com/images/I/3101lgy28B...,219211,item_weight,3101lgy28BL.jpg,What is the weight of the item?
114,114,https://m.media-amazon.com/images/I/3105qskWRc...,219211,item_weight,3105qskWRcL.jpg,What is the weight of the item?
117,117,https://m.media-amazon.com/images/I/3106iDqsfQ...,267482,item_weight,3106iDqsfQL.jpg,What is the weight of the item?
...,...,...,...,...,...,...
131123,131224,https://m.media-amazon.com/images/I/91xm1-3iQs...,240413,voltage,91xm1-3iQsL.jpg,What is the operating voltage of the item?
131146,131247,https://m.media-amazon.com/images/I/91zGFUNJCb...,219211,item_weight,91zGFUNJCbL.jpg,What is the weight of the item?
131158,131259,https://m.media-amazon.com/images/I/A1+vAtYHI3...,257505,item_volume,A1+vAtYHI3L.jpg,What is the volume of this item?
131176,131277,https://m.media-amazon.com/images/I/A1fMYCkFGp...,279307,item_weight,A1fMYCkFGpL.jpg,What is the weight of the item?


In [None]:
# df_train[df_train['image_link'].apply(lambda x: x.split('/')[-1]) == '21-VzxP3BDL.jpg']

In [None]:
# plt.imshow(image1)

In [97]:
# Crop the image to the desired dimensions.
# target_size = (224, 224)
# image_url = 'https://storage.googleapis.com/keras-cv/models/paligemma/cow_beach_1.png'
# # image_url = "/kaggle/input/ml-amazon/test_1000each/test_1000each/21-VzxP3BDL.jpg"
# image = read_image(image_url, target_size)
# plt.imshow(image)

Here's a generation call with a single image and prompt. The prompt should always end with `\n`.

There's a few other style of prompts this model can handle out of the box...

- `"cap {lang}\n"`: very raw short caption (from WebLI-alt).
- `"caption {lang}\n"`: nice, coco-like short captions.
- `"describe {lang}\n"`: somewhat longer more descriptive captions.
- `"ocr\n"`: optical character recognition.
- `"answer en {question}\n"`: question answering about the image contents.
- `"question {lang} {answer}\n"`: question generation for a given answer.
- `"detect {thing} ; {thing}\n"`: count objects in a scene.

Try them out!

In [24]:
# qstn = 'What is the volume of this item'

def get_prediction(image,question):
#     prompt = f"ocr\n"
    prompt = f"answer en {question}\n"
    
    output = pali_gemma_lm.generate(
        inputs={
            "images": image,
            "prompts": prompt,
        }
    )
    return output.split('\n')[1]

In [99]:
# image_url = "/kaggle/input/ml-amazon/test_full/all_data/test/51jYqL79yML.jpg"
# image1 = cv2.imread(image_url)

In [100]:
# image1

In [101]:
# df_train[df_train['image_link'].apply(lambda x: x.split('/')[-1]) == '51jYqL79yML.jpg']

In [102]:
# plt.imshow(image1)

In [103]:
# get_prediction(image1,"What is the height of this item?")

In [25]:
from tqdm import tqdm
tqdm.pandas()

In [26]:
# for img in  df['image_id']:
#     img_path = os.path.join(path,img)
#     image = cv2.imread(img_path)
    
path = "/kaggle/input/ml-amazon/test_full/all_data/test"
f = lambda x: cv2.imread(os.path.join(path,x))

In [111]:
# temp = df.sample(1000)

In [27]:
# df[:3].progress_apply(lambda x:get_prediction(f(x['image_id']), x['question']),axis=1)

100%|██████████| 3/3 [00:19<00:00,  6.60s/it]


15      4 lbs
94     3 pack
113    2.4 kg
dtype: object

In [120]:
# df.shape

(19276, 6)

In [None]:
df['pred'] = df.progress_apply(lambda x:get_prediction(f(x['image_id']), x['question']),axis=1)

  1%|          | 24/3000 [00:28<1:00:19,  1.22s/it]

In [None]:
df.to_csv('result_3000.csv',index=False)

In [None]:
# prompts = df['question'].apply(lambda x: f"answer en {x}\n").tolist()

In [None]:
# images = []
# for img in  tqdm(df['image_id']):
#     img_path = os.path.join(path,img)
#     image = cv2.imread(img_path)
#     images.append(image)

In [None]:
# prompts = [
#     'answer en where is the cow standing?\n',
#     'answer en what color is the cow?\n',
#     'describe en\n',
#     'detect cow\n',
#     'segment cow\n',
# ]
# images = [image, image, image, image, image]

# outputs = pali_gemma_lm.generate(
#     inputs={
#         "images": images,
#         "prompts": prompts,
#     }
# )
# for output in outputs:
#     print(output)

In [None]:
# prompt = 'detect cow\n'
# output = pali_gemma_lm.generate(
#     inputs={
#         "images": image,
#         "prompts": prompt,
#     }
# )
# boxes, labels = parse_bbox_and_labels(output)
# display_boxes(image, boxes, labels, target_size)

Here's a generation call with batched inputs.

In [None]:
# prompts = [
#     'answer en where is the cow standing?\n',
#     'answer en what color is the cow?\n',
#     'describe en\n',
#     'detect cow\n',
#     'segment cow\n',
# ]
# images = [image, image, image, image, image]
# outputs = pali_gemma_lm.generate(
#     inputs={
#         "images": images,
#         "prompts": prompts,
#     }
# )
# for output in outputs:
#     print(output)