## Captioning / Scene Description

In [None]:
import json

from os import listdir, makedirs, path
from PIL import Image as PImage

from captions_models import Blip, Vit, CPM2, GPT4, EnPt

from parameters import IMAGES_PATH, CAPTIONS_PATH

# Create the captions output directory up front; exist_ok=True makes re-running this cell harmless.
makedirs(CAPTIONS_PATH, exist_ok=True)

### Run Captions

In [None]:
%%time

# Caption the first 4096 images (sorted by file name) with the local models and
# GPT-4, translate the English captions to Portuguese and write one JSON file
# per image into CAPTIONS_PATH. Images that already have a JSON file are
# skipped, so the cell can be re-run to resume an interrupted pass.
input_files = sorted([f for f in listdir(IMAGES_PATH) if f.endswith("jpg")])

for io_file in input_files[:4096]:
  img_id = io_file.replace(".jpg", "")
  input_file_path = path.join(IMAGES_PATH, io_file)
  output_file_path = path.join(CAPTIONS_PATH, io_file.replace(".jpg", ".json"))

  if path.isfile(output_file_path):
    continue

  print(IMAGES_PATH, io_file)

  image = PImage.open(input_file_path).convert("RGB")

  image_captions = {}
  image_captions["pt"] = {}

  image_captions["en"] = {
    "cpm": CPM2.caption(image),
    "blip": Blip.caption(image),
    "vit": Vit.caption(image),
  }

  # Portuguese captions are machine translations of the English ones
  for k, txt in image_captions["en"].items():
    image_captions["pt"][k] = EnPt.translate(txt)

  # Reset per image so a failure never reports the previous image's GPT result
  # (and never hits an unbound name on the very first image)
  gpt_cap = None
  try:
    gpt_cap = GPT4.caption(img_id)
    # The result is unpacked as an (english, portuguese) pair
    image_captions["en"]["gpt"], image_captions["pt"]["gpt"] = gpt_cap
  except Exception as err:
    # Best effort: report and move on without writing, so the image is retried
    # on the next run. Exception (not a bare except) keeps Ctrl-C working.
    print(img_id, gpt_cap, repr(err))
  else:
    with open(output_file_path, "w", encoding="utf-8") as of:
      json.dump(image_captions, of, sort_keys=True, separators=(',',':'), ensure_ascii=False)

### Check GPT

In [None]:
# List the images (among the first 4096) that still lack a GPT caption: either
# no captions file exists yet, or its "en" section has no "gpt" entry.
input_files = sorted(f for f in listdir(IMAGES_PATH) if f.endswith("jpg"))

no_gpt = []

for io_file in input_files[:4096]:
  input_file_path = path.join(IMAGES_PATH, io_file)
  output_file_path = path.join(CAPTIONS_PATH, io_file.replace(".jpg", ".json"))

  # No captions file at all counts as "missing GPT"
  if not path.isfile(output_file_path):
    no_gpt.append(io_file)
    continue

  with open(output_file_path, "r", encoding="utf8") as capf:
    fcaps = json.load(capf)

  if "gpt" not in fcaps["en"]:
    no_gpt.append(io_file)

# Cell output: number of images without a GPT caption
len(no_gpt)

### Clean up GPT Captions

In [None]:
# Repair the stored GPT captions in place, per language ("en" and "pt"):
#   1. a missing "gpt" entry falls back to the "cpm" caption;
#   2. "english: " / "portuguese: " labels are stripped;
#   3. a caption that is only "picture of " / "imagem de " falls back to "cpm".
# A file is rewritten (and its name printed) only when something changed.
input_files = sorted(f for f in listdir(CAPTIONS_PATH) if f.endswith("json"))

for io_file in input_files:
  file_path = path.join(CAPTIONS_PATH, io_file)

  with open(file_path, "r", encoding="utf8") as inf:
    fcaps = json.load(inf)

  ntw = False  # "need to write": set as soon as any caption is modified

  for lang in ("en", "pt"):
    lang_caps = fcaps[lang]

    if "gpt" not in lang_caps:
      lang_caps["gpt"] = lang_caps["cpm"]
      ntw = True

    if any(label in lang_caps["gpt"] for label in ("english: ", "portuguese: ")):
      lang_caps["gpt"] = lang_caps["gpt"].replace("english: ", "").replace("portuguese: ", "")
      ntw = True

    if lang_caps["gpt"].lower() in ("picture of ", "imagem de "):
      lang_caps["gpt"] = lang_caps["cpm"]
      ntw = True

  if not ntw:
    continue

  print(io_file)
  with open(file_path, "w", encoding="utf8") as outf:
    json.dump(fcaps, outf, sort_keys=True, separators=(',',':'), ensure_ascii=False)

In [None]:
# Strip chatty boilerplate ("sure, here are the nouns", markdown bold markers,
# ...) from the stored GPT captions and print a before/after line for every
# hit. By default this is a dry run: nothing is written back unless
# WRITE_CLEANED is set to True.
input_files = sorted([f for f in listdir(CAPTIONS_PATH) if f.endswith("json")])

# Phrases are removed one after another in list order, so a phrase must be
# listed before any shorter phrase it contains (" certainly," before
# " certainly"), otherwise the longer one can never match.
to_remove_en = [
  "sure, here are the nouns",
  "sure, here are the",
  "sure here are the nouns",
  "sure, here you go",
  "sure here you go",
  " sure,",
  " sure",
  " nouns in",
  " nouns",
  " certainly!",
  " certainly,",
  " certainly",
  "**english**",
]

to_remove_pt = [
  "**portuguese**",
  "**português**",
  "**",
  " substantivos",
]

# Flip to True to persist the cleaned captions; False only previews the changes
WRITE_CLEANED = False


def _normalize_caption(text):
  """Drop newlines, periods and colons and collapse double spaces (single pass)."""
  return text.replace("\n", "").replace("  ", " ").replace(".", "").replace(":", "")


def _strip_phrases(io_file, text, phrases):
  """Remove each phrase found in text, in list order, printing before/after per hit."""
  for tr in phrases:
    if tr in text:
      cleaned = text.replace(tr, "")
      print(io_file, tr, "\n\t", text, "\n\t", cleaned)
      text = cleaned
  return text


for io_file in input_files:
  file_path = path.join(CAPTIONS_PATH, io_file)

  with open(file_path, "r", encoding="utf8") as inf:
    fcaps = json.load(inf)

  gpt_en = _normalize_caption(fcaps["en"]["gpt"])
  gpt_pt = _normalize_caption(fcaps["pt"]["gpt"])

  fcaps["en"]["gpt"] = _strip_phrases(io_file, gpt_en, to_remove_en)
  fcaps["pt"]["gpt"] = _strip_phrases(io_file, gpt_pt, to_remove_pt)

  if WRITE_CLEANED:
    with open(file_path, "w", encoding="utf8") as outf:
      json.dump(fcaps, outf, sort_keys=True, separators=(',',':'), ensure_ascii=False)

### Post-Process: Create output json file

In [None]:
from export_utils import export_objs_caps
from parameters import OBJECTS_PATH, CAPTIONS_PATH, DB_FILE_PATH

# Build the output JSON (per the section heading) from OBJECTS_PATH and CAPTIONS_PATH.
# NOTE(review): export_objs_caps lives in export_utils (not shown); presumably DB_FILE_PATH is the destination — confirm.
export_objs_caps(OBJECTS_PATH, CAPTIONS_PATH, DB_FILE_PATH)

### Post-Process: Create separate json files

In [None]:
from export_utils import export_by_keys
from parameters import CAPTIONS_PATH

# Keys to export from the caption data.
# NOTE(review): export_by_keys lives in export_utils (not shown); presumably it writes one separate JSON file per key — confirm.
keys = ["captions"]
export_by_keys(CAPTIONS_PATH, keys)

In [None]:
from export_utils import export_all_captions
from parameters import CAPTIONS_PATH

# NOTE(review): export_all_captions lives in export_utils (not shown); presumably it aggregates every caption found under CAPTIONS_PATH — confirm.
export_all_captions(CAPTIONS_PATH)

### TEST: EN/PT Translation

In [None]:
from models.EnPt import EnPt

# English sample sentences for eyeballing the EN->PT translator: three short
# generic sentences followed by caption-like image descriptions.
PHRASES = [
  "I like to eat rice.",
  "Tom tried to stab me.",
  "He has been to Hawaii several times.",
  "The image features a white house with black trim, windows on the front and side walls.",
  "This image features a modern, open-concept living space with an eye-catching staircase and various furniture pieces.",
  "The image depicts an interior space with a staircase, furniture such as chairs and tables.",
  "The image showcases a modern building with glass walls, concrete stairs leading to it and greenery surrounding the area.",
  "The image shows a view through glass panes, revealing indoor furniture and plants outside.",
  "The image is of a modern building with large windows and columns.",
]

# Translate and print one sentence at a time so output appears as it is produced
for phrase in PHRASES:
  translated = EnPt.translate(phrase)
  print(translated)

### TEST: Description Translation

In [None]:
from os import path
from PIL import Image as PImage

from models.Blip import Blip
from models.Vit import Vit
from models.EnPt import EnPt

from parameters import IMAGES_PATH

In [None]:
# Load one fixed sample image and force 3-channel RGB before captioning.
input_file_path = path.join(IMAGES_PATH, "10026.jpg")
image = PImage.open(input_file_path).convert("RGB")

In [None]:
# Caption the sample image with Blip, then translate that caption with EnPt.
cap = Blip.caption(image)
cap_pt = EnPt.translate(cap)

# display() is the notebook helper that renders the image; the trailing tuple is the cell's output.
display(image)
cap, cap_pt

## TEST: Caption Models

In [None]:
import torch

from os import path
from PIL import Image as PImage
from transformers import AutoProcessor, LlavaForConditionalGeneration, LlavaNextForConditionalGeneration

from parameters import IMAGES_PATH

# Test image selection: only the last assignment takes effect; the earlier one is kept as a quick alternative.
io_file = "10027.jpg"
io_file = "10000.jpg"

# Load the selected image as RGB for the model cells in this section.
input_file_path = path.join(IMAGES_PATH, io_file)
image = PImage.open(input_file_path).convert("RGB")

### Chat Models

In [None]:
def prep_xtuner_llava_model(model_name):
  """Load a LLaVA checkpoint together with its processor.

  The model is loaded with ``LlavaForConditionalGeneration`` in bfloat16 and
  moved to the "cuda" device.

  Args:
    model_name: identifier handed to ``from_pretrained`` for both the
      processor and the model.

  Returns:
    A ``(model, processor)`` tuple.
  """
  llava_processor = AutoProcessor.from_pretrained(model_name)
  llava_model = LlavaForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
  )
  llava_model = llava_model.to("cuda", dtype=torch.bfloat16)
  return llava_model, llava_processor

def run_xtuner_llava(model, processor, user_image, user_text=None):
  """Caption an image with a LLaVA model using the ``<|user|>``/``<|assistant|>`` prompt format.

  Args:
    model: object exposing ``generate(**inputs, ...)``.
    processor: callable that builds the model inputs from ``images``/``text``
      and exposes ``decode``.
    user_image: image passed to the processor.
    user_text: instruction for the model; when None, a default "8 nouns"
      architecture/urbanism prompt is used.

  Returns:
    The first generated sequence as decoded by the processor with special
    tokens skipped.
  """
  # Identity check: only a genuinely absent prompt triggers the default
  if user_text is None:
    user_text = "Describe the image using only 8 nouns. Focus on architecture and urbanism aspects."

  prompt = f"<|user|>\n<image>\n{user_text}<|end|>\n<|assistant|>\n"
  inputs = processor(images=user_image, text=prompt, return_tensors="pt").to("cuda", dtype=torch.bfloat16)
  # do_sample=False requests non-sampling decoding
  output = model.generate(**inputs, max_new_tokens=100, do_sample=False)
  caption = processor.decode(output[0], skip_special_tokens=True)
  return caption

  
def prep_llava_model(model_name):
  """Load a LLaVA-NeXT checkpoint together with its processor.

  The model is loaded with ``LlavaNextForConditionalGeneration`` in bfloat16
  and moved to the "cuda" device.

  Args:
    model_name: identifier handed to ``from_pretrained`` for both the
      processor and the model.

  Returns:
    A ``(model, processor)`` tuple.
  """
  next_processor = AutoProcessor.from_pretrained(model_name)
  next_model = LlavaNextForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
  )
  next_model = next_model.to("cuda", dtype=torch.bfloat16)
  return next_model, next_processor

def run_llava(model, processor, user_image, user_text=None):
  """Caption an image with a LLaVA-NeXT model through the processor's chat template.

  Args:
    model: object exposing ``generate(**inputs, ...)``.
    processor: object exposing ``apply_chat_template``, ``decode`` and a call
      interface that builds the model inputs from ``images``/``text``.
    user_image: image passed to the processor.
    user_text: instruction for the model; when None, a default "8 nouns"
      architecture/urbanism prompt is used.

  Returns:
    The first generated sequence as decoded by the processor with special
    tokens skipped.
  """
  # Identity check: only a genuinely absent prompt triggers the default
  if user_text is None:
    user_text = "Describe the image using only 8 nouns. Focus on architecture and urbanism aspects."

  # Single user turn: the text instruction followed by an image placeholder
  conversation = [{
    "role": "user",
    "content": [
      {"type": "text", "text": user_text},
      {"type": "image"},
    ]
  }]

  prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
  inputs = processor(images=user_image, text=prompt, return_tensors="pt").to("cuda", dtype=torch.bfloat16)
  # NOTE(review): unlike run_xtuner_llava, do_sample is not passed here — confirm this is intended
  output = model.generate(**inputs, max_new_tokens=100)
  caption = processor.decode(output[0], skip_special_tokens=True)
  return caption

In [None]:
# Trial: load the checkpoint named below via prep_llava_model and caption the test image with the default prompt.
MODEL_NAME = "llava-hf/llava-v1.6-vicuna-7b-hf"

model, processor = prep_llava_model(MODEL_NAME)
caption = run_llava(model, processor, image)
caption

In [None]:
# Trial: same flow as the other LLaVA-NeXT cell, with a different checkpoint; rebinds model/processor/caption.
MODEL_NAME = "llava-hf/llava-v1.6-mistral-7b-hf"

model, processor = prep_llava_model(MODEL_NAME)
caption = run_llava(model, processor, image)
caption

In [None]:
# Trial: load the checkpoint named below via prep_xtuner_llava_model and caption the test image with the default prompt.
MODEL_NAME = "xtuner/llava-llama-3-8b-transformers"

model, processor = prep_xtuner_llava_model(MODEL_NAME)
caption = run_xtuner_llava(model, processor, image)
caption

In [None]:
# Trial: same xtuner flow with the v1_1 checkpoint; rebinds model/processor/caption.
MODEL_NAME = "xtuner/llava-llama-3-8b-v1_1-transformers"

model, processor = prep_xtuner_llava_model(MODEL_NAME)
caption = run_xtuner_llava(model, processor, image)
caption

In [None]:
# Trial: same xtuner flow with the phi-3-mini checkpoint; rebinds model/processor/caption.
MODEL_NAME = "xtuner/llava-phi-3-mini-hf"

model, processor = prep_xtuner_llava_model(MODEL_NAME)
caption = run_xtuner_llava(model, processor, image)
caption

In [None]:
from models.CPM2 import CPM2
from models.CPM2_6 import CPM2_6

In [None]:
# Side-by-side comparison: the cell output is the pair (CPM2 caption, CPM2_6 caption) for the same image.
CPM2.caption(image), CPM2_6.caption(image)

### Image Models

In [None]:
from os import path
from PIL import Image as PImage

from parameters import IMAGES_PATH

# Test image selection: only the last assignment takes effect; the earlier one is kept as a quick alternative.
io_file = "10027.jpg"
io_file = "10000.jpg"

# Load the selected image as RGB for the image-model cell below.
input_file_path = path.join(IMAGES_PATH, io_file)
image = PImage.open(input_file_path).convert("RGB")

In [None]:
from models.Blip import Blip
from models.Vit import Vit

# Side-by-side comparison: the cell output is the pair (Blip caption, Vit caption) for the same image.
Blip.caption(image), Vit.caption(image)

### llama-cpp

In [None]:
import base64

from os import listdir, path
from PIL import Image as PImage

from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler

# Hard-coded image folder for this experiment; this rebinds IMAGES_PATH for any cell run afterwards.
IMAGES_PATH = "../../imgs/arquigrafia"
INPUT_FILES = sorted([f for f in listdir(IMAGES_PATH) if f.endswith("jpg")])

# Candidate test images: only the last assignment ("12451.jpg") takes effect.
# The first line still indexes INPUT_FILES, so it needs at least 11 jpg files in the folder.
io_file = INPUT_FILES[10]
io_file = "10027.jpg"
io_file = "10000.jpg"
io_file = "12451.jpg"

# The file path is what run_llama_cpp receives later; the RGB image is loaded alongside it.
input_file_path = path.join(IMAGES_PATH, io_file)
image = PImage.open(input_file_path).convert("RGB")

In [None]:
def image_to_base64_data_uri(file_path):
  """Return the contents of an image file as a base64 ``data:`` URI.

  The MIME type is guessed from the file extension. When it cannot be guessed,
  or the guess is not an image type, ``image/jpeg`` is used, which is the value
  this function always used before.

  Args:
    file_path: path of the image file to read.

  Returns:
    A string of the form ``data:<mime>;base64,<payload>``.

  Raises:
    OSError: if the file cannot be opened or read.
  """
  import mimetypes  # local import keeps this notebook cell self-contained

  mime_type, _ = mimetypes.guess_type(file_path)
  if mime_type is None or not mime_type.startswith("image/"):
    mime_type = "image/jpeg"

  with open(file_path, "rb") as img_file:
    base64_data = base64.b64encode(img_file.read()).decode('utf-8')
  return f"data:{mime_type};base64,{base64_data}"

def prep_llama_cpp_model(model_name, model_file, proj_file):
  """Download a GGUF model plus its projector file and build a llama-cpp ``Llama``.

  Args:
    model_name: Hugging Face repo id both files are downloaded from.
    model_file: file name of the language-model weights in that repo.
    proj_file: file name of the projector weights handed to the
      ``Llava15ChatHandler`` as ``clip_model_path``.

  Returns:
    The configured ``Llama`` instance.
  """
  weights_path = hf_hub_download(model_name, filename=model_file)
  projector_path = hf_hub_download(model_name, filename=proj_file)

  # Fixed runtime settings used for every model loaded through this helper
  llama_options = dict(verbose=False, n_ctx=4096, n_threads=8, n_gpu_layers=-1, logits_all=True)

  # Alternative kept from the original: Llama.from_pretrained(repo_id=..., filename=...)
  # with the same options, minus the chat handler.
  return Llama(
    model_path=weights_path,
    chat_handler=Llava15ChatHandler(clip_model_path=projector_path),
    **llama_options,
  )

def run_llama_cpp(llm, user_image_path, user_text=None):
  """Caption an image file through a llama-cpp chat completion.

  Args:
    llm: object exposing ``create_chat_completion``.
    user_image_path: path of the image file; it is embedded in the request as
      a base64 data URI.
    user_text: instruction for the model; when None, a default "8 nouns"
      architecture/urbanism prompt is used.

  Returns:
    The message content of the first choice in the completion result.
  """
  user_image_uri = image_to_base64_data_uri(user_image_path)
  # Identity check: only a genuinely absent prompt triggers the default
  if user_text is None:
    user_text = "Describe the image using only 8 nouns. Focus on architecture and urbanism aspects."

  messages = [
    {"role": "system", "content": "You are an assistant who perfectly describes images using only nouns."},
    {"role": "user", "content": [
      {"type": "image_url", "image_url": {"url":  user_image_uri }},
      {"type" : "text", "text": user_text}
    ]}
  ]
  # top_k=1 restricts sampling to the single most likely token at each step
  res = llm.create_chat_completion(messages=messages, max_tokens=60, stop=["</s>"], top_k=1)
  return res["choices"][0]["message"]["content"]

In [None]:
# Repo id plus the two files prep_llama_cpp_model downloads from it: the model weights and the projector file.
MODEL_NAME = "xtuner/llava-llama-3-8b-v1_1-gguf"
MODEL_FILE = "llava-llama-3-8b-v1_1-int4.gguf"
PROJ_FILE = "llava-llama-3-8b-v1_1-mmproj-f16.gguf"

llm = prep_llama_cpp_model(MODEL_NAME, MODEL_FILE, PROJ_FILE)

In [None]:
# run_llama_cpp takes the image file path (not the loaded PIL image) and uses its default prompt here.
caption = run_llama_cpp(llm, input_file_path)
caption

### OpenAI

In [None]:
from models.GPT4 import GPT4

In [None]:
# Smoke test of the GPT4 wrapper: it is called with an image id string (no file extension), not with an image object.
GPT4.caption("10026")

In [None]:
# https://platform.openai.com/docs/guides/vision
# https://github.com/openai/openai-python

from openai import OpenAI
from msecrets import OPENAI_API_KEY

# Raw OpenAI request, independent of the GPT4 wrapper: asks for a nouns-only
# description in English and Portuguese, separated by the word SEPARATOR.
client = OpenAI(api_key=OPENAI_API_KEY)

response = client.chat.completions.create(
  #model="gpt-4o-mini",
  model="gpt-4o-2024-08-06",
  messages=[{
      "role": "user",
      "content": [
        {"type": "text", "text": "What’s in this image? Answer using only nouns. Answer in english and portuguese."},
        {"type": "text", "text": "Separate english and portuguese descriptions with the word SEPARATOR"},
        # The image is referenced by public URL rather than uploaded
        {"type": "image_url",
          "image_url": {"url": "https://www.arquigrafia.org.br/arquigrafia-images/10026_view.jpg",},
        },
      ],
    }],
  max_tokens=200,
)

# Print the whole first choice for inspection; the cell output is just its text content.
print(response.choices[0])
response.choices[0].message.content

### TEST: Binaries

In [None]:
import numpy as np

from os import listdir, path
from PIL import Image as PImage

# Hard-coded image folder for this experiment; this rebinds IMAGES_PATH for any cell run afterwards.
IMAGES_PATH = "../../imgs/arquigrafia"
INPUT_FILES = sorted([f for f in listdir(IMAGES_PATH) if f.endswith("jpg")])

# Test image: the 101st jpg in sorted order (needs at least 101 files in the folder).
io_file = INPUT_FILES[100]
input_file_path = path.join(IMAGES_PATH, io_file)

In [None]:
# Pairs of opposite attributes. run_binaries asks the model to choose between
# the two words of each pair (or "neither"); bin2float scores the first word
# as 0.0 and the second as 1.0.
BINARIES = [
  ["horizontal", "vertical"],
  ["translucent", "opaque"],
  ["symmetric", "asymmetric"],
  ["complex", "simple"],
  ["internal", "external"],
  ["open", "closed"],
]

In [None]:
def bin2float(b, b0, b1):
  """Map a model's answer onto a score between two opposite labels.

  Args:
    b: the answer text to classify (callers pass it lower-cased).
    b0: label scored as 0.0.
    b1: label scored as 1.0.

  Returns:
    0.0 if ``b`` contains ``b0``, 1.0 if it contains ``b1``, and 0.5 if it
    contains "neither" or nothing recognisable (the latter also prints a
    "b2f error" line).
  """
  # Labels are matched as substrings. When b0 is itself contained in b1
  # ("symmetric" inside "asymmetric"), b1 must be tested first, otherwise
  # every b1 answer would be scored as b0.
  if b0 != b1 and b0 in b1:
    ordered = ((b1, 1.0), (b0, 0.0))
  else:
    ordered = ((b0, 0.0), (b1, 1.0))

  for label, score in ordered:
    if label in b:
      return score

  if "neither" not in b:
    print("b2f error: ", b)
  return 0.5

def run_binaries(img, model, bins):
  """Ask the model, for each pair of opposite words, which one describes the image better.

  Args:
    img: image passed to the model's ``chat`` call.
    model: mapping with a "model" entry (object exposing ``chat``) and a
      "pre" entry (passed to ``chat`` as ``tokenizer``).
    bins: iterable of two-element sequences of opposite labels.

  Returns:
    A dict keyed by "label0/label1" with the score from ``bin2float``:
    0.0 for the first label, 1.0 for the second, 0.5 for "neither" or an
    unusable answer.
  """
  bin_results = {}
  for b in bins:
    chat = [{
      "role": "user",
      "content": f"Is the architecture pictured in the image more {b[0]}, {b[1]} or neither? Answer using only the words {b[0]}, {b[1]} or neither"
    }]
    # chat() returns a 3-tuple; only the first element (the answer text) is used
    response, _, _ = model["model"].chat(
      image=img,
      msgs=chat,
      max_length=4,
      context=None,
      tokenizer=model["pre"],
      sampling=True,
      temperature=0.005
    )

    response_list = response.split()
    if len(response_list) != 1:
      print("wtf", response_list)

    # An empty answer must not crash the whole run: score it as unrecognised
    # (bin2float reports it and returns 0.5)
    first_word = response_list[0].lower() if response_list else ""
    bin_results["/".join(b)] = bin2float(first_word, b[0], b[1])

  return bin_results

In [None]:
%%time

# Score one image against every pair in BINARIES.
# NOTE(review): CAP_MODEL is not defined in any visible cell; run_binaries reads
# model["model"] and model["pre"], so it has to be a mapping with those two keys.
image = PImage.open(input_file_path).convert("RGB")
image_binaries = run_binaries(image, CAP_MODEL, BINARIES)

# display() is the notebook helper that renders the image; the dict of scores is the cell's output.
display(image)
image_binaries