In [1]:
##### Every time #####
import argparse
import gradio as gr
import os, sys
sys.path.append(os.path.dirname("/workspace/CogAgent/basic_demo"))
world_size = int(os.environ.get('WORLD_SIZE', 1))
from PIL import Image
import torch
import time
from sat.model.mixins import CachedAutoregressiveMixin
from sat.mpu import get_model_parallel_world_size
from sat.model import AutoModel
from sat.quantization.kernels import quantize

from utils.utils import chat, llama2_tokenizer, llama2_text_processor_inference, get_image_processor, parse_response, get_grounding_image_processor
from utils.models import CogAgentModel, CogVLMModel


# --- Static UI text ---------------------------------------------------------
# HTML snippets and hints worded for a web app (NOTES says the app is adapted
# from the CogVLM repository). None of the cells visible in this notebook
# reference them; presumably they are kept for the Gradio front end -- confirm
# before removing.
DESCRIPTION = '''<h1 style='text-align: center'> <a href="https://github.com/THUDM/CogVLM">CogVLM / CogAgent</a> </h1>'''

NOTES = '<h3> This app is adapted from <a href="https://github.com/THUDM/CogVLM">https://github.com/THUDM/CogVLM</a>. It would be recommended to check out the repo if you want to see the detail of our model, CogVLM & CogAgent. </h3>'

MAINTENANCE_NOTICE1 = 'Hint 1: If the app report "Something went wrong, connection error out", please turn off your proxy and retry.<br>Hint 2: If you upload a large size of image like 10MB, it may take some time to upload and process. Please be patient and wait.'


AGENT_NOTICE = 'Hint 1: To use <strong>Agent</strong> function, please use the <a href="https://github.com/THUDM/CogVLM/blob/main/utils/utils/template.py#L761">prompts for agents</a>.'

GROUNDING_NOTICE = 'Hint 2: To use <strong>Grounding</strong> function, please use the <a href="https://github.com/THUDM/CogVLM/blob/main/utils/utils/template.py#L344">prompts for grounding</a>.'

# Initial chat history: one (user, assistant) pair with an empty user turn.
default_chatbox = [("", "Hi, What do you want to know about this image?")]


# --- Placeholders for the objects returned by load_model() ------------------
# All five names are initialised so that code touching them before the model
# is loaded sees None rather than raising NameError. Previously
# cross_image_processor and grounding_image_processor were left undefined.
model = image_processor = cross_image_processor = text_processor_infer = grounding_image_processor = None

# Re-derived from the checkpoint path once the model is loaded.
is_grounding = False


def process_image_without_resize(image_prompt):
    """Open an image and derive a timestamped path for its grounding render.

    Args:
        image_prompt: Path of the image file to open.

    Returns:
        A ``(image, filename_grounding)`` tuple. ``image`` is whatever
        ``Image.open(image_prompt)`` returns (no resizing is applied here).
        ``filename_grounding`` has the form
        ``examples/<unix-seconds>_grounding<ext>``, where ``<ext>`` is the
        extension of ``image_prompt``. This function only builds that path;
        it does not write anything to it.
    """
    # Open first so that a bad path fails before any name is derived.
    pil_image = Image.open(image_prompt)
    stamp = int(time.time())
    _, extension = os.path.splitext(image_prompt)
    grounding_path = "examples/{}_grounding{}".format(stamp, extension)
    return pil_image, grounding_path

[2024-03-24 16:53:29,886] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
##### Just Once #####
def load_model(args):
    """Load the checkpoint and build every processor needed for inference.

    Reads the module-level ``world_size`` in addition to ``args``.

    Args:
        args: Namespace providing ``from_pretrained``, ``fp16``, ``bf16``,
            ``quant``, ``vg_token_idx``, ``version``, ``local_tokenizer``,
            ``gnd_image_pix`` and ``max_length``.

    Returns:
        The tuple ``(model, image_processor, cross_image_processor,
        text_processor_infer, grounding_image_processor)``.
        ``cross_image_processor`` is ``None`` when the loaded model arguments
        contain no ``cross_image_pix`` entry.

    Raises:
        AssertionError: if ``world_size`` differs from the model-parallel
            world size reported after loading.
    """
    checkpoint = args.from_pretrained

    # Settings handed to AutoModel.from_pretrained. When quantization is
    # requested, device='cpu' is passed and GPU initialization is disabled.
    init_args = argparse.Namespace(
        deepspeed=None,
        local_rank=0,
        rank=0,
        world_size=world_size,
        model_parallel_size=world_size,
        mode='inference',
        fp16=args.fp16,
        bf16=args.bf16,
        skip_init=True,
        use_gpu_initialization=bool(torch.cuda.is_available() and args.quant is None),
        device='cpu' if args.quant else 'cuda',
        vg_token_idx=args.vg_token_idx,
    )

    # Only override the stored model-parallel size when running on >1 rank.
    if world_size != 1:
        overrides = {'model_parallel_size': world_size}
    else:
        overrides = {}

    model, model_args = AutoModel.from_pretrained(
        checkpoint,
        args=init_args,
        overwrite_args=overrides,
    )
    model = model.eval()
    assert world_size == get_model_parallel_world_size(), "world size must equal to model parallel size for cli_demo!"

    # Prefer the text-processor version stored with the checkpoint, falling
    # back to the version requested by the caller.
    if 'text_processor_version' in model_args:
        language_processor_version = model_args.text_processor_version
    else:
        language_processor_version = args.version
    tokenizer = llama2_tokenizer(args.local_tokenizer, signal_type=language_processor_version)

    image_processor = get_image_processor(model_args.eva_args["image_size"][0])
    cross_image_processor = None
    if "cross_image_pix" in model_args:
        cross_image_processor = get_image_processor(model_args.cross_image_pix)
    grounding_image_processor = get_grounding_image_processor(args.gnd_image_pix)

    # Quantize before the optional move to CUDA.
    if args.quant:
        quantize(model, args.quant)
        if torch.cuda.is_available():
            model = model.cuda()
    model.add_mixin('auto-regressive', CachedAutoregressiveMixin())

    text_processor_infer = llama2_text_processor_inference(tokenizer, args.max_length, model.image_length)

    return model, image_processor, cross_image_processor, text_processor_infer, grounding_image_processor

In [3]:
# ##### Every time #####
# def post(
#         input_text,
#         temperature,
#         top_p,
#         top_k,
#         image_prompt,
#         result_previous,
#         hidden_image,
#         state
#         ):
#     result_text = [(ele[0], ele[1]) for ele in result_previous]
#     for i in range(len(result_text)-1, -1, -1):
#         if result_text[i][0] == "" or result_text[i][0] == None:
#             del result_text[i]
#     print(f"history {result_text}")
    
#     global model, image_processor, cross_image_processor, text_processor_infer, grounding_image_processor, is_grounding
    
#     try:
#         with torch.no_grad():
#             pil_img, image_path_grounding = process_image_without_resize(image_prompt)
#             # response, _, cache_image = chat(
#             response, _, cache_image, bbox_outputs_dict = chat(
#                     image_path="", 
#                     model=model, 
#                     text_processor=text_processor_infer,
#                     img_processor=image_processor,
#                     grounding_img_processor=grounding_image_processor,
#                     query=input_text, 
#                     history=result_text, 
#                     cross_img_processor=cross_image_processor,
#                     image=pil_img, 
#                     max_length=2048, 
#                     top_p=top_p, 
#                     temperature=temperature,
#                     top_k=top_k,
#                     invalid_slices=text_processor_infer.invalid_slices if hasattr(text_processor_infer, "invalid_slices") else [],
#                     no_prompt=False,
#                     args=state['args']
#             )
#     except Exception as e:
#         print("error message", e)
#         result_text.append((input_text, 'Timeout! Please wait a few minutes and retry.'))
#         return "", result_text, hidden_image

#     answer = response
#     if is_grounding:
#         parse_response(pil_img, answer, image_path_grounding)
#         new_answer = answer.replace(input_text, "")
#         result_text.append((input_text, new_answer))
#         result_text.append((None, (image_path_grounding,)))
#     else:
#         result_text.append((input_text, answer))
    
#     print("Bounding box outputs: ", bbox_outputs_dict)
    
#     print("Text: ", result_text)
#     print('finished')
#     return "", result_text, hidden_image

In [4]:
##### Just Once #####
# Build the inference configuration, resolve the visual-grounding token id and
# load the model. Everything assigned here is module-level notebook state, so
# no `global` declaration is needed (it has no effect outside a function).
from argparse import Namespace

# fp16 and use_lora are enabled directly in the constructor rather than being
# created as False and flipped afterwards.
args = Namespace(
    max_length=2048,
    top_p=0.4,
    top_k=1,
    temperature=0.8,
    version="chat_old",
    quant=None,
    from_pretrained="../finetune_demo/checkpoints/finetune-cogagent-vqa-03-21-19-37/",
    local_tokenizer="lmsys/vicuna-7b-v1.5",
    fp16=True,
    bf16=False,
    stream_chat=False,
    gnd_image_pix=512,
    use_lora=True,
)
rank = int(os.environ.get('RANK', 0))
# Re-read here so this cell does not rely on the value computed at import time.
world_size = int(os.environ.get('WORLD_SIZE', 1))

from utils.utils import llama2_tokenizer
tokenizer = llama2_tokenizer(args.local_tokenizer, signal_type=args.version)
# Token whose id is handed to the model as `vg_token_idx`.
# NOTE(review): convert_tokens_to_ids presumably maps an out-of-vocabulary
# string to the unknown-token id without raising -- check the index printed
# below whenever vg_token is changed.
vg_token = "给"
args.vg_token_idx = tokenizer.convert_tokens_to_ids(vg_token)
print("Total number of tokens: ", tokenizer.vocab_size)
print("Using VG token: ", vg_token, " with index: ", args.vg_token_idx)
print("\n\nargs:", args)
# Guard against someone switching LoRA off above.
assert args.use_lora, "this notebook expects use_lora=True"

model, image_processor, cross_image_processor, text_processor_infer, grounding_image_processor = load_model(args)
# Grounding mode is inferred purely from the checkpoint path.
is_grounding = 'grounding' in args.from_pretrained

[2024-03-24 16:53:33,486] [INFO] building FineTuneTrainCogAgentModelNew model ...
[2024-03-24 16:53:33,491] [INFO] [RANK 0] > initializing model parallel with size 1
[2024-03-24 16:53:33,492] [INFO] [RANK 0] You didn't pass in LOCAL_WORLD_SIZE environment variable. We use the guessed LOCAL_WORLD_SIZE=1. If this is wrong, please pass the LOCAL_WORLD_SIZE manually.
[2024-03-24 16:53:33,493] [INFO] [RANK 0] You are using model-only mode.
For torch.distributed users or loading model parallel models, set environment variables RANK, WORLD_SIZE and LOCAL_RANK.


Total number of tokens:  32000
Using VG token:  给  with index:  31999


args: Namespace(max_length=2048, top_p=0.4, top_k=1, temperature=0.8, version='chat_old', quant=None, from_pretrained='../finetune_demo/checkpoints/finetune-cogagent-vqa-03-21-19-37/', local_tokenizer='lmsys/vicuna-7b-v1.5', fp16=True, bf16=False, stream_chat=False, gnd_image_pix=512, use_lora=True, vg_token_idx=31999)


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]



Loaded pretrained groundindino model from ../groundingdino_swinb_cogcoor.pth with msg: _IncompatibleKeys(missing_keys=['transformer.tgt_embed.weight'], unexpected_keys=['label_enc.weight', 'bert.embeddings.position_ids', 'bert.embeddings.word_embeddings.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 

[2024-03-24 16:56:24,482] [INFO] [RANK 0]  > number of parameters on model parallel rank 0: 18432088128
INFO:sat:[RANK 0]  > number of parameters on model parallel rank 0: 18432088128
[2024-03-24 16:57:57,847] [INFO] [RANK 0] 

Adding LoRA to the model for inference. Expecting the model weights to be in the LoRA format. If not, please stop the process and fix it
\m
INFO:sat:[RANK 0] 

Adding LoRA to the model for inference. Expecting the model weights to be in the LoRA format. If not, please stop the process and fix it
\m
[2024-03-24 16:57:57,851] [INFO] [RANK 0] replacing layer 0 attention with lora
INFO:sat:[RANK 0] replacing layer 0 attention with lora
[2024-03-24 16:57:58,882] [INFO] [RANK 0] replacing layer 0 cross attention with lora
INFO:sat:[RANK 0] replacing layer 0 cross attention with lora
[2024-03-24 16:57:59,670] [INFO] [RANK 0] replacing layer 1 attention with lora
INFO:sat:[RANK 0] replacing layer 1 attention with lora
[2024-03-24 16:58:00,816] [INFO] [RANK 0] replacing 

In [5]:
# for name, param in model.named_parameters():
#     print(name, param.dtype)

In [6]:
# One hard-coded example: a screenshot plus an agent-style instruction, with
# the visual-grounding token appended to the end of the query.
image_prompt = 'wf_deep13_g.png'
input_text = (
    'Task: Give me the list of 30 software engineer working in a Venture firm with minimum 10 years of experience. \n'
    ' Previous Action: TYPE: Type Software engineer in search for a job title tab \n'
    'Give me the next action?'
) + vg_token

result_text = []          # empty chat history
hidden_image = None
state = {'args': args}    # requires the `args` namespace built earlier

# NOTE(review): top_k=10 below differs from args.top_k (1), while max_length,
# top_p and temperature repeat the values stored in args -- confirm intended.
with torch.no_grad():
    pil_img, image_path_grounding = process_image_without_resize(image_prompt)
    # chat() is unpacked into five values here.
    text_processor, output, _, cache_image, bbox_outputs_dict = chat(
        image_path="",
        model=model,
        text_processor=text_processor_infer,
        img_processor=image_processor,
        grounding_img_processor=grounding_image_processor,
        query=input_text,
        history=result_text,
        cross_img_processor=cross_image_processor,
        image=pil_img,
        max_length=2048,
        top_p=0.4,
        temperature=0.8,
        top_k=10,
        # Fall back to no invalid slices when the processor defines none.
        invalid_slices=getattr(text_processor_infer, "invalid_slices", []),
        no_prompt=False,
        args=state['args'],
    )

[2024-03-24 17:04:38,988] [INFO] [RANK 0] Processing image...
INFO:sat:[RANK 0] Processing image...


image:  <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=1920x1080 at 0x7F69D581A710>




In [7]:
# Display the bounding-box outputs dict that chat() returned in the inference cell.
bbox_outputs_dict

{'bbox_outputs': {'pred_logits': tensor([[[-1.8351,    -inf,    -inf,  ...,    -inf,    -inf,    -inf],
           [-1.6891,    -inf,    -inf,  ...,    -inf,    -inf,    -inf],
           [-1.8817,    -inf,    -inf,  ...,    -inf,    -inf,    -inf],
           ...,
           [-3.6626,    -inf,    -inf,  ...,    -inf,    -inf,    -inf],
           [-3.5564,    -inf,    -inf,  ...,    -inf,    -inf,    -inf],
           [-2.7744,    -inf,    -inf,  ...,    -inf,    -inf,    -inf]]],
         device='cuda:0'),
  'pred_boxes': tensor([[[0.2758, 0.5381, 0.1375, 0.0372],
           [0.3659, 0.4159, 0.1684, 0.0379],
           [0.2830, 0.6452, 0.1699, 0.0460],
           ...,
           [0.2157, 0.1857, 0.0718, 0.0278],
           [0.4865, 0.7780, 0.1891, 0.0452],
           [0.8285, 0.4096, 0.1110, 0.0324]]], device='cuda:0'),
  'text_mask': tensor([[ True, False, False, False, False, False, False, False, False, False,
           False, False, False, False, False, False, False, False, False

In [8]:
# Shape of the `output` tensor unpacked from chat(). In the recorded run the
# last dimension (32000) equals the tokenizer vocabulary size printed earlier.
output.shape # torch.Size([1, 319, 32000])
# output = output.squeeze(0)
# output.shape # torch.Size([319, 32000])

torch.Size([1, 319, 32000])

In [10]:
# print(output.shape) # torch.Size([319, 32000])
# Drop the leading size-1 dimension and convert the tensor to nested Python
# lists, only to check the two remaining lengths. The values are per-position
# scores over the vocabulary (later cells apply argmax/softmax to them), not
# integer token ids.
out = output.squeeze(0).tolist()
print(len(out))
print(len(out[0]))

319
32000


In [11]:
print(output.shape) # torch.Size([1, 319, 32000])
# Rebinds `tokenizer` (previously created via llama2_tokenizer) to a plain
# Hugging Face LlamaTokenizer loaded from the same model id that
# args.local_tokenizer points to.
from transformers import LlamaTokenizer
tokenizer = LlamaTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")
# Bare expression: show the tokenizer repr as the cell output.
tokenizer

LlamaTokenizer(name_or_path='lmsys/vicuna-7b-v1.5', vocab_size=32000, model_max_length=4096, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [21]:
# Pick the highest-scoring vocabulary id at every position of `output`.
# NOTE(review): `output` appears to hold one score vector per sequence
# position, so this decodes a per-position prediction rather than a generated
# answer -- confirm against what chat() actually returns.
token_ids = torch.argmax(output, dim=-1)
print(token_ids.shape)
# Drop only the leading batch dimension. A bare squeeze() would also collapse
# a length-1 sequence into a scalar, breaking len() and decode() below.
token_ids = token_ids.squeeze(0).tolist()
print(len(token_ids))
text = tokenizer.decode(token_ids)
print(text)

torch.Size([1, 319])
319
nobody sierp letter letter Q hren   includes of of Q search interface filter. interface home.. search....hren.<s>. include job. include. Comp Q. H............E job. H include. jobhren. includes include includeshren Bo Bo data Pl include. search include type. type... include.. Bo.ed... include include Bo. type job include job. includes Bo job no job job job job type job De include job. type.. blue plays Eng Bo software Software Is Is W we job W W job NR Job Bo. include Bo job De De job search includes software software Bo Bo Bo Bo Bo job App Bo De Bo pop Bo W NR NR NR NR Is Is Is Is Eng Anal green Bo button software Is Is Is Is Is Is Is software Is Is. A Is inv In Bo Bo Is Is Is Is Is Is Is Is Is and no pop inv Bo A Me I Is Is Bo Is I Is Is interface. no Is Inv Is include software Bo I Bo Bo I Bo Is.. Comped job Q Is bo Bo I Is Is Is I Is Is Bo buttonshren In A A No A A Is Is example Is include A. Is buttons search8</s>: We the a information of people5 people pe

In [22]:
import torch.nn.functional as F
# Softmax is monotonic along the reduced dimension, so the argmax over the
# probabilities normally selects the same ids as an argmax over the raw
# scores; this cell serves as a cross-check of that.
probabilities = F.softmax(output, dim=-1)
token_ids = torch.argmax(probabilities, dim=-1)
print(token_ids.shape)
# Drop only the leading batch dimension. A bare squeeze() would also collapse
# a length-1 sequence into a scalar, breaking len() and decode() below.
token_ids = token_ids.squeeze(0).tolist()
print(len(token_ids))
text = tokenizer.decode(token_ids)
print(text)

torch.Size([1, 319])
319
nobody sierp letter letter Q hren   includes of of Q search interface filter. interface home.. search....hren.<s>. include job. include. Comp Q. H............E job. H include. jobhren. includes include includeshren Bo Bo data Pl include. search include type. type... include.. Bo.ed... include include Bo. type job include job. includes Bo job no job job job job type job De include job. type.. blue plays Eng Bo software Software Is Is W we job W W job NR Job Bo. include Bo job De De job search includes software software Bo Bo Bo Bo Bo job App Bo De Bo pop Bo W NR NR NR NR Is Is Is Is Eng Anal green Bo button software Is Is Is Is Is Is Is software Is Is. A Is inv In Bo Bo Is Is Is Is Is Is Is Is Is and no pop inv Bo A Me I Is Is Bo Is I Is Is interface. no Is Inv Is include software Bo I Bo Bo I Bo Is.. Comped job Q Is bo Bo I Is Is Is I Is Is Bo buttonshren In A A No A A Is Is example Is include A. Is buttons search8</s>: We the a information of people5 people pe