[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/camenduru/trainer/blob/main/captioner.ipynb)

In [None]:
# Make /content the working directory for the rest of the notebook.
%cd /content

# Install the third-party packages used below (-q keeps pip's output quiet).
!pip install -q tiktoken transformers_stream_generator gradio optimum auto-gptq huggingface_hub
!pip install -q modelscope -f https://pypi.org/project/modelscope
# Disabled alternative: fetch app.py and run it directly instead of the inline
# code in this notebook (presumably the stand-alone demo app -- confirm before enabling).
# !wget https://raw.githubusercontent.com/camenduru/Qwen-VL-Chat-colab/main/app.py -O /content/app.py
# !python app.py --share

import os
from argparse import ArgumentParser
from pathlib import Path
import copy
import os
import re
import secrets
import tempfile
from modelscope import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from huggingface_hub import snapshot_download

# Download the '4bit/Qwen-VL-Chat-Int4' repository; the returned location is
# what every loader below reads from.
model_dir = snapshot_download('4bit/Qwen-VL-Chat-Int4')

# trust_remote_code=True lets the loaders execute the custom code shipped
# with the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_dir,
    trust_remote_code=True,
    resume_download=True,
)

# device_map="auto" leaves device placement to the library; .eval() switches
# the model to inference mode.
model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    device_map="auto",
    trust_remote_code=True,
    resume_download=True,
).eval()

# Use the generation settings stored alongside the checkpoint.
model.generation_config = GenerationConfig.from_pretrained(
    model_dir,
    trust_remote_code=True,
    resume_download=True,
)

def _parse_text(text):
    lines = text.split("\n")
    lines = [line for line in lines if line != ""]
    count = 0
    for i, line in enumerate(lines):
        if "```" in line:
            count += 1
            items = line.split("`")
            if count % 2 == 1:
                lines[i] = f'<pre><code class="language-{items[-1]}">'
            else:
                lines[i] = f"<br></code></pre>"
        else:
            if i > 0:
                if count % 2 == 1:
                    line = line.replace("`", r"\`")
                    line = line.replace("<", "&lt;")
                    line = line.replace(">", "&gt;")
                    line = line.replace(" ", "&nbsp;")
                    line = line.replace("*", "&ast;")
                    line = line.replace("_", "&lowbar;")
                    line = line.replace("-", "&#45;")
                    line = line.replace(".", "&#46;")
                    line = line.replace("!", "&#33;")
                    line = line.replace("(", "&#40;")
                    line = line.replace(")", "&#41;")
                    line = line.replace("$", "&#36;")
                lines[i] = "<br>" + line
    text = "".join(lines)
    return text

# Directory under which annotated images produced by `predict` are saved.
uploaded_file_dir = "/content/image"

# Matches a `<box>...</box>` span in a model response so it can be stripped
# from the text shown to the user.
BOX_TAG_PATTERN = r"<box>([\s\S]*?)</box>"


def predict(_chatbot, task_history):
    """Run the model on the latest query and record the answer.

    Consecutive image entries in ``task_history`` (entries whose query is a
    tuple or list) are folded into the text query that follows them as
    ``Picture N: <img>path</img>`` lines. The last folded query is sent to
    ``model.chat`` with the earlier ones as history.

    Both arguments are modified in place:

    * ``_chatbot[-1]`` is replaced with the formatted query paired with
      either the response text or, when
      ``tokenizer.draw_bbox_on_latest_picture`` returns an image, a
      one-element tuple holding the path of the saved image. In the latter
      case the response text with ``<ref>``/``<box>`` markup removed is
      appended as an extra entry if it is not empty.
    * ``task_history[-1]`` is replaced with the raw query paired with the
      ``_parse_text``-formatted response.

    Args:
        _chatbot: Display history; a list of ``(query, answer)`` pairs whose
            last entry holds the query being answered.
        task_history: Model-side history; a list of ``(query, answer)``
            pairs where an image is represented by a tuple/list whose first
            item is its path. Not trimmed by this function.

    Returns:
        The same ``_chatbot`` list that was passed in.

    Raises:
        IndexError: If either list is empty, or if ``task_history`` contains
            no text query.
    """
    chat_query = _chatbot[-1][0]
    query = task_history[-1][0]
    # Work on a copy so building the prompt never touches the caller's items.
    history_cp = copy.deepcopy(task_history)

    history_filter = []
    pic_idx = 1
    pre = ""
    for q, a in history_cp:
        if isinstance(q, (tuple, list)):
            # Image entry: queue it up as a prefix of the next text query.
            q = f'Picture {pic_idx}: <img>{q[0]}</img>'
            pre += q + '\n'
            pic_idx += 1
        else:
            pre += q
            history_filter.append((pre, a))
            pre = ""
    history, message = history_filter[:-1], history_filter[-1][0]
    response, history = model.chat(tokenizer, message, history=history)
    image = tokenizer.draw_bbox_on_latest_picture(response, history)
    if image is not None:
        # Save the annotated image under a random directory/name so that
        # results from different calls cannot collide.
        temp_dir = Path(uploaded_file_dir) / secrets.token_hex(20)
        temp_dir.mkdir(exist_ok=True, parents=True)
        filename = temp_dir / f"tmp{secrets.token_hex(5)}.jpg"
        image.save(str(filename))
        _chatbot[-1] = (_parse_text(chat_query), (str(filename),))
        # Show the remaining text without the reference/box markup.
        chat_response = response.replace("<ref>", "")
        chat_response = chat_response.replace(r"</ref>", "")
        chat_response = re.sub(BOX_TAG_PATTERN, "", chat_response)
        if chat_response != "":
            _chatbot.append((None, chat_response))
    else:
        _chatbot[-1] = (_parse_text(chat_query), response)
    full_response = _parse_text(response)

    task_history[-1] = (query, full_response)
    return _chatbot

In [None]:
!huggingface-cli login --token
!mkdir /content/images
!wget https://huggingface.co/camenduru/polaroid/resolve/main/polaroid.zip
!unzip style_name_fix.zip -d /content/images

In [None]:
import os
from datasets import Dataset, Image

# Caption the first 10 entries (in sorted name order) of /content/images and
# publish the resulting image/text pairs as a dataset.
file_names = os.listdir('/content/images')
sorted_file_names = sorted(file_names)
image = []
text = []
prompt = 'Describe the image and color details.'
for file_name in sorted_file_names[:10]:
    image_path = f'/content/images/{file_name}'
    # predict() rewrites the last entry of both lists, so each gets its own
    # list: the caption is then always read from the model-side history
    # rather than from whatever the display list happens to contain.
    task_history = [[(image_path,), None], [prompt, None]]
    chatbot = [[prompt, None]]
    try:
        predict(chatbot, task_history)
    except Exception as e:
        # Best effort: report the failure and move on to the next file.
        print(f"Error processing {file_name}: {str(e)}")
        continue
    # NOTE(review): this is the _parse_text-formatted response, so multi-line
    # answers contain "<br>" markup -- confirm that is wanted in captions.
    caption = task_history[-1][1]
    print(image_path, '||', caption)
    image.append(image_path)
    text.append(caption)

ds = Dataset.from_dict({"image": image, "text": text})
# Turn the stored paths into an image feature.
ds = ds.cast_column("image", Image())
print(len(image), len(text))
ds.push_to_hub("camenduru/test-polaroid")