# Data
    
## prepare datasource

获取数据集：可以从 [huggingface](https://huggingface.co/datasets/phiyodr/coco2017) 或者 [modelScope](https://www.modelscope.cn/datasets/zacbi2023/coco2017_caption/summary) 下载 coco2017 数据集，二选一即可。

### huggingface

In [None]:
# Scratch cell: area for temporarily running ad-hoc code
# torch.cuda.empty_cache()

# llama_model.to('cuda')

In [None]:
from datasets import load_dataset

# Load the COCO 2017 dataset from the Hugging Face Hub (no split specified).
hf_dataset = load_dataset("phiyodr/coco2017")

# Alternatively, load only the train or only the validation split.
hf_ds_train = load_dataset("phiyodr/coco2017", split="train")
hf_ds_validation = load_dataset("phiyodr/coco2017", split="validation")

### modelscope

modelscope 的优点是下载速度非常快（服务器在国内）；缺点是某些数据集不全，即使调用官方 API 并指定源为 huggingface，也可能报错。
这里的解决方案是先把 huggingface 的 repo 下载下来，再上传到 modelscope；对于 parquet 类型的数据文件，这种方式非常友好。

In [None]:
from modelscope.msdatasets import MsDataset

# Load the train and validation splits of the COCO 2017 caption dataset
# mirrored on ModelScope (subset 'default').
ms_ds_train = MsDataset.load('zacbi2023/coco2017_caption', subset_name='default', split='train')
ms_ds_validation = MsDataset.load('zacbi2023/coco2017_caption', subset_name='default', split='validation')

## Remap data

目前我们只需要生成 horse、dog、cat 三类图像，因此先对 caption 做简单分词，再据此筛选并建立新的数据集。

In [None]:
import pyarrow as pa
from modelscope import AutoTokenizer

# BERT's tokenizer is comparatively plain: it does not manipulate tokens much
# (for example by adding a '_' prefix or suffix), so tokens can be compared
# against plain words.
bert_tokenizer = AutoTokenizer.from_pretrained("sdfdsfe/bert-base-uncased")

# Columns carried over from the source dataset.
select_ds_cols = ['image_id', 'captions']
# Categories a caption has to mention for its row to be kept.
categories = {'horse', 'dog', 'cat'}
def build_new_ds(dataset):
    """
    Build a reduced dataset of captions that mention a category of interest.

    Keeps the columns listed in ``select_ds_cols``, replaces the ``captions``
    list of each row with a single ``caption`` column holding its longest
    caption, adds a ``tags`` column naming the entries of ``categories`` that
    occur among the caption's ``bert_tokenizer`` tokens, and drops every row
    whose ``tags`` list is empty.

    Args:
        dataset: Source dataset providing ``select_columns`` and a
            ``captions`` column (assumed to hold a list of strings per row,
            as in a Hugging Face ``datasets.Dataset`` -- TODO confirm for
            other dataset wrappers).

    Returns:
        The filtered dataset with the remaining selected columns plus the
        new ``caption`` and ``tags`` columns.
    """
    def add_tags(caption):
        """Return the sorted categories that appear as tokens of *caption*."""
        caption_tokens = set(bert_tokenizer.tokenize(caption))
        # Sorted so the tag order is reproducible: iterating the `categories`
        # set directly yields an order that may differ between interpreter runs.
        return sorted(category for category in categories if category in caption_tokens)

    new_dataset = dataset.select_columns(select_ds_cols)

    # Longest caption per row; on ties the first one wins.
    longest_captions = [max(captions, key=len) for captions in dataset['captions']]
    new_dataset = new_dataset.add_column(name='caption', column=longest_captions)
    # The Llama output is not stored for now.
    # new_dataset = new_dataset.add_column(name = 'last_hidden_state', column = [torch.zeros(1).numpy().tolist() for i in range(len(new_dataset))])
    new_dataset = new_dataset.remove_columns(['captions'])

    tags_column = [add_tags(caption) for caption in new_dataset['caption']]
    new_dataset = new_dataset.add_column(name='tags', column=tags_column)
    # Keep only rows that mention at least one category.
    new_dataset = new_dataset.filter(lambda row: len(row['tags']) > 0)

    return new_dataset

# Build the filtered training set from the ModelScope train split.
ds_train = build_new_ds(ms_ds_train)
# Cell output: the resulting schema and the first row.
ds_train.features, ds_train[0]

# Infer

## 加载 model 和 tokenizer

In [3]:
import os
import torch

from modelscope.models import Model
from modelscope.pipelines import pipeline
from modelscope.preprocessors import Preprocessor
from modelscope.utils.constant import Tasks
from modelscope.hub.snapshot_download import snapshot_download

# Models are pulled to a local directory
base_model_pth = r'./model'
# ModelScope repository ids of the Llama and Stable Diffusion models.
llama_id = r'modelscope/Llama-2-7b-ms'
sd_id = r'AI-ModelScope/stable-diffusion-v2-1'
# Local paths where each snapshot is expected: <base_model_pth>/<repo id>.
llama_path = os.path.join(base_model_pth, llama_id)
sd_path = os.path.join(base_model_pth, sd_id)

In [None]:
# Download each model snapshot only when its local directory does not exist yet.
if not os.path.exists(llama_path):
    snapshot_download(llama_id, cache_dir = base_model_pth)
if not os.path.exists(sd_path):
    snapshot_download(sd_id, cache_dir = base_model_pth) 

In [None]:
# accelerate infer
# NOTE(review): 'gpu' is not a torch device string (torch uses 'cuda');
# presumably meant for ModelScope pipelines. `device` is not used in the
# visible cells -- confirm before relying on it.
device = 'gpu' if torch.cuda.is_available() else 'cpu'
device_map = 'auto'
# Half precision when CUDA is available, full precision otherwise.
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
# Keyword arguments shared by the from_pretrained calls in later cells.
from_pretrained_dict = {'device_map': device_map, 'torch_dtype' : torch_dtype, 'revision': 'v1.0.1'}

def prompt_tensors_to_cuda(token_tensors, device=None):
    """
    Move every tensor of a tokenizer output onto the inference device.

    Args:
        token_tensors: Mutable mapping of name -> tensor (it must support
            ``items()`` and item assignment); it is updated in place.
        device: Target device passed to ``Tensor.to``. When ``None`` (the
            default), ``'cuda'`` is used if CUDA is available and ``'cpu'``
            otherwise, so the helper also works on CPU-only hosts.

    Returns:
        The same ``token_tensors`` object, with each value moved to the
        target device.
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Only existing keys are reassigned, so mutating while iterating is safe.
    for name, tensor in token_tensors.items():
        token_tensors[name] = tensor.to(device)
    return token_tensors

In [None]:
from transformers import LlamaModel

# llama
# Load the Llama model from the local snapshot with the shared loading kwargs
# (device_map, torch_dtype, revision).
llama_model = LlamaModel.from_pretrained(llama_path, **from_pretrained_dict)
# Matching tokenizer, loaded from the same local snapshot directory.
llama_tokenizer = AutoTokenizer.from_pretrained(llama_path)

In [None]:
# sd
# stable_diffusion_model = Model.from_pretrained(sd_path, **from_pretrained_dict)

# Encode the first caption of the filtered training set with Llama.
# Index the row first so only one example is read, not the whole column.
prompt = ds_train[0]['caption']
# Inference only: disable autograd so no graph is kept alive for the
# forward pass (matches the later generation cell).
with torch.no_grad():
    token_tensors = llama_tokenizer(prompt, return_tensors='pt')
    llama_outputs = llama_model(**prompt_tensors_to_cuda(token_tensors))

In [None]:
# Cell output: shape of Llama's last hidden state for the prompt.
llama_outputs.last_hidden_state.shape

In [None]:
# sd pipeline
from diffusers import AutoPipelineForText2Image

# Load the Stable Diffusion text-to-image pipeline from the local snapshot.
# NOTE(review): this reuses the kwargs built for Llama (device_map='auto',
# revision='v1.0.1'); confirm the installed diffusers version accepts this
# device_map value for pipelines.
sd_pipeline = AutoPipelineForText2Image.from_pretrained(sd_path, **from_pretrained_dict)

In [None]:
# Generate an image from the Llama encoding of the first caption.
# Index the row first so only one example is read, not the whole column.
prompt = ds_train[0]['caption']
with torch.no_grad():
    token_tensors = llama_tokenizer(prompt, return_tensors='pt')
    llama_outputs = llama_model(**prompt_tensors_to_cuda(token_tensors))
    # Use the result computed just above (the name `outputs` was never defined).
    prompt_embeds = llama_outputs.last_hidden_state
    # NOTE(review): the pipeline presumably expects embeddings with the width
    # of its own text encoder; verify that Llama's hidden size is compatible
    # (a projection layer may be needed) -- TODO confirm.
    sd_outputs = sd_pipeline(prompt_embeds=prompt_embeds)

# The pipeline output exposes the generated images as the list `.images`.
# Kept as the last top-level expression so the notebook renders the image.
sd_outputs.images[0]

# 语义分割

目前sota的还是EVA模型：

In [4]:
from modelscope.models import Model
from huggingface_hub import hf_hub_download
from modelscope import snapshot_download
from transformers import AutoModel
import torch

# Path of the EVA-02 segmentation checkpoint relative to the local model directory.
# NOTE(review): `eva02_path` is not used in the visible cells, and this path
# differs from the checkpoint path used when loading the model below
# ('.../seg/checkpoint/...') -- confirm which one is correct.
eva02_model_id = r'zacbi2023/eva02/eval02/seg/eva02_L_ade_seg_upernet_sz640.pth'
eva02_path = os.path.join(base_model_pth, eva02_model_id)
# if not os.path.exists(llama_path):
# Fetch the 'zacbi2023/eva02' repository into base_model_pth (runs unconditionally).
snapshot_download('zacbi2023/eva02', cache_dir = base_model_pth)

Downloading: 100%|██████████| 8.00k/8.00k [00:00<00:00, 1.38MB/s]
Downloading: 100%|██████████| 1.75k/1.75k [00:00<00:00, 432kB/s]
Downloading: 100%|██████████| 1.75k/1.75k [00:00<00:00, 370kB/s]
Downloading: 100%|██████████| 1.86k/1.86k [00:00<00:00, 498kB/s]
Downloading: 100%|██████████| 321/321 [00:00<00:00, 77.2kB/s]
Downloading: 100%|█████████▉| 2.22G/2.22G [00:15<00:00, 152MB/s] 
Downloading: 100%|█████████▉| 1.35G/1.35G [00:09<00:00, 152MB/s]
Downloading: 100%|██████████| 415/415 [00:00<00:00, 110kB/s]
Downloading: 100%|██████████| 416/416 [00:00<00:00, 113kB/s]
Downloading: 100%|██████████| 1.82k/1.82k [00:00<00:00, 513kB/s]
Downloading: 100%|██████████| 2.58k/2.58k [00:00<00:00, 730kB/s]
Downloading: 100%|██████████| 2.57k/2.57k [00:00<00:00, 716kB/s]
Downloading: 100%|██████████| 2.58k/2.58k [00:00<00:00, 716kB/s]
Downloading: 100%|██████████| 2.60k/2.60k [00:00<00:00, 731kB/s]


'./model/zacbi2023/eva02'

In [None]:
from detectron2.config import LazyConfig
from detectron2.config import instantiate
from detectron2.checkpoint import DetectionCheckpointer

# refer: https://github.com/baaivision/EVA/issues/11
# Config file and checkpoint inside the downloaded EVA-02 repository.
eval02_config_path = r'model/zacbi2023/eva02/eval02/seg/configs/eva02/upernet/upernetpro_eva02_large_24_640_slide_80k.py'
eval02_pth_path = r'model/zacbi2023/eva02/eval02/seg/checkpoint/eva02_L_ade_seg_upernet_sz640.pth'

# Load the lazy config and build the model object it describes.
cfg = LazyConfig.load(eval02_config_path)
model = instantiate(cfg.model)

# Load the checkpoint weights into the model, then switch it to eval mode.
DetectionCheckpointer(model).load(eval02_pth_path)
model.eval()