# FastModel

In [None]:
# tmp code area (scratch cell)
# snapshot_download(sd_id, cache_dir = ckpt_base_pth, revision='v1.0.0')
# `ds_train` is only assigned further down, in the "Remap data" section. Guard the peek so
# that a fresh kernel running this cell first shows nothing instead of raising NameError.
ds_train[1] if 'ds_train' in globals() else None

## Import

In [None]:
import os
from os import path

import random

import pyarrow as pa
import torch
from datasets import load_dataset
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import LazyConfig, instantiate
from diffusers import AutoPipelineForText2Image
from huggingface_hub import hf_hub_download
from modelscope import AutoTokenizer
from modelscope.hub.snapshot_download import snapshot_download
from modelscope.models import Model
from modelscope.msdatasets import MsDataset
from modelscope.pipelines import pipeline
from modelscope.preprocessors import Preprocessor
from modelscope.utils.constant import Tasks
from transformers import AutoModel, LlamaModel

## prepare datasource

Get the COCO 2017 caption dataset from either [Hugging Face](https://huggingface.co/datasets/phiyodr/coco2017) or [ModelScope](https://www.modelscope.cn/datasets/zacbi2023/coco2017_caption/summary).

### huggingface

In [None]:
# Dataset repository id, shared by every load below.
hf_repo_id = "phiyodr/coco2017"

# Loaded without a `split` argument.
hf_dataset = load_dataset(hf_repo_id)

# Individual splits, loaded by name.
hf_ds_train = load_dataset(hf_repo_id, split="train")
hf_ds_validation = load_dataset(hf_repo_id, split="validation")

### modelscope

The advantage of ModelScope is that downloads are very fast, since it is hosted inside mainland China (behind the Great Firewall). The disadvantage is that some datasets are incomplete: even if you call the official API and specify Hugging Face as the source, it may still raise an error.
The workaround used here is to download the Hugging Face repo and then upload it to ModelScope, which works well for Parquet data files.

In [None]:
# ModelScope dataset id, shared by both splits.
ms_dataset_id = 'zacbi2023/coco2017_caption'

# Load the train split first, then the validation split, from the 'default' subset.
ms_ds_train, ms_ds_validation = (
    MsDataset.load(ms_dataset_id, subset_name='default', split=split_name)
    for split_name in ('train', 'validation')
)

In [None]:

# Workspace root (a filesystem path, despite the name); checkpoints live under <base_url>/model.
base_url = r'/mnt/workspace'
ckpt_base_pth = os.path.join(base_url, 'model')

# Model ids passed to snapshot_download.
llama_id = r'modelscope/Llama-2-7b-ms'
sd_id = r'AI-ModelScope/stable-diffusion-2-1'

# Local directory of each model: <ckpt_base_pth>/<model id>.
llama_path, sd_path = (
    os.path.join(ckpt_base_pth, model_id) for model_id in (llama_id, sd_id)
)

## Remap data

We only need to generate images of horses, dogs, and cats, so we tag each caption using simple tokenization and then build a new dataset that keeps only the captions mentioning one of these categories.

In [None]:
# Download each checkpoint only when its local directory is missing.
for model_id, local_dir in ((llama_id, llama_path), (sd_id, sd_path)):
    if os.path.exists(local_dir):
        continue
    snapshot_download(model_id, cache_dir = ckpt_base_pth)

In [None]:
# Tokenizer used by build_new_ds() to split captions into tokens for category tagging.
# NOTE(review): the "sdfdsfe" namespace looks like a placeholder — confirm this model id resolves.
bert_tokenizer = AutoTokenizer.from_pretrained("sdfdsfe/bert-base-uncased")

In [None]:
# Categories to keep: a caption is tagged with every one of these found among its tokens.
categories = {'horse', 'dog', 'cat'}
# Columns carried over from the source dataset before remapping.
select_ds_cols = ['image_id', 'captions']
def build_new_ds(dataset):
    """Build a caption dataset restricted to the categories of interest.

    For every row, the longest entry of the ``captions`` column is kept as
    ``caption``. That caption is tokenized with the module-level
    ``bert_tokenizer``, and every entry of the module-level ``categories`` that
    occurs among the tokens is recorded in ``tags``. Rows with no tag are dropped.

    Args:
        dataset: Object exposing ``select_columns``, ``add_column``,
            ``remove_columns``, ``filter`` and column access by name. It must
            contain the columns listed in ``select_ds_cols``, including
            ``captions`` (a list of strings per row).

    Returns:
        A new dataset holding the columns of ``select_ds_cols`` without
        ``captions``, plus ``caption`` and ``tags``, limited to rows with at
        least one tag.
    """
    # Longest caption per row; on ties `max` keeps the first one. A row with no
    # captions yields '' and is removed by the tag filter below.
    longest_captions = [
        max(captions, key=len, default='') for captions in dataset['captions']
    ]

    new_dataset = dataset.select_columns(select_ds_cols)
    new_dataset = new_dataset.add_column(name='caption', column=longest_captions)
    new_dataset = new_dataset.remove_columns(['captions'])

    def add_tags(caption):
        """Return the categories that occur as tokens of ``caption``, sorted."""
        caption_tokens = set(bert_tokenizer.tokenize(caption))
        # Sorted so the tag order does not depend on set iteration order.
        return [category for category in sorted(categories) if category in caption_tokens]

    new_dataset = new_dataset.add_column(
        name='tags',
        column=[add_tags(caption) for caption in new_dataset['caption']],
    )
    new_dataset = new_dataset.filter(lambda row: len(row['tags']) > 0)

    return new_dataset

# Remap the ModelScope training split, then show its schema together with its first row.
ds_train = build_new_ds(dataset=ms_ds_train)
(ds_train.features, ds_train[0])

## Accelerate inference

In [None]:
# torch names its GPU device 'cuda'; 'gpu' is not a device string it accepts.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# device_map = '0'
# Half precision only when a GPU is available.
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
# from_pretrained_dict = {'device_map': device_map, 'torch_dtype' : torch_dtype, 'revision': 'v1.0.1'}
# Keyword arguments shared by the `from_pretrained` calls below.
from_pretrained_dict = {'torch_dtype': torch_dtype, 'variant': 'fp16'}

def prompt_tensors_to_cuda(token_tensors, target_device=None):
    """Move every tensor of a tokenizer output onto the compute device.

    Args:
        token_tensors: Mutable mapping of name -> tensor. It is updated in place.
        target_device: Device to move the tensors to. Defaults to the
            module-level ``device``, i.e. 'cuda' when available and 'cpu'
            otherwise, so the helper also works on CPU-only hosts.

    Returns:
        The same ``token_tensors`` object, with its values moved.
    """
    if target_device is None:
        target_device = device
    # Snapshot the items so the mapping can be reassigned while looping.
    for name, tensor in list(token_tensors.items()):
        token_tensors[name] = tensor.to(target_device)
    return token_tensors

# LLama

## Load model and tokenizer

In [None]:
# llama
# NOTE(review): later cells call `llama_model`, but the only definition visible in this
# notebook is the commented-out line below — re-enable it (or load the model elsewhere)
# before running those cells, otherwise they raise NameError on a fresh kernel.
# llama_model = LlamaModel.from_pretrained(llama_path, **from_pretrained_dict)
# Tokenizer loaded from the local Llama checkpoint directory.
llama_tokenizer = AutoTokenizer.from_pretrained(llama_path)

In [None]:
# sd
# stable_diffusion_model = Model.from_pretrained(sd_path, **from_pretrained_dict)

# Encode the first remapped caption with Llama.
# Requires `llama_model` to be loaded; its loading line is commented out in the cell above.
prompt = ds_train['caption'][0]
token_tensors = llama_tokenizer(prompt, return_tensors='pt')
# Inference only: skip autograd bookkeeping, as the later Llama -> SD cell does.
with torch.no_grad():
    llama_outputs = llama_model(**prompt_tensors_to_cuda(token_tensors))

In [None]:
# Inspect the shape of the hidden states returned by the Llama forward pass.
llama_outputs.last_hidden_state.shape

In [None]:
# sd pipeline
# Build the text-to-image pipeline from the local checkpoint, then move it to the GPU.
sd_pipeline = (
    AutoPipelineForText2Image
    .from_pretrained(sd_path, **from_pretrained_dict)
    .to('cuda')
)

In [None]:
# Feed Llama hidden states to Stable Diffusion as precomputed prompt embeddings.
prompt = ds_train['caption'][0]
llama_tokenizer = AutoTokenizer.from_pretrained(llama_path)
with torch.no_grad():
    token_tensors = llama_tokenizer(prompt, return_tensors='pt')
    llama_outputs = llama_model(**prompt_tensors_to_cuda(token_tensors))
    # Use the result of the call above; `outputs` was never defined.
    # NOTE(review): the embedding width/dtype must match what the SD pipeline expects
    # from its own text encoder — confirm, or project the hidden states first.
    prompt_embeds = llama_outputs.last_hidden_state
    sd_outputs = sd_pipeline(prompt_embeds=prompt_embeds)
# The pipeline output exposes `.images` (plural), as used in the Stable diffusion section.
# Kept as the cell's last top-level expression so the notebook renders the image.
sd_outputs.images[0]

# Stable diffusion

In [None]:
# prompt = ds_train['caption'][0]
# Plain-text prompt passed straight to the pipeline (no precomputed embeddings here).
prompt = 'A girl smiles as she holds a cat and wears a brightly colored skirt.'
sd_output = sd_pipeline(prompt)
# Keep the first entry of the pipeline output's `images`.
images = sd_output.images
image = images[0]
# Last expression of the cell, so the notebook displays the image.
image

In [None]:
# Output directory under the workspace root (resolves to /mnt/workspace/data/image/sd/output).
output_dir = os.path.join(base_url, 'data', 'image', 'sd', 'output')
# Create it on first use so `save` does not fail on a missing directory.
os.makedirs(output_dir, exist_ok=True)

# Random suffix for the file name. It is not guaranteed unique: a repeated id
# overwrites the earlier file.
image_id = random.randint(1, 100000)
image.save(os.path.join(output_dir, 'eva_sd_{}.jpg'.format(image_id)))