## 计算压缩率及统计

In [1]:
from modelzipper.tutils import *
from tqdm import trange


# Inputs for the compression-rate statistics computed in this cell.
# CodeLLaMA_PATH: checkpoint directory passed to AutoTokenizer.from_pretrained below.
CodeLLaMA_PATH = "/zecheng2/model_hub/CodeLlama-7b-hf"
# FILE_PATH: JSONL file; its records are read later via the 'p_svg_str' / 'g_svg_str' keys.
FILE_PATH = "/zecheng2/vqllama/test_vqllama_quantizer/test_1/visualized_compress_level_1/svg_paths.jsonl"
# COMPRESSED_PATH: pickle file; the 'p_predict', 'golden' and 'zs' entries are read from it.
COMPRESSED_PATH = "/zecheng2/vqllama/test_vqllama_quantizer/test_1/compress_level_1_predictions.pkl"

llama_tokenizer = AutoTokenizer.from_pretrained(CodeLLaMA_PATH)
# NOTE(review): `str_cotent` looks like a typo for `str_content`; the name is kept
# because later statements of this cell read it under this exact spelling.
str_cotent = auto_read_data(FILE_PATH)
compress_content = auto_read_data(COMPRESSED_PATH)

# count compress svg tokens
# Predicted and ground-truth batches (presumably shaped [b, l, 9] — TODO confirm).
p_predict = compress_content['p_predict']
golden = compress_content['golden']

def count_non_pad_rows(x):
    """
    Count the leading rows of ``x`` that come before the first padding row.

    A row is treated as padding when its first entry equals 0 and none of the
    remaining entries is truthy. Counting stops at the first such row; rows
    after it are never inspected.

    x: iterable of rows, each exposing ``tolist()``
    return: number of rows seen before the first padding row
    """
    kept = 0
    for row in x:
        values = row.tolist()
        is_padding = values[0] == 0 and not any(values[1:])
        if is_padding:
            break
        kept += 1

    return kept


def count_svg_tokens(batch_x, tokens_per_row=9):
    """
    Average number of numeric SVG tokens per sample in a batch.

    Each sample contributes ``count_non_pad_rows(sample) * tokens_per_row``
    tokens, i.e. only the rows before the first padding row are counted.

    batch_x: [b, l, 9]
    tokens_per_row: number of tokens contributed by one non-padding row
        (defaults to 9, the row width stated above).
    return: average token count per sample as a float; 0.0 for an empty batch.
    """
    num_samples = len(batch_x)
    if num_samples == 0:
        # An empty batch has no meaningful average; avoid ZeroDivisionError.
        return 0.0

    total_tokens = 0
    for i in trange(num_samples):
        total_tokens += count_non_pad_rows(batch_x[i]) * tokens_per_row
    return total_tokens / num_samples

# Average numeric-token count per sample for the ground-truth and the predicted batches.
golden_svg_tokens = count_svg_tokens(golden)
p_predict_svg_tokens = count_svg_tokens(p_predict)

print("golden_svg_tokens: ", golden_svg_tokens)
print("p_predict_svg_tokens: ", p_predict_svg_tokens)


def count_str_tokens(batch_x, tokenizer: "AutoTokenizer"):
    """
    Average number of tokenizer tokens per string in a batch.

    batch_x: List[str]
    tokenizer: callable tokenizer; ``tokenizer(text)['input_ids']`` must be a
        sized sequence. It is called with default arguments, so whatever
        the tokenizer adds by default is included in the count.
    return: average token count per string as a float; 0.0 for an empty batch.
    """
    num_samples = len(batch_x)
    if num_samples == 0:
        # An empty batch has no meaningful average; avoid ZeroDivisionError.
        return 0.0

    total_tokens = 0
    for i in trange(num_samples):
        total_tokens += len(tokenizer(batch_x[i])['input_ids'])
    return total_tokens / num_samples


# Pull the predicted / ground-truth SVG strings out of every JSONL record.
p_svg_str = [item['p_svg_str'] for item in str_cotent]
g_svg_str = [item['g_svg_str'] for item in str_cotent]

# Average tokenizer-token count per SVG string.
p_svg_str_tokens = count_str_tokens(p_svg_str, llama_tokenizer)
g_svg_str_tokens = count_str_tokens(g_svg_str, llama_tokenizer)

print("p_str_tokens: ", p_svg_str_tokens)
print("golden_str_tokens: ", g_svg_str_tokens)


# Size of the last axis of 'zs', used as the per-sample codebook token count.
# NOTE(review): assumes every sample uses the full last axis (no padding) — TODO confirm.
compress_codebook_tokens = compress_content['zs'].shape[-1]
print("compress_codebook_tokens: ", compress_codebook_tokens)

# Compression ratios: ground-truth averages divided by the codebook token count.
print(f"压缩率 (codebook V.S. str): {g_svg_str_tokens / compress_codebook_tokens} 倍")
print(f"压缩率 (codebook V.S. numerical matrix): {golden_svg_tokens / compress_codebook_tokens} 倍")



  from .autonotebook import tqdm as notebook_tqdm


ModelZipper is ready for launch🚀 | Current Version🦄 >>> 0.2.6 <<< | AOE Time🕒 2024-01-05 15:37:11


100%|██████████| 2000/2000 [00:01<00:00, 1668.19it/s]
100%|██████████| 2000/2000 [00:01<00:00, 1270.35it/s]


golden_svg_tokens:  811.4895
p_predict_svg_tokens:  4608.0


100%|██████████| 2000/2000 [00:44<00:00, 45.07it/s]
100%|██████████| 2000/2000 [00:10<00:00, 183.02it/s]

p_str_tokens:  22720.927
golden_str_tokens:  7261.193
compress_codebook_tokens:  256
压缩率 (codebook V.S. str): 28.36403515625 倍
压缩率 (codebook V.S. numerical matrix): 3.169880859375 倍





## 打印输出结果

In [27]:
# Take the first 10 rows of sample 0 from each stored array for a quick visual check.
# NOTE(review): this rebinds the module-level names `golden` and `p_predict` to
# 10-row slices, and `raw` is not read by the visible code of this cell.
raw = compress_content.get('raw_predict')[0][: 10]
golden = compress_content.get('golden')[0][: 10]
p_predict = compress_content.get('p_predict')[0][: 10]


def q_p(x):
    """Print every row of ``x`` as a plain Python list, one row per output line.

    x: iterable of rows, each exposing ``tolist()``
    """
    row_lists = (row.tolist() for row in x)
    for row_values in row_lists:
        print(row_values)


# Print the rows currently bound to `p_predict`.
q_p(p_predict)

[0, 0, 0, 0, 0, 0, 0, 4, 105]
[1, 4, 105, 0, 0, 0, 0, 4, 200]
[1, 4, 200, 0, 0, 0, 0, 199, 199]
[1, 199, 199, 0, 0, 0, 0, 200, 4]
[1, 200, 4, 0, 0, 0, 0, 5, 4]
[1, 5, 4, 0, 0, 0, 0, 4, 101]
[1, 4, 101, 0, 0, 0, 0, 1, 103]
[0, 1, 103, 0, 0, 0, 0, 148, 89]
[1, 148, 89, 0, 0, 0, 0, 152, 161]
[1, 152, 161, 0, 0, 0, 0, 52, 170]


## 找到对应的SVG Path

In [5]:
from modelzipper.tutils import *
from tqdm import trange

# Directory that receives the per-sample text dumps written at the end of this cell.
ANALYSIS_DIR = "/zecheng2/vqllama/test_vqllama_quantizer/test_1/epoch_65/analysis"

# Inputs of the epoch_65 run.
CodeLLaMA_PATH = "/zecheng2/model_hub/CodeLlama-7b-hf"
FILE_PATH = "/zecheng2/vqllama/test_vqllama_quantizer/test_1/epoch_65/visualized_compress_level_1/svg_paths.jsonl"
COMPRESSED_PATH = "/zecheng2/vqllama/test_vqllama_quantizer/test_1/epoch_65/compress_level_1_predictions.pkl"

# NOTE(review): `llama_tokenizer` and `str_content` are not read by the visible
# code of this cell — confirm whether these loads are still needed here.
llama_tokenizer = AutoTokenizer.from_pretrained(CodeLLaMA_PATH)
str_content = auto_read_data(FILE_PATH)
compress_content = auto_read_data(COMPRESSED_PATH)

# Three stored arrays, each indexed by sample id below.
raw_predict = compress_content['raw_predict']
p_predict = compress_content['p_predict']
golden = compress_content['golden']

# Sample indices whose matrices are dumped for manual inspection.
FILE_ID = [36, 66, 72, 80]


def convert_tensor_to_str(x):
    """Render a 2-D integer matrix as text, one bracketed row per line.

    Every value is right-aligned in a 5-character field (``format(v, '5d')``),
    values are joined with commas, and each row ends with a newline.

    x: indexable sequence of rows, each exposing ``tolist()``
    return: the formatted text ("" when ``x`` has no rows)
    """
    lines = []
    for idx in range(len(x)):
        cells = [format(value, '5d') for value in x[idx].tolist()]
        lines.append("[" + ",".join(cells) + "]" + "\n")
    return "".join(lines)



# TEMPLATE = "raw predict:\n{raw_predict}\n\np predict:\n{p_predict}\n\ngolden:\n{golden}\n\n"

# Write the raw prediction, post-processed prediction and ground truth of every
# selected sample to its own text file under ANALYSIS_DIR.
for i in FILE_ID:
    # Render all three matrices before writing anything.
    s_raw_predict, s_p_predict, s_golden = (
        convert_tensor_to_str(source[i]) for source in (raw_predict, p_predict, golden)
    )
    targets = (
        (s_raw_predict, f"analysis_{i}_raw_predict.txt"),
        (s_p_predict, f"analysis_{i}_p_predict.txt"),
        (s_golden, f"analysis_{i}_golden.txt"),
    )
    for text, file_name in targets:
        auto_save_data([text], os.path.join(ANALYSIS_DIR, file_name))
    # s = TEMPLATE.format(raw_predict=s_raw_predict, p_predict=s_p_predict, golden=s_golden)
    # auto_save_data([s], os.path.join(ANALYSIS_DIR, f"analysis_{i}.txt"))

/zecheng2/vqllama/test_vqllama_quantizer/test_1/epoch_65/analysis not exist! --> Create data dir /zecheng2/vqllama/test_vqllama_quantizer/test_1/epoch_65/analysis
txt file saved successfully!
Save file to /zecheng2/vqllama/test_vqllama_quantizer/test_1/epoch_65/analysis/analysis_36_raw_predict.txt | len: 1
txt file saved successfully!
Save file to /zecheng2/vqllama/test_vqllama_quantizer/test_1/epoch_65/analysis/analysis_36_p_predict.txt | len: 1
txt file saved successfully!
Save file to /zecheng2/vqllama/test_vqllama_quantizer/test_1/epoch_65/analysis/analysis_36_golden.txt | len: 1
txt file saved successfully!
Save file to /zecheng2/vqllama/test_vqllama_quantizer/test_1/epoch_65/analysis/analysis_66_raw_predict.txt | len: 1
txt file saved successfully!
Save file to /zecheng2/vqllama/test_vqllama_quantizer/test_1/epoch_65/analysis/analysis_66_p_predict.txt | len: 1
txt file saved successfully!
Save file to /zecheng2/vqllama/test_vqllama_quantizer/test_1/epoch_65/analysis/analysis_66_g

## 统计数据集平均长度，最长长度和最短长度

In [3]:
from modelzipper.tutils import *
from tqdm import trange

FILE_PATH = "/zecheng2/svg/icon-shop/mesh_data_svg_convert_p.pkl"
BUCKET_WIDTH = 100  # width of each length bucket in the histogram below

content = auto_read_data(FILE_PATH)
if len(content) == 0:
    # Fail with a clear message instead of a ZeroDivisionError further down.
    raise ValueError(f"no samples found in {FILE_PATH}")

# Length of the 'mesh_data' entry of every sample.
lengths = [len(content[i]['mesh_data']) for i in trange(len(content))]

# Derive the extremes from the data itself rather than from a fixed sentinel,
# so the minimum is correct even when every sample is longer than 1000.
total_length = sum(lengths)
max_length = max(lengths)
min_length = min(lengths)
avg_length = total_length / len(lengths)

# Histogram keyed by the lower bound of each BUCKET_WIDTH-wide interval.
interval_counts = {}
for length in lengths:
    interval = (length // BUCKET_WIDTH) * BUCKET_WIDTH
    interval_counts[interval] = interval_counts.get(interval, 0) + 1

print(f"avg_length: {avg_length}")
print(f"max_length: {max_length}")
print(f"min_length: {min_length}")

# Print the number of samples in every interval, in ascending order.
for k in sorted(interval_counts):
    print(f"区间 {k} 到 {k + BUCKET_WIDTH - 1} 的条数: {interval_counts[k]}")

100%|██████████| 347000/347000 [00:00<00:00, 438812.38it/s]

avg_length: 98.70044380403458
max_length: 750
min_length: 7
区间 0 到 99 的条数: 228294
区间 100 到 199 的条数: 92352
区间 200 到 299 的条数: 17245
区间 300 到 399 的条数: 5117
区间 400 到 499 的条数: 2114
区间 500 到 599 的条数: 1054
区间 600 到 699 的条数: 608
区间 700 到 799 的条数: 216





## 检查测试数据

In [36]:
import random
import os
import transformers
import sys
sys.path.append("/workspace/zecheng/modelzipper/projects/custom_llama")
from dataclasses import dataclass, field
from transformers import Trainer
from modelzipper.tutils import *
from data.vqseq2seq_dataset import OfflineBasicDataset
from models.vqvae import VQVAE, postprocess
from data.svg_data import *
import pytorch_lightning as pl
from utils.visualize_svg import convert_svg
import transformers
from tqdm import trange
from PIL import Image

# NOTE(review): FILE_PATH holds the same value as DATA_PATH and is not read by
# the visible code of this cell — confirm whether it can be dropped.
FILE_PATH = "/zecheng2/svg/icon-shop/test_data_snaps/test_data_all_seq_with_mesh.pkl"

# YAML config for the VQVAE and the pickled test data that is decoded below.
VQVAE_CONFIG_PATH = "/workspace/zecheng/modelzipper/projects/custom_llama/configs/deepspeed/vqvae_config.yaml"
DATA_PATH = "/zecheng2/svg/icon-shop/test_data_snaps/test_data_all_seq_with_mesh.pkl"

# Tokenizer handed to the dataset and later used to decode 'text_input_ids'.
tokenizer = transformers.AutoTokenizer.from_pretrained("/zecheng2/model_hub/flan-t5-xl")

content = auto_read_data(DATA_PATH)
dataset = OfflineBasicDataset(content=content, tokenizer=tokenizer, mode='test')
vqvae_config = load_yaml_config(VQVAE_CONFIG_PATH)

# Keyword arguments for the VQVAE constructor, copied from the
# `vqvae_conv_block` section of the loaded config.
block_kwargs = dict(
        width=vqvae_config.vqvae_conv_block.width, 
        depth=vqvae_config.vqvae_conv_block.depth, 
        m_conv=vqvae_config.vqvae_conv_block.m_conv,
        dilation_growth_rate=vqvae_config.vqvae_conv_block.dilation_growth_rate,
        dilation_cycle=vqvae_config.vqvae_conv_block.dilation_cycle,
        reverse_decoder_dilation=vqvae_config.vqvae_conv_block.vqvae_reverse_decoder_dilation
    )

def add_background(image_obj=None, save_suffix="b", raw_image_size_w=None, raw_image_size_h=None):
    """Paste ``image_obj`` onto a new RGB canvas, anchored at the top-left corner.

    image_obj: image exposing ``size`` and accepted by ``Image.paste``.
    save_suffix: accepted for compatibility; not used by this function.
    raw_image_size_w: canvas width; falls back to ``image_obj.size[0]`` when None.
    raw_image_size_h: canvas height; falls back to ``image_obj.size[1]`` when None.
    return: the new canvas (created with Pillow's default fill) with the image pasted in.
    """
    canvas_w = image_obj.size[0] if raw_image_size_w is None else raw_image_size_w
    canvas_h = image_obj.size[1] if raw_image_size_h is None else raw_image_size_h

    canvas = Image.new('RGB', (canvas_w, canvas_h))
    canvas.paste(image_obj, (0, 0))
    return canvas

class PluginVQVAE(nn.Module):
    """Thin ``nn.Module`` container that exposes the wrapped network as ``self.model``.

    Registering the network as a submodule named ``model`` makes every
    state-dict key start with ``model.``.
    NOTE(review): presumably this mirrors the key layout of the checkpoint's
    'state_dict' — confirm against the training module.
    """

    def __init__(self, model):
        super(PluginVQVAE, self).__init__()
        self.model = model

# Build the VQVAE and wrap it so it is reachable as `plugin_vqvae.model`.
vqvae = VQVAE(vqvae_config, multipliers=None, **block_kwargs)
plugin_vqvae = PluginVQVAE(vqvae)
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
checkpoint = torch.load(vqvae_config.ckpt_path)  # load vqvae ckpt
plugin_vqvae.load_state_dict(checkpoint['state_dict'])
# Inference setup: eval mode, move to GPU, then cast the wrapped model to half precision.
plugin_vqvae.eval()
plugin_vqvae.cuda()
plugin_vqvae.model.half()

# Output locations for this evaluation run.
EVAL_DIR = "/zecheng2/evaluation/test_vq/version_8"
IMAGE_DIR = os.path.join(EVAL_DIR, "image")
# Make sure the image directory exists before the first PNG is written.
os.makedirs(IMAGE_DIR, exist_ok=True)

# For every test sample: decode its codes twice (with and without path
# interpolation), render prediction and ground truth, and record lengths,
# SVG strings and image paths in `vq_test`.
vq_test = []
for i in trange(len(dataset)):
    # Fetch the sample once; the original indexed `dataset[i]` four times per iteration.
    sample = dataset[i]
    mesh_data = sample['mesh_data']

    keys = tokenizer.decode(sample['text_input_ids'], skip_special_tokens=True)
    cur_save_case = {"keys": keys}
    # Drop the first element of 'svg_tensors' (presumably a start token — TODO confirm).
    zs = sample['svg_tensors'][1:]
    cur_save_case['zs_len'] = len(zs)
    with torch.no_grad():
        # NOTE(review): the positional arguments 0 and 1 are passed through as in
        # the original call; their meaning is defined by VQVAE.decode — TODO confirm.
        PI_RES = plugin_vqvae.model.decode(zs.unsqueeze(0).cuda(), 0, 1, padding_mask=None, path_interpolation=True, return_postprocess=True)[0]
        PC_RES = plugin_vqvae.model.decode(zs.unsqueeze(0).cuda(), 0, 1, padding_mask=None, path_interpolation=False, return_postprocess=True)[0]

        cur_save_case['pi_res_len'] = PI_RES.size(0)
        cur_save_case['pc_res_len'] = PC_RES.size(0)
        cur_save_case['gt_res_len'] = mesh_data.size(0)

        PI_RES_image, PI_RES_str = convert_svg(PI_RES, True)
        PC_RES_image, PC_RES_str = convert_svg(PC_RES, True)
        GOLDEN_image, GT_str = convert_svg(mesh_data, True)

        # The stored strings come from `numericalize(n=200).to_str()`, not from
        # the second value returned by convert_svg.
        cur_save_case['pi_res_str'] = PI_RES_image.numericalize(n=200).to_str()
        cur_save_case['pc_res_str'] = PC_RES_image.numericalize(n=200).to_str()
        cur_save_case['gt_str'] = GOLDEN_image.numericalize(n=200).to_str()

        PI_RES_IMAGE_PATH = os.path.join(IMAGE_DIR, f"PI_{i}.png")
        PC_RES_IMAGE_PATH = os.path.join(IMAGE_DIR, f"PC_{i}.png")
        GT_IMAGE_PATH = os.path.join(IMAGE_DIR, f"GT_{i}.png")

        PI_RES_image.save_png(PI_RES_IMAGE_PATH)
        PC_RES_image.save_png(PC_RES_IMAGE_PATH)
        GOLDEN_image.save_png(GT_IMAGE_PATH)

        cur_save_case['PI_RES_image_path'] = PI_RES_IMAGE_PATH
        cur_save_case['PC_RES_image_path'] = PC_RES_IMAGE_PATH
        cur_save_case['GT_image_path'] = GT_IMAGE_PATH

        vq_test.append(cur_save_case)

auto_save_data(vq_test, os.path.join(EVAL_DIR, "vq_test.pkl"))
        

begin to read data from /zecheng2/svg/icon-shop/test_data_snaps/test_data_all_seq_with_mesh.pkl ...
load config files from /workspace/zecheng/modelzipper/projects/custom_llama/configs/deepspeed/vqvae_config.yaml
config loaded successfully!
config: namespace(ckpt_path='/zecheng2/vqllama/vqllama_quantizer/version_8/checkpoints/last.ckpt', vqvae=namespace(levels=2, downs_t=[1, 1], strides_t=[2, 2], emb_width=4096, l_bins=8192, l_mu=0.99, spectral=0.0, multispectral=1.0, hvqvae_multipliers=[2, 1, 1], loss_fn='l2', dilation_growth_rate=1, use_nonrelative_specloss=True, use_bottleneck=True, commit=1.0, recon=1.0, linf_k=2048, use_modified_block=False), vqvae_conv_block=namespace(depth=4, width=512, m_conv=1.0, dilation_growth_rate=1, dilation_cycle=None, vqvae_reverse_decoder_dilation=True), dataset=namespace(max_path_nums=512, min_path_nums=4, pad_token_id=0, train_batch_size=128, val_batch_size=32, nworkers=16, pin_memory=False, x_channels=9, inference_mode=False, vocab_size=200, return_al

 38%|███▊      | 750/2000 [03:28<08:36,  2.42it/s]

In [32]:
# Display the first record collected in `vq_test`.
vq_test[0]

{'keys': 'Money, note, cash, bill, currency',
 'zs_len': 62,
 'pi_res_len': 189,
 'pc_res_len': 124,
 'pi_res_str': '<svg xmlns="http://www.w3.org/2000/svg" viewBox="0.0 0.0 200.0 200.0" height="200px" width="200px"><path fill="none" stroke="deepskyblue" stroke-width=".3" stroke-opacity="1.0"  filling="0" d="M94.0 46.0 L98.0 54.0"></path>\n<path fill="none" stroke="lime" stroke-width=".3" stroke-opacity="1.0"  filling="0" d="M82.0 89.0 L85.0 98.0"></path>\n<path fill="none" stroke="deeppink" stroke-width=".3" stroke-opacity="1.0"  filling="0" d="M86.0 97.0 L95.0 104.0 L97.0 110.0"></path>\n<path fill="none" stroke="gold" stroke-width=".3" stroke-opacity="1.0"  filling="0" d="M97.0 111.0 L87.0 118.0 L82.0 133.0"></path>\n<path fill="none" stroke="coral" stroke-width=".3" stroke-opacity="1.0"  filling="0" d="M81.0 133.0 C65.0 138.0 55.0 141.0 58.0 146.0 C49.0 134.0 52.0 127.0 74.0 116.0 L92.0 89.0 C62.0 55.0 65.0 52.0 107.0 73.0 C110.0 66.0 111.0 67.0 110.0 74.0 L109.0 80.0 L98.0 83.0 L9

In [19]:
# NOTE(review): same expression as the previous cell but with a lower execution
# count and a different captured output, so this output comes from another run;
# consider removing this duplicate cell.
vq_test[0]

{'keys': 'Money, note, cash, bill, currency',
 'zs_len': 62,
 'pi_res_len': 189,
 'pc_res_len': 124,
 'pi_res_str': '<svg xmlns="http://www.w3.org/2000/svg" viewBox="0.0 0.0 24.0 24.0" height="200px" width="200px"><path fill="none" stroke="deepskyblue" stroke-width=".3" stroke-opacity="1.0"  filling="0" d="M11.279999732971191 5.519999980926514 L11.760000228881836 6.480000019073486"></path>\n<path fill="none" stroke="lime" stroke-width=".3" stroke-opacity="1.0"  filling="0" d="M9.84000015258789 10.680000305175781 L10.199999809265137 11.760000228881836"></path>\n<path fill="none" stroke="deeppink" stroke-width=".3" stroke-opacity="1.0"  filling="0" d="M10.319999694824219 11.640000343322754 L11.399999618530273 12.479999542236328 L11.640000343322754 13.199999809265137"></path>\n<path fill="none" stroke="gold" stroke-width=".3" stroke-opacity="1.0"  filling="0" d="M11.640000343322754 13.319999694824219 L10.440000534057617 14.15999984741211 L9.84000015258789 15.960000038146973"></path>\n<pat

In [14]:
# Shape of `PI_RES` as left over from the last executed iteration of the decode loop.
PI_RES.shape

torch.Size([189, 9])

In [2]:
# NOTE(review): `data` is not assigned in any cell visible in this part of the
# notebook; this cell relies on kernel state and may fail after Restart & Run All.
data[0]

{'keys': ['Money', 'note', 'cash', 'bill', 'currency'],
 'zs': tensor([  82, 4071, 1379, 2577, 1424, 1791, 2885, 1875, 1875, 1347, 3367, 3230,
         1643, 1242, 2886, 1353, 2007, 2448,  602, 3053, 3472, 2881, 3036,  908,
          294, 3165, 3494, 3230, 3367, 1337, 2271, 2994,  646, 1794, 1337, 2663,
         4066, 2790, 2074, 1393, 4066, 2592, 2419,  666, 1439, 1448, 2441,  933,
         3983, 1280,  892, 2812, 3272, 3644,  380,  713, 1527, 2812, 3272, 3644,
          380,  436]),
 'level': 'short',
 'mesh_data': tensor([[  0.,   0.,   0.,   0.,   0.,   0.,   0.,   4.,  37.],
         [  2.,   4.,  37.,   4.,  70.,   4.,  70.,   6.,  67.],
         [  2.,   6.,  67.,   8.,  63.,  65.,   7.,  68.,   5.],
         [  2.,  68.,   5.,  70.,   4.,  62.,   4.,  37.,   4.],
         [  1.,  37.,   4.,   0.,   0.,   0.,   0.,   4.,   4.],
         [  1.,   4.,   4.,   0.,   0.,   0.,   0.,   4.,  37.],
         [  1.,   4.,  37.,   0.,   0.,   0.,   0.,   4.,  37.],
         [  0.,   4.,  