## 计算压缩率及统计

In [1]:
from modelzipper.tutils import *
from tqdm import trange


# Hard-coded absolute locations of the tokenizer checkpoint, the svg_paths.jsonl
# file and the predictions .pkl file for the test_1 run.
CodeLLaMA_PATH = "/zecheng2/model_hub/CodeLlama-7b-hf"
FILE_PATH = "/zecheng2/vqllama/test_vqllama_quantizer/test_1/visualized_compress_level_1/svg_paths.jsonl"
COMPRESSED_PATH = "/zecheng2/vqllama/test_vqllama_quantizer/test_1/compress_level_1_predictions.pkl"

# NOTE(review): AutoTokenizer and auto_read_data are not imported by name in this
# cell; presumably they arrive via the modelzipper.tutils star import -- confirm.
llama_tokenizer = AutoTokenizer.from_pretrained(CodeLLaMA_PATH)
# NOTE(review): `str_cotent` looks like a typo of `str_content`; the name is kept
# because later statements in this cell read it under this spelling.
str_cotent = auto_read_data(FILE_PATH)
compress_content = auto_read_data(COMPRESSED_PATH)

# count compress svg tokens
# Only the 'p_predict' and 'golden' entries of the loaded object are used below.
p_predict = compress_content['p_predict']
golden = compress_content['golden']

def count_non_pad_rows(x):
    """Count the leading rows of ``x`` that come before the first all-zero row.

    Each row must expose ``.tolist()``. Rows are scanned in order; the scan
    stops at the first row whose entries are all zero (treated as padding),
    and rows after that point are never inspected.

    Returns:
        int: number of rows seen before the first all-zero row.
    """
    kept = 0
    for row in x:
        values = row.tolist()
        # A row is padding only if its first entry is 0 and no later entry is truthy.
        is_padding = values[0] == 0 and not any(values[1:])
        if is_padding:
            break
        kept += 1

    return kept


def count_svg_tokens(batch_x):
    """Average number of SVG tokens per sample in ``batch_x``.

    batch_x is documented by the original author as shaped [b, l, 9]. Every
    non-padding row (as counted by ``count_non_pad_rows``) contributes 9
    tokens. A progress bar is shown while iterating.

    Returns:
        float: total token count divided by ``len(batch_x)``.
    """
    n_samples = len(batch_x)
    row_sum = 0
    for idx in trange(n_samples):
        row_sum += count_non_pad_rows(batch_x[idx])
    # 9 tokens per kept row, averaged over the batch.
    return (row_sum * 9) / n_samples

# Average per-sample token count (non-padding rows * 9, see count_svg_tokens)
# for the golden tensors and for the predicted tensors.
golden_svg_tokens = count_svg_tokens(golden)
p_predict_svg_tokens = count_svg_tokens(p_predict)

print("golden_svg_tokens: ", golden_svg_tokens)
print("p_predict_svg_tokens: ", p_predict_svg_tokens)


def count_str_tokens(batch_x, tokenizer: AutoTokenizer):
    """Average number of tokenizer ids per string in ``batch_x``.

    batch_x: List[str]
    tokenizer: callable returning a mapping with an 'input_ids' entry for a
        single string.

    Returns:
        float: summed length of 'input_ids' divided by ``len(batch_x)``.
    """
    n_items = len(batch_x)
    token_sum = 0
    for idx in trange(n_items):
        token_sum += len(tokenizer(batch_x[idx])['input_ids'])
    return token_sum / n_items


# Collect the 'p_svg_str' and 'g_svg_str' fields of every loaded record.
p_svg_str = [item['p_svg_str'] for item in str_cotent]
g_svg_str = [item['g_svg_str'] for item in str_cotent]

# Average tokenizer token count per string for each of the two lists.
p_svg_str_tokens = count_str_tokens(p_svg_str, llama_tokenizer)
g_svg_str_tokens = count_str_tokens(g_svg_str, llama_tokenizer)

print("p_str_tokens: ", p_svg_str_tokens)
print("golden_str_tokens: ", g_svg_str_tokens)


# The size of the last axis of the 'zs' entry is used as the codebook token count.
# NOTE(review): presumably 'zs' holds the quantizer code indices per sample -- confirm.
compress_codebook_tokens = compress_content['zs'].shape[-1]
print("compress_codebook_tokens: ", compress_codebook_tokens)

# Compression ratios: average golden token count divided by the codebook token
# count, once for the string form and once for the numerical-matrix form.
print(f"压缩率 (codebook V.S. str): {g_svg_str_tokens / compress_codebook_tokens} 倍")
print(f"压缩率 (codebook V.S. numerical matrix): {golden_svg_tokens / compress_codebook_tokens} 倍")



  from .autonotebook import tqdm as notebook_tqdm


ModelZipper is ready for launch🚀 | Current Version🦄 >>> 0.2.6 <<< | AOE Time🕒 2024-01-05 15:37:11


100%|██████████| 2000/2000 [00:01<00:00, 1668.19it/s]
100%|██████████| 2000/2000 [00:01<00:00, 1270.35it/s]


golden_svg_tokens:  811.4895
p_predict_svg_tokens:  4608.0


100%|██████████| 2000/2000 [00:44<00:00, 45.07it/s]
100%|██████████| 2000/2000 [00:10<00:00, 183.02it/s]

p_str_tokens:  22720.927
golden_str_tokens:  7261.193
compress_codebook_tokens:  256
压缩率 (codebook V.S. str): 28.36403515625 倍
压缩率 (codebook V.S. numerical matrix): 3.169880859375 倍





## 打印输出结果

In [27]:
# Take the first 10 rows of sample 0 from each of the three entries.
# NOTE(review): this rebinds `golden` and `p_predict`, which other cells use for
# the full collections, so results depend on cell execution order; `raw` is not
# used in the visible part of this cell.
raw = compress_content.get('raw_predict')[0][: 10]
golden = compress_content.get('golden')[0][: 10]
p_predict = compress_content.get('p_predict')[0][: 10]


def q_p(x):
    """Print each row of ``x`` as a plain Python list, one row per output line.

    Every element of ``x`` must expose ``.tolist()``.
    """
    for row in x:
        as_list = row.tolist()
        print(as_list)


# Print the sliced prediction rows, one Python list per line.
q_p(p_predict)

[0, 0, 0, 0, 0, 0, 0, 4, 105]
[1, 4, 105, 0, 0, 0, 0, 4, 200]
[1, 4, 200, 0, 0, 0, 0, 199, 199]
[1, 199, 199, 0, 0, 0, 0, 200, 4]
[1, 200, 4, 0, 0, 0, 0, 5, 4]
[1, 5, 4, 0, 0, 0, 0, 4, 101]
[1, 4, 101, 0, 0, 0, 0, 1, 103]
[0, 1, 103, 0, 0, 0, 0, 148, 89]
[1, 148, 89, 0, 0, 0, 0, 152, 161]
[1, 152, 161, 0, 0, 0, 0, 52, 170]


## 找到对应的SVG Path

In [11]:
from modelzipper.tutils import *
from tqdm import trange

# Directory that receives the per-sample text dumps written by the loop below.
ANALYSIS_DIR = "/zecheng2/vqllama/test_vqllama_quantizer/test_0/analysis"

# Hard-coded absolute locations for the test_0 run.
CodeLLaMA_PATH = "/zecheng2/model_hub/CodeLlama-7b-hf"
FILE_PATH = "/zecheng2/vqllama/test_vqllama_quantizer/test_0/visualized_compress_level_1/svg_paths.jsonl"
COMPRESSED_PATH = "/zecheng2/vqllama/test_vqllama_quantizer/test_0/compress_level_1_predictions.pkl"

# NOTE(review): llama_tokenizer and str_content are loaded here but not read in
# the visible remainder of this cell -- confirm whether they are still needed.
llama_tokenizer = AutoTokenizer.from_pretrained(CodeLLaMA_PATH)
str_content = auto_read_data(FILE_PATH)
compress_content = auto_read_data(COMPRESSED_PATH)

raw_predict = compress_content['raw_predict']
p_predict = compress_content['p_predict']
golden = compress_content['golden']

# Indices of the samples to dump; each is used to index the three collections above.
FILE_ID = [36, 66, 72, 80]


def convert_tensor_to_str(x):
    """Render ``x`` as text, one bracketed row per line.

    Each ``x[i]`` must expose ``.tolist()`` and yield integers: every value is
    formatted with the '5d' spec (right-aligned, width 5), values are joined
    by commas, and each row is wrapped in ``[...]`` followed by a newline.

    Returns:
        str: the concatenated rows; an empty string when ``x`` has no rows.
    """
    rendered_rows = []
    for idx in range(len(x)):
        cells = [format(value, '5d') for value in x[idx].tolist()]
        rendered_rows.append("[" + ",".join(cells) + "]" + "\n")
    return "".join(rendered_rows)



# TEMPLATE = "raw predict:\n{raw_predict}\n\np predict:\n{p_predict}\n\ngolden:\n{golden}\n\n"

# For every selected sample, write three text files into ANALYSIS_DIR:
# analysis_{i}_raw_predict.txt, analysis_{i}_p_predict.txt and analysis_{i}_golden.txt.
for i in FILE_ID:
    # Map each file-name tag to the tensor that belongs in that file. Driving the
    # saves from this mapping fixes the earlier copy-paste bug where all three
    # files received the raw_predict dump.
    tensors_by_tag = {
        "raw_predict": raw_predict[i],
        "p_predict": p_predict[i],
        "golden": golden[i],
    }
    for tag, tensor in tensors_by_tag.items():
        auto_save_data(
            [convert_tensor_to_str(tensor)],
            os.path.join(ANALYSIS_DIR, f"analysis_{i}_{tag}.txt"),
        )
    # s = TEMPLATE.format(raw_predict=s_raw_predict, p_predict=s_p_predict, golden=s_golden)
    # auto_save_data([s], os.path.join(ANALYSIS_DIR, f"analysis_{i}.txt"))

txt file saved successfully!
Save file to /zecheng2/vqllama/test_vqllama_quantizer/test_0/analysis/analysis_36_raw_predict.txt | len: 1
txt file saved successfully!
Save file to /zecheng2/vqllama/test_vqllama_quantizer/test_0/analysis/analysis_36_p_predict.txt | len: 1
txt file saved successfully!
Save file to /zecheng2/vqllama/test_vqllama_quantizer/test_0/analysis/analysis_36_golden.txt | len: 1
txt file saved successfully!
Save file to /zecheng2/vqllama/test_vqllama_quantizer/test_0/analysis/analysis_66_raw_predict.txt | len: 1
txt file saved successfully!
Save file to /zecheng2/vqllama/test_vqllama_quantizer/test_0/analysis/analysis_66_p_predict.txt | len: 1
txt file saved successfully!
Save file to /zecheng2/vqllama/test_vqllama_quantizer/test_0/analysis/analysis_66_golden.txt | len: 1
txt file saved successfully!
Save file to /zecheng2/vqllama/test_vqllama_quantizer/test_0/analysis/analysis_72_raw_predict.txt | len: 1
txt file saved successfully!
Save file to /zecheng2/vqllama/te

## 统计数据集平均长度，最长长度和最短长度

In [3]:
from modelzipper.tutils import *
from tqdm import trange

# Dataset file to summarise. The .pkl suffix suggests a pickle file -- only load
# it from a trusted source.
FILE_PATH = "/zecheng2/svg/icon-shop/mesh_data_svg_convert_p.pkl"

# NOTE(review): auto_read_data presumably comes from the modelzipper.tutils star import -- confirm.
content = auto_read_data(FILE_PATH)


# Compute the average / max / min length of every sample's 'mesh_data' and a
# histogram of lengths in fixed-width buckets, then print the summary.

# Width of each histogram bucket.
BIN_WIDTH = 100

if len(content) == 0:
    # Fail with a clear message instead of a bare ZeroDivisionError further down.
    raise ValueError("content is empty; cannot compute length statistics")

total_length, max_length = 0, 0
# Start from +inf rather than a magic 1000 so the minimum stays correct even
# when every sample is longer than 1000.
min_length = float("inf")
interval_counts = {}  # bucket lower bound -> number of samples in that bucket

for i in trange(len(content)):
    svg_data = content[i]['mesh_data']
    length = len(svg_data)
    total_length += length
    max_length = max(max_length, length)
    min_length = min(min_length, length)

    # Find the bucket the current length falls into and bump its counter.
    interval = (length // BIN_WIDTH) * BIN_WIDTH
    interval_counts[interval] = interval_counts.get(interval, 0) + 1

avg_length = total_length / len(content)

print(f"avg_length: {avg_length}")
print(f"max_length: {max_length}")
print(f"min_length: {min_length}")

# Print the number of samples in each bucket, in ascending bucket order.
for k in sorted(interval_counts):
    print(f"区间 {k} 到 {k + BIN_WIDTH - 1} 的条数: {interval_counts[k]}")

100%|██████████| 347000/347000 [00:00<00:00, 438812.38it/s]

avg_length: 98.70044380403458
max_length: 750
min_length: 7
区间 0 到 99 的条数: 228294
区间 100 到 199 的条数: 92352
区间 200 到 299 的条数: 17245
区间 300 到 399 的条数: 5117
区间 400 到 499 的条数: 2114
区间 500 到 599 的条数: 1054
区间 600 到 699 的条数: 608
区间 700 到 799 的条数: 216





In [21]:
# Peek at the first 30 rows of the first sample's 'mesh_data'.
content[0]['mesh_data'][:30]

tensor([[  0.,   0.,   0.,   0.,   0.,   0.,   0.,   4., 104.],
        [  1.,   4., 104.,   0.,   0.,   0.,   0.,   4., 199.],
        [  1.,   4., 199.,   0.,   0.,   0.,   0., 199., 199.],
        [  1., 199., 199.,   0.,   0.,   0.,   0., 199.,   4.],
        [  1., 199.,   4.,   0.,   0.,   0.,   0.,   4.,   4.],
        [  1.,   4.,   4.,   0.,   0.,   0.,   0.,   4., 104.],
        [  1.,   4., 104.,   0.,   0.,   0.,   0.,   4., 104.],
        [  0.,   4., 104.,   0.,   0.,   0.,   0., 156., 104.],
        [  1., 156., 104.,   0.,   0.,   0.,   0., 156., 193.],
        [  1., 156., 193.,   0.,   0.,   0.,   0.,  52., 193.],
        [  1.,  52., 193.,   0.,   0.,   0.,   0.,  52.,  15.],
        [  1.,  52.,  15.,   0.,   0.,   0.,   0., 156.,  15.],
        [  1., 156.,  15.,   0.,   0.,   0.,   0., 156., 104.],
        [  1., 156., 104.,   0.,   0.,   0.,   0., 156., 104.],
        [  0., 156., 104.,   0.,   0.,   0.,   0.,  96.,  23.],
        [  2.,  96.,  23.,  93.,  25.,  