## 计算压缩率及统计

In [16]:
from modelzipper.tutils import *
from tqdm import trange


# Local paths: the CodeLlama checkpoint directory (used here only to load a
# tokenizer), a JSONL file of SVG path strings, and a pickle of predictions
# for compress level 3.
CodeLLaMA_PATH = "/zecheng2/model_hub/CodeLlama-7b-hf"
FILE_PATH = "/zecheng2/vqllama/test_vqllama_quantizer/test_0/visualized_compress_level_3/svg_paths.jsonl"
COMPRESSED_PATH = "/zecheng2/vqllama/test_vqllama_quantizer/test_0/compress_level_3_predictions.pkl"

# AutoTokenizer and auto_read_data are not imported by name in this cell;
# presumably they come from the wildcard modelzipper.tutils import -- TODO confirm.
llama_tokenizer = AutoTokenizer.from_pretrained(CodeLLaMA_PATH)
# NOTE: "str_cotent" is a misspelling of "str_content"; the name is kept
# because later code reads it.
str_cotent = auto_read_data(FILE_PATH)
compress_content = auto_read_data(COMPRESSED_PATH)

# Pull the predicted and reference ("golden") entries out of the loaded
# predictions; they are the inputs to the SVG token counts below.
p_predict = compress_content['p_predict']
golden = compress_content['golden']

def count_non_pad_rows(x):
    """Count the rows of ``x`` that come before the first padding row.

    A row is treated as padding when its first element equals 0 and none of
    the remaining elements is truthy (i.e. the row is all zeros). Counting
    stops at the first such row; anything after it is ignored.

    Args:
        x: An iterable of rows, each exposing ``.tolist()``.

    Returns:
        The number of rows preceding the first all-zero row, or the total
        number of rows when no all-zero row exists.
    """
    kept = 0
    for row in x:
        values = row.tolist()
        # A leading 0 alone is not padding; the rest of the row must be zero too.
        if values[0] == 0 and not any(values[1:]):
            break
        kept += 1
    return kept


def count_svg_tokens(batch_x):
    """Return the average number of SVG tokens per sample in ``batch_x``.

    Each non-padding row (as counted by ``count_non_pad_rows``) contributes
    9 tokens. A progress bar is shown while iterating.

    Args:
        batch_x: Indexable batch of samples, documented as ``[b, l, 9]``.

    Returns:
        Total token count divided by ``len(batch_x)``.
    """
    num_samples = len(batch_x)
    valid_rows = 0
    for idx in trange(num_samples):
        valid_rows += count_non_pad_rows(batch_x[idx])
    # Every counted row holds 9 values, each counted as one token.
    return valid_rows * 9 / num_samples

# Average SVG token count per sample for the reference ("golden") and the
# predicted entries.
golden_svg_tokens = count_svg_tokens(golden)
p_predict_svg_tokens = count_svg_tokens(p_predict)

print("golden_svg_tokens: ", golden_svg_tokens)
print("p_predict_svg_tokens: ", p_predict_svg_tokens)


def count_str_tokens(batch_x, tokenizer: AutoTokenizer):
    """Return the average tokenized length of the strings in ``batch_x``.

    Each string is passed to ``tokenizer`` on its own and the length of the
    resulting ``'input_ids'`` is accumulated. A progress bar is shown while
    iterating.

    Args:
        batch_x: Indexable collection of strings (``List[str]``).
        tokenizer: Callable returning a mapping with an ``'input_ids'`` entry.

    Returns:
        Total number of input ids divided by ``len(batch_x)``.
    """
    num_strings = len(batch_x)
    token_total = 0
    for idx in trange(num_strings):
        encoded = tokenizer(batch_x[idx])
        # NOTE(review): any special tokens the tokenizer adds are counted too.
        token_total += len(encoded['input_ids'])
    return token_total / num_strings


# Collect the predicted ('p_svg_str') and reference ('g_svg_str') SVG strings
# from every loaded record.
p_svg_str = [item['p_svg_str'] for item in str_cotent]
g_svg_str = [item['g_svg_str'] for item in str_cotent]

# Average tokenizer token count per string for each set.
p_svg_str_tokens = count_str_tokens(p_svg_str, llama_tokenizer)
g_svg_str_tokens = count_str_tokens(g_svg_str, llama_tokenizer)

print("p_str_tokens: ", p_svg_str_tokens)
print("golden_str_tokens: ", g_svg_str_tokens)


# The size of the last dimension of 'zs' is used as the number of codebook
# tokens per sample (presumably 'zs' holds the quantizer codes -- TODO confirm).
compress_codebook_tokens = compress_content['zs'].shape[-1]
print("compress_codebook_tokens: ", compress_codebook_tokens)

# Compression ratios: average reference token count (string form, then
# numerical-matrix form) divided by the codebook token count.
print(f"压缩率 (codebook V.S. str): {g_svg_str_tokens / compress_codebook_tokens} 倍")
print(f"压缩率 (codebook V.S. numerical matrix): {golden_svg_tokens / compress_codebook_tokens} 倍")



100%|██████████| 2000/2000 [00:01<00:00, 1608.13it/s]
100%|██████████| 2000/2000 [00:01<00:00, 1155.87it/s]


golden_svg_tokens:  862.551
p_predict_svg_tokens:  4541.094


100%|██████████| 2000/2000 [00:04<00:00, 403.78it/s]
100%|██████████| 2000/2000 [00:05<00:00, 399.71it/s]

p_str_tokens:  2551.7985
golden_str_tokens:  2552.1795
compress_codebook_tokens:  64
压缩率 (codebook V.S. str): 39.8778046875 倍
压缩率 (codebook V.S. numerical matrix): 13.477359375 倍





## 打印输出结果

In [27]:
# Take the first 10 rows of the first sample from each entry for inspection.
# NOTE: this rebinds `golden` and `p_predict`, which earlier held the full
# entries from compress_content; re-running the earlier statistics after this
# cell would operate on these slices instead.
raw = compress_content.get('raw_predict')[0][: 10]
golden = compress_content.get('golden')[0][: 10]
p_predict = compress_content.get('p_predict')[0][: 10]


def q_p(x):
    """Print every row of ``x`` as a plain Python list, one row per line.

    Args:
        x: An iterable of rows, each exposing ``.tolist()``.
    """
    for row in x:
        values = row.tolist()
        print(values)


# Print the sliced predicted rows, one list per line.
q_p(p_predict)

[0, 0, 0, 0, 0, 0, 0, 4, 105]
[1, 4, 105, 0, 0, 0, 0, 4, 200]
[1, 4, 200, 0, 0, 0, 0, 199, 199]
[1, 199, 199, 0, 0, 0, 0, 200, 4]
[1, 200, 4, 0, 0, 0, 0, 5, 4]
[1, 5, 4, 0, 0, 0, 0, 4, 101]
[1, 4, 101, 0, 0, 0, 0, 1, 103]
[0, 1, 103, 0, 0, 0, 0, 148, 89]
[1, 148, 89, 0, 0, 0, 0, 152, 161]
[1, 152, 161, 0, 0, 0, 0, 52, 170]
