In [5]:
import os
import sys
import pickle
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image
from tqdm import tqdm
from utils.format_translate import html_to_table
from utils.utils import format_table, format_table_1, format_layout, format_tokens, remove_empty_bboxes, get_html
import json
from pathlib import Path
from glob import glob
import shutil



In [6]:
# result_file = "./output/structure_result/test_A/structure_master_results_0.pkl"
# result_file_dir = os.path.dirname(result_file)

# Name of the dataset split being converted. It is interpolated into both the
# prediction directory and the save directory so the two stay in sync.
DATASET = "test_A_jpg480max"
# Suffix of the prediction directory that identifies which run to read.
RUN_TAG = "10fold0_valid97.68"
# Directory containing the structure_master_results_*.pkl prediction files.
# NOTE(review): this is rooted at "../output" while the save directory below is
# rooted at "./output" — confirm that the difference is intended.
PRED_PKL_DIR = f"../output/structure_result/{DATASET}_{RUN_TAG}"

# Output path. Despite the ".json" suffix this is a directory: one
# "<img_id>-pred.json" file per image is written into it.
pred_save_dir = f"./output/structure_result/{DATASET}-pred.json"
# Wipe any previous results so the directory only holds files from this run.
if os.path.exists(pred_save_dir):
    shutil.rmtree(pred_save_dir)
Path(pred_save_dir).mkdir(parents=True, exist_ok=True)


def check_1(table):
    """Sanity-check the layout of a small table.

    Reads ``table['layout']`` and computes ``layout.max() + 1``. When that
    value is greater than 16 the table is skipped without any check.
    Otherwise the layout must not contain the value -1.

    Args:
        table: mapping with a ``'layout'`` entry that supports ``.max()`` and
            element-wise comparison (a numpy array in practice).

    Returns:
        None in every case.

    Raises:
        AssertionError: if a checked layout contains -1.
    """
    grid = table['layout']
    cell_count = grid.max() + 1
    # Only layouts that are not "large" (more than 16) get validated.
    if not (cell_count > 16):
        assert np.all(grid != -1)

# Convert every prediction pickle under PRED_PKL_DIR into one JSON file per
# image inside pred_save_dir. Sorting keeps the processing order deterministic.
pkls = sorted(glob(os.path.join(PRED_PKL_DIR, "structure_master_results_*.pkl")))
html_to_table_error_cnt = 0
format_table_error_cnt = 0
for result_file in pkls:
    # NOTE(review): pickle.load can execute arbitrary code while loading; only
    # point PRED_PKL_DIR at files produced by a trusted run.
    with open(result_file, 'rb') as f:
        result_data = pickle.load(f)

    for imgname, result in tqdm(result_data.items()):
        bboxes = remove_empty_bboxes(result['bbox'])
        tokens_list = format_tokens(result['text'])

        html = get_html(tokens_list, bboxes)  # use pred data -> html format

        # Catch Exception rather than everything, so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit still stop the loop.
        try:
            table = html_to_table(html, check=False)
        except Exception:
            html_to_table_error_cnt += 1
            print("html_to_table error", imgname)
            # No table could be built for this image, so no JSON is written.
            continue

        try:
            table_new = format_table(table)
        except Exception:
            format_table_error_cnt += 1
            print("format_table error", imgname)
            print(result['text'])
            # Remove the -1 entries from the layout, then format again.
            # A failure on this second attempt is deliberately not caught.
            table = format_table_1(table)
            table_new = format_table(table)

        img_id = imgname.split(".")[0]
        save_path = os.path.join(pred_save_dir, f'{img_id}-pred.json')

        # The context manager closes the file handle for every image.
        # ensure_ascii=False emits raw non-ASCII characters, so the encoding is
        # pinned to UTF-8 instead of relying on the platform default.
        with open(save_path, 'w', encoding='utf-8') as out_file:
            json.dump(table_new, out_file, indent=4, ensure_ascii=False)

print("html_to_table_error_cnt", html_to_table_error_cnt)
print("format_table_error_cnt", format_table_error_cnt)

 66%|██████▌   | 1703/2593 [00:12<00:06, 141.35it/s]

format_table error 04516.jpg
<thead>,</thead>,<tbody>,<tr>,<td, colspan="4",>,</td>,<td, colspan="4",>,</td>,<td, colspan="4",>,</td>,<td, colspan="3",>,</td>,<td, colspan="2",>,</td>,<td, colspan="2",>,</td>,<td, colspan="2",>,</td>,<td, colspan="3",>,</td>,<td, colspan="4",>,</td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td, rowspan="4",>,</td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<td></td>,<eb></eb>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<eb></eb>,<eb></eb>,<td></td

 83%|████████▎ | 2158/2593 [00:16<00:02, 147.23it/s]

format_table error 03175.jpg
<thead>,</thead>,<tbody>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<eb></eb>,<eb></eb>,<eb></eb>,

 91%|█████████ | 2365/2593 [00:17<00:02, 89.83it/s] 

format_table error 03866.jpg
<thead>,</thead>,<tbody>,<tr>,<td, rowspan="3",>,</td>,<td, rowspan="3",>,</td>,<td, rowspan="3",>,</td>,<td, rowspan="3",>,</td>,<td, colspan="9",>,</td>,<td, colspan="9",>,</td>,<td, rowspan="2",>,</td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<eb></eb>,</tr>,<tr>,<td, rowspan="2",>,</td>,<td, rowspan="2",>,</td>,<td></td>,<td></td>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,</t

100%|██████████| 2593/2593 [00:19<00:00, 132.14it/s]
 58%|█████▊    | 1508/2594 [00:11<00:07, 148.49it/s]

format_table error 04166.jpg
<thead>,</thead>,<tbody>,<tr>,<td, rowspan="3",>,</td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td, rowspan="3",>,</td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td, rowspan="2",>,</td>,<td></td>

100%|██████████| 2594/2594 [00:20<00:00, 129.24it/s]

html_to_table_error_cnt 0
format_table_error_cnt 4



