In [1]:
import os
import sys
import pickle
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image
from tqdm import tqdm
from utils.format_translate import html_to_table
from utils.utils import format_table, format_table_1, format_layout, format_tokens, remove_empty_bboxes, get_html
import json
from pathlib import Path
from glob import glob
import shutil



In [2]:
# Earlier single-file configuration, kept for reference:
# result_file = "./output/structure_result/test_A/structure_master_results_0.pkl"
# result_file_dir = os.path.dirname(result_file)

# Dataset tag; it is interpolated into the output location below.
DATASET = "test_A_jpg480max_wire"

# Directory that is globbed for the pickled prediction shards.
# NOTE(review): this path starts with "../" while the output path starts
# with "./" -- confirm both resolve as intended from the notebook's cwd.
PRED_PKL_DIR = f"../output/structure_result/test_A_jpg480max_wire_10fold1_valid97.58"

# Output path. Despite the ".json" suffix this is used as a DIRECTORY:
# one "<img_id>-pred.json" file per image is written inside it.
pred_save_dir = f"./output/structure_result/{DATASET}-pred.json"

# Start every run from an empty output directory so that stale predictions
# from a previous run cannot leak into the results.
if os.path.exists(pred_save_dir):
    shutil.rmtree(pred_save_dir)
os.makedirs(pred_save_dir, exist_ok=True)


def check_1(table):
    """Sanity-check that a small table's layout contains no ``-1`` entries.

    Args:
        table: Mapping with a ``'layout'`` entry that supports ``.max()`` and
            element-wise comparison (presumably a NumPy integer array of
            cell indices -- TODO confirm against ``html_to_table``).

    Returns:
        None. Layouts whose ``max() + 1`` exceeds 16 are skipped unchecked.

    Raises:
        AssertionError: if a checked layout contains ``-1`` anywhere.
    """
    grid = table['layout']
    # max() + 1 is presumably the number of cells when indices start at 0.
    cell_total = grid.max() + 1
    if not cell_total > 16:
        assert np.all(grid != -1)

# Convert every pickled prediction shard into one "<img_id>-pred.json" file
# per image inside pred_save_dir, counting the two kinds of failure.
pkls = sorted(glob(os.path.join(PRED_PKL_DIR, "structure_master_results_*.pkl")))
html_to_table_error_cnt = 0  # images skipped because html_to_table raised
format_table_error_cnt = 0   # images that needed the format_table_1 fallback
for result_file in pkls:
    # NOTE(security): pickle.load can execute arbitrary code. Only point
    # PRED_PKL_DIR at shards produced by a trusted pipeline.
    with open(result_file, 'rb') as f:
        result_data = pickle.load(f)

    for imgname, result in tqdm(result_data.items()):
        # Strip only the final extension. Splitting on the first "." would
        # truncate names such as "a.b.jpg" and could make outputs collide.
        img_id = os.path.splitext(imgname)[0]

        bboxes = remove_empty_bboxes(result['bbox'])
        tokens_list = format_tokens(result['text'])

        html = get_html(tokens_list, bboxes)  # use pred data -> html format

        try:
            table = html_to_table(html, check=True)
        except Exception:
            # Catch Exception rather than using a bare except, so Ctrl-C
            # (KeyboardInterrupt) and SystemExit still stop the run.
            html_to_table_error_cnt += 1
            print("html_to_table error", imgname)
            continue

        # fill_first_row_minus1(table)
        # fill_last_row_minus1(table)
        # fill_first_col_minus1(table)

        try:
            table_new = format_table(table)
        except Exception:
            format_table_error_cnt += 1
            print("format_table error", imgname)
            print(result['text'])
            # Fallback: remove the -1 entries from the layout, then retry.
            # A failure here is intentionally not caught and aborts the run.
            table = format_table_1(table)
            table_new = format_table(table)

        save_path = os.path.join(pred_save_dir, f'{img_id}-pred.json')

        # The context manager guarantees the handle is flushed and closed.
        # Explicit UTF-8 is needed because ensure_ascii=False writes
        # non-ASCII characters verbatim.
        with open(save_path, 'w', encoding='utf-8') as out_f:
            json.dump(table_new, out_f, indent=4, ensure_ascii=False)

print("html_to_table_error_cnt", html_to_table_error_cnt)
print("format_table_error_cnt", format_table_error_cnt)


  3%|▎         | 33/1183 [00:00<00:06, 165.76it/s]

html_to_table error 04016.jpg


  4%|▍         | 50/1183 [00:00<00:07, 161.52it/s]

html_to_table error 04231.jpg
html_to_table error 03348.jpg
html_to_table error 04452.jpg


 10%|▉         | 116/1183 [00:00<00:07, 141.32it/s]

html_to_table error 03722.jpg
html_to_table error 03762.jpg
html_to_table error 00205.jpg
html_to_table error 03573.jpg
html_to_table error 04176.jpg
html_to_table error 03648.jpg
html_to_table error 05131.jpg
html_to_table error 04021.jpg


 13%|█▎        | 155/1183 [00:01<00:06, 162.98it/s]

html_to_table error 05157.jpg
html_to_table error 04769.jpg
html_to_table error 03948.jpg
html_to_table error 04504.jpg


 16%|█▋        | 194/1183 [00:01<00:05, 165.15it/s]

html_to_table error 04336.jpg
html_to_table error 04554.jpg


 18%|█▊        | 211/1183 [00:01<00:06, 140.37it/s]

html_to_table error 04333.jpg
html_to_table error 02859.jpg
html_to_table error 02879.jpg


 22%|██▏       | 264/1183 [00:01<00:05, 155.62it/s]

html_to_table error 04295.jpg
html_to_table error 03495.jpg
html_to_table error 03818.jpg
html_to_table error 03746.jpg
html_to_table error 03651.jpg


 25%|██▌       | 297/1183 [00:02<00:06, 137.92it/s]

html_to_table error 03007.jpg
html_to_table error 04188.jpg


 28%|██▊       | 330/1183 [00:02<00:05, 145.53it/s]

html_to_table error 03765.jpg
html_to_table error 03424.jpg
html_to_table error 04312.jpg
html_to_table error 04139.jpg


 32%|███▏      | 375/1183 [00:02<00:04, 180.73it/s]

html_to_table error 05117.jpg
html_to_table error 03784.jpg
html_to_table error 04926.jpg


 35%|███▍      | 412/1183 [00:02<00:04, 162.25it/s]

html_to_table error 03678.jpg
html_to_table error 05023.jpg
html_to_table error 03026.jpg
html_to_table error 04527.jpg
html_to_table error 03005.jpg


 38%|███▊      | 445/1183 [00:02<00:05, 145.47it/s]

html_to_table error 03695.jpg
html_to_table error 04727.jpg


 40%|████      | 477/1183 [00:03<00:05, 133.58it/s]

html_to_table error 04989.jpg


 43%|████▎     | 512/1183 [00:03<00:04, 146.14it/s]

html_to_table error 04958.jpg
html_to_table error 04290.jpg


 46%|████▌     | 542/1183 [00:03<00:04, 135.58it/s]

html_to_table error 05073.jpg
html_to_table error 03159.jpg
html_to_table error 03182.jpg


 50%|█████     | 594/1183 [00:04<00:03, 155.03it/s]

html_to_table error 03013.jpg
html_to_table error 03643.jpg
html_to_table error 03895.jpg
html_to_table error 04882.jpg


 53%|█████▎    | 626/1183 [00:04<00:03, 146.27it/s]

html_to_table error 03291.jpg
html_to_table error 04690.jpg
html_to_table error 03798.jpg
html_to_table error 04099.jpg
html_to_table error 03576.jpg


 55%|█████▌    | 655/1183 [00:04<00:04, 108.31it/s]

html_to_table error 03092.jpg
html_to_table error 04206.jpg


 60%|█████▉    | 709/1183 [00:04<00:03, 143.22it/s]

html_to_table error 03122.jpg


 64%|██████▎   | 752/1183 [00:05<00:03, 118.12it/s]

html_to_table error 05089.jpg
html_to_table error 03534.jpg
html_to_table error 04942.jpg


 67%|██████▋   | 797/1183 [00:05<00:02, 136.38it/s]

html_to_table error 04721.jpg
html_to_table error 04156.jpg
html_to_table error 04516.jpg
html_to_table error 03527.jpg
html_to_table error 04237.jpg


 70%|███████   | 832/1183 [00:05<00:02, 147.89it/s]

html_to_table error 03507.jpg
html_to_table error 03806.jpg
html_to_table error 03480.jpg


 73%|███████▎  | 866/1183 [00:06<00:02, 128.99it/s]

html_to_table error 03951.jpg
html_to_table error 03047.jpg
html_to_table error 03060.jpg
html_to_table error 04405.jpg


 76%|███████▌  | 896/1183 [00:06<00:02, 136.72it/s]

html_to_table error 04118.jpg
html_to_table error 02926.jpg
html_to_table error 03615.jpg
html_to_table error 03536.jpg
html_to_table error 03055.jpg
html_to_table error 04373.jpg


 78%|███████▊  | 917/1183 [00:06<00:01, 155.54it/s]

html_to_table error 04548.jpg


 82%|████████▏ | 971/1183 [00:06<00:01, 159.20it/s]

html_to_table error 04966.jpg
html_to_table error 03079.jpg
html_to_table error 03175.jpg
html_to_table error 03864.jpg


 89%|████████▉ | 1051/1183 [00:07<00:00, 157.35it/s]

html_to_table error 03112.jpg
html_to_table error 04571.jpg
html_to_table error 04639.jpg
html_to_table error 03214.jpg
html_to_table error 03474.jpg
html_to_table error 03088.jpg


 92%|█████████▏| 1090/1183 [00:07<00:00, 162.90it/s]

html_to_table error 03696.jpg
html_to_table error 05006.jpg
html_to_table error 04197.jpg
html_to_table error 03866.jpg
html_to_table error 03753.jpg


100%|██████████| 1183/1183 [00:08<00:00, 142.40it/s]


html_to_table error 04177.jpg
html_to_table error 04085.jpg
html_to_table error 03764.jpg


  3%|▎         | 36/1184 [00:00<00:08, 137.99it/s]

html_to_table error 03766.jpg
html_to_table error 03158.jpg
html_to_table error 04120.jpg
html_to_table error 04616.jpg


  6%|▌         | 69/1184 [00:00<00:07, 151.00it/s]

html_to_table error 03780.jpg


 10%|▉         | 115/1184 [00:00<00:08, 129.08it/s]

html_to_table error 03712.jpg
html_to_table error 03031.jpg
html_to_table error 03468.jpg
html_to_table error 03592.jpg


 12%|█▏        | 146/1184 [00:01<00:07, 139.48it/s]

html_to_table error 04792.jpg
html_to_table error 03377.jpg


 15%|█▍        | 175/1184 [00:01<00:07, 133.03it/s]

html_to_table error 02989.jpg
html_to_table error 04133.jpg
html_to_table error 04408.jpg
html_to_table error 03329.jpg
html_to_table error 04988.jpg
html_to_table error 04430.jpg


 17%|█▋        | 204/1184 [00:01<00:07, 131.37it/s]

html_to_table error 04094.jpg
html_to_table error 03246.jpg
html_to_table error 04932.jpg
html_to_table error 03835.jpg


 20%|██        | 239/1184 [00:01<00:06, 144.15it/s]

html_to_table error 04820.jpg
html_to_table error 03549.jpg
html_to_table error 04403.jpg
html_to_table error 03394.jpg
html_to_table error 05039.jpg


 24%|██▍       | 284/1184 [00:02<00:06, 139.94it/s]

html_to_table error 03562.jpg
html_to_table error 03815.jpg


 27%|██▋       | 316/1184 [00:02<00:05, 148.17it/s]

html_to_table error 04453.jpg
html_to_table error 03245.jpg
html_to_table error 03769.jpg


 31%|███       | 365/1184 [00:02<00:05, 152.85it/s]

html_to_table error 03072.jpg


 33%|███▎      | 396/1184 [00:02<00:06, 122.96it/s]

html_to_table error 03012.jpg
html_to_table error 04491.jpg


 36%|███▌      | 425/1184 [00:03<00:06, 115.03it/s]

html_to_table error 04228.jpg


 39%|███▉      | 460/1184 [00:03<00:05, 132.83it/s]

html_to_table error 02858.jpg
html_to_table error 02916.jpg
html_to_table error 04067.jpg
html_to_table error 04070.jpg


 43%|████▎     | 508/1184 [00:03<00:05, 116.70it/s]

html_to_table error 03754.jpg


 45%|████▌     | 537/1184 [00:04<00:05, 117.10it/s]

html_to_table error 03947.jpg
html_to_table error 04298.jpg


 49%|████▉     | 585/1184 [00:04<00:04, 131.84it/s]

html_to_table error 03228.jpg
html_to_table error 03299.jpg
html_to_table error 04788.jpg


 54%|█████▍    | 644/1184 [00:04<00:03, 135.04it/s]

html_to_table error 04129.jpg


 58%|█████▊    | 685/1184 [00:05<00:02, 166.95it/s]

html_to_table error 03277.jpg
html_to_table error 05167.jpg
html_to_table error 03577.jpg
html_to_table error 04502.jpg
html_to_table error 03216.jpg
html_to_table error 03242.jpg


 62%|██████▏   | 735/1184 [00:05<00:03, 132.11it/s]

html_to_table error 02884.jpg
html_to_table error 03438.jpg
html_to_table error 03336.jpg


 65%|██████▍   | 767/1184 [00:05<00:03, 135.14it/s]

html_to_table error 03472.jpg
html_to_table error 04299.jpg
html_to_table error 02862.jpg
html_to_table error 03334.jpg


 67%|██████▋   | 797/1184 [00:05<00:02, 132.46it/s]

html_to_table error 04638.jpg
html_to_table error 03301.jpg
html_to_table error 05007.jpg
html_to_table error 04608.jpg


 73%|███████▎  | 868/1184 [00:06<00:01, 162.25it/s]

html_to_table error 03170.jpg
html_to_table error 04277.jpg
html_to_table error 04006.jpg
html_to_table error 03589.jpg
html_to_table error 03233.jpg
html_to_table error 03087.jpg
html_to_table error 03822.jpg


 76%|███████▌  | 901/1184 [00:06<00:02, 139.39it/s]

html_to_table error 02940.jpg
html_to_table error 03050.jpg
html_to_table error 03607.jpg


 79%|███████▉  | 935/1184 [00:06<00:01, 149.43it/s]

html_to_table error 03962.jpg
html_to_table error 03281.jpg
html_to_table error 05148.jpg


 83%|████████▎ | 986/1184 [00:07<00:01, 150.28it/s]

html_to_table error 04603.jpg
html_to_table error 05173.jpg
html_to_table error 03506.jpg


 86%|████████▌ | 1017/1184 [00:07<00:01, 126.37it/s]

html_to_table error 04192.jpg
html_to_table error 02964.jpg


 88%|████████▊ | 1045/1184 [00:07<00:01, 119.26it/s]

html_to_table error 03532.jpg
html_to_table error 03590.jpg


 91%|█████████ | 1076/1184 [00:07<00:00, 123.31it/s]

html_to_table error 03624.jpg
html_to_table error 02992.jpg


 93%|█████████▎| 1105/1184 [00:08<00:00, 132.82it/s]

html_to_table error 04445.jpg
html_to_table error 02899.jpg
html_to_table error 02889.jpg
html_to_table error 03176.jpg


 96%|█████████▋| 1140/1184 [00:08<00:00, 137.24it/s]

html_to_table error 03330.jpg
html_to_table error 03550.jpg
html_to_table error 04567.jpg
html_to_table error 03581.jpg


100%|██████████| 1184/1184 [00:08<00:00, 135.00it/s]

html_to_table error 04466.jpg
html_to_table error 04877.jpg
html_to_table error 03289.jpg
html_to_table_error_cnt 196
format_table_error_cnt 0



