In [4]:
import os
import sys
import pickle
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image
from tqdm import tqdm
from utils.format_translate import html_to_table
from utils.utils import format_table, format_table_1, format_layout, format_tokens, remove_empty_bboxes, get_html
import json
from pathlib import Path
from glob import glob
import shutil


def format_layout(layout):
    """Renumber the cell ids of a 2-D layout array to consecutive integers.

    The array is scanned row by row, left to right. The first time a cell id
    is encountered, every position holding that id receives the next free
    number (starting at 0). Positions equal to -1 are skipped and stay -1 in
    the result.

    NOTE: this definition shadows the ``format_layout`` name imported from
    ``utils.utils`` at the top of the file.

    Args:
        layout: 2-D numpy array of cell ids; -1 marks an unassigned position.

    Returns:
        A new array with the same shape and dtype as ``layout`` holding the
        renumbered ids. The input array is not modified.
    """
    n_rows, n_cols = layout.shape
    relabeled = np.full_like(layout, -1)
    next_id = 0
    # np.ndindex yields (row, col) pairs in row-major order.
    for pos in np.ndindex(n_rows, n_cols):
        old_id = layout[pos]
        # -1 positions are left untouched (stop-gap: no better handling for now).
        if old_id == -1:
            continue
        # Already relabeled as part of a previously seen cell.
        if relabeled[pos] != -1:
            continue
        relabeled[layout == old_id] = next_id
        next_id += 1
    # The result may still contain -1, so no ``relabeled.min() >= 0`` check here.
    return relabeled

def fill_first_row_minus1(table):
    """Replace -1 entries in the first row of ``table['layout']`` with a neighbouring id.

    Two sweeps are made over row 0. The left-to-right sweep copies the most
    recently seen value into each -1 entry that follows it; the right-to-left
    sweep then fills the -1 entries that remain at the start of the row.
    A row consisting only of -1 is left unchanged.

    Args:
        table: dict whose ``'layout'`` entry is a 2-D numpy array.

    Returns:
        The same ``table`` dict; the layout array is modified in place.
    """
    layout = table['layout']
    _, n_cols = layout.shape

    # First sweep: left to right. Second sweep: right to left.
    for column_order in (range(n_cols), reversed(range(n_cols))):
        carried = -1
        for col in column_order:
            value = layout[0, col]
            if value == -1 and carried != -1:
                layout[0, col] = carried
            else:
                carried = value

    table['layout'] = layout
    return table

def fill_last_row_minus1(table):
    """Replace -1 entries in the last row of ``table['layout']`` with a neighbouring id.

    The last row is swept left-to-right, copying the most recently seen value
    into each following -1 entry, and then right-to-left so that -1 entries at
    the start of the row take the value to their right. A row consisting only
    of -1 is left unchanged.

    Args:
        table: dict whose ``'layout'`` entry is a 2-D numpy array.

    Returns:
        The same ``table`` dict; the layout array is modified in place.
    """
    layout = table['layout']
    _, n_cols = layout.shape

    left_to_right = list(range(n_cols))
    right_to_left = [-(k + 1) for k in range(n_cols)]  # -1, -2, ..., -n_cols

    for columns in (left_to_right, right_to_left):
        last_seen = -1
        for col in columns:
            current = layout[-1, col]
            if current != -1 or last_seen == -1:
                # Nothing to fill here; remember this value for later gaps.
                last_seen = current
            else:
                layout[-1, col] = last_seen

    table['layout'] = layout
    return table

def fill_first_col_minus1(table):
    """Replace -1 entries in the first column of ``table['layout']`` with a neighbouring id.

    Column 0 is swept top-to-bottom, copying the most recently seen value into
    each following -1 entry, and then bottom-to-top so that -1 entries at the
    top of the column take the value below them. A column consisting only of
    -1 is left unchanged. No other column is touched.

    The previous version ran its second sweep over the *last row* (a copy of
    the loop in ``fill_last_row_minus1``), which left leading -1 entries of
    column 0 unfilled and overwrote unrelated cells in the last row.

    Args:
        table: dict whose ``'layout'`` entry is a 2-D numpy array.

    Returns:
        The same ``table`` dict; the layout array is modified in place.
    """
    layout = table['layout']
    layout_row, _ = layout.shape

    # First sweep: top to bottom. Second sweep: bottom to top.
    for row_order in (range(layout_row), range(layout_row - 1, -1, -1)):
        pre_cell_idx = -1
        for row in row_order:
            if layout[row, 0] == -1 and pre_cell_idx != -1:
                layout[row, 0] = pre_cell_idx
            else:
                pre_cell_idx = layout[row, 0]

    table['layout'] = layout
    return table
    


In [5]:
# Earlier single-file configuration, kept for reference (disabled):
# result_file = "./output/structure_result/test_A/structure_master_results_0.pkl"
# result_file_dir = os.path.dirname(result_file)

# Dataset tag; interpolated into the output folder name below.
DATASET = "test_A_jpg480max"
# Folder that is globbed for "structure_master_results_*.pkl" prediction files.
# NOTE(review): the f prefix has no placeholder, so the dataset name is
# hard-coded here and does not follow DATASET. Input is read from "../output"
# while results are written under "./output" -- confirm this is intended.
PRED_PKL_DIR = f"../output/structure_result/test_A_jpg480max_wireless_10fold0_valid94.01"

# Output path
# NOTE(review): despite the ".json" suffix this path is treated as a
# directory: any existing tree at this path is deleted and an empty
# directory is recreated, so earlier results there are lost on every run.
pred_save_dir = f"./output/structure_result/{DATASET}-pred.json"
if os.path.exists(pred_save_dir):
    shutil.rmtree(pred_save_dir)
Path(pred_save_dir).mkdir(parents=True, exist_ok=True)


def check_1(table):
    """Assert that a small layout contains no -1 entries.

    The check only applies when ``layout.max() + 1`` is not greater than 16;
    larger layouts are accepted without inspection.

    Args:
        table: dict whose ``'layout'`` entry is a numpy array of cell ids.

    Raises:
        AssertionError: if the layout qualifies for the check and contains -1.
    """
    layout = table['layout']
    cell_count = layout.max() + 1
    if not cell_count > 16:
        assert np.all(layout != -1)

# Convert every pickled prediction file in PRED_PKL_DIR into one
# "<img_id>-pred.json" file per image inside pred_save_dir.
pkls = sorted(glob(os.path.join(PRED_PKL_DIR, "structure_master_results_*.pkl")))
html_to_table_error_cnt = 0  # images skipped because html_to_table raised
format_table_error_cnt = 0   # images that needed the format_table_1 fallback
for result_file in pkls:
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # result files that were produced by a trusted run.
    with open(result_file, 'rb') as f:
        result_data = pickle.load(f)

    for imgname, result in tqdm(result_data.items()):
        img_id = imgname.split(".")[0]

        bboxes = remove_empty_bboxes(result['bbox'])
        tokens_list = format_tokens(result['text'])

        html = get_html(tokens_list, bboxes) # use pred data -> html format

        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit can still stop the run.
        try:
            table = html_to_table(html, check=False)
        except Exception:
            html_to_table_error_cnt += 1
            print("html_to_table error", imgname)
            continue

        # Optional border pre-fill steps, currently disabled:
        # fill_first_row_minus1(table)
        # fill_last_row_minus1(table)
        # fill_first_col_minus1(table)

        try:
            table_new = format_table(table)
        except Exception:
            format_table_error_cnt += 1
            print("format_table error", imgname)
            print(result['text'])
            # Fallback (per the original note): format_table_1 removes the -1
            # entries from the layout, then formatting is retried. A failure
            # on the retry is not caught and aborts the run.
            table = format_table_1(table)
            table_new = format_table(table)

        save_path = os.path.join(pred_save_dir, f'{img_id}-pred.json')

        # ensure_ascii=False writes non-ASCII text verbatim, so pin the file
        # encoding instead of relying on the locale default; the with-block
        # guarantees the handle is flushed and closed.
        with open(save_path, 'w', encoding='utf-8') as out_file:
            json.dump(table_new, out_file, indent=4, ensure_ascii=False)

print("html_to_table_error_cnt", html_to_table_error_cnt)
print("format_table_error_cnt", format_table_error_cnt)

  4%|▍         | 110/2593 [00:00<00:04, 540.75it/s]

html_to_table error 02495.jpg
html_to_table error 01780.jpg
html_to_table error 01599.jpg


 12%|█▏        | 308/2593 [00:00<00:07, 286.82it/s]

html_to_table error 01079.jpg
html_to_table error 00778.jpg


 15%|█▍        | 388/2593 [00:01<00:06, 338.95it/s]

html_to_table error 01837.jpg
html_to_table error 01042.jpg
html_to_table error 02800.jpg
html_to_table error 00215.jpg


 23%|██▎       | 599/2593 [00:01<00:05, 346.48it/s]

html_to_table error 01976.jpg


 27%|██▋       | 705/2593 [00:02<00:06, 305.75it/s]

html_to_table error 02507.jpg
html_to_table error 01781.jpg


 30%|███       | 781/2593 [00:02<00:05, 336.17it/s]

html_to_table error 00212.jpg
html_to_table error 01481.jpg
html_to_table error 02297.jpg
html_to_table error 02744.jpg


 36%|███▌      | 924/2593 [00:02<00:05, 314.25it/s]

html_to_table error 02082.jpg
html_to_table error 00871.jpg


 39%|███▉      | 1013/2593 [00:03<00:04, 374.31it/s]

html_to_table error 01115.jpg


 45%|████▍     | 1163/2593 [00:03<00:04, 327.69it/s]

html_to_table error 02765.jpg
html_to_table error 00597.jpg


 50%|█████     | 1302/2593 [00:03<00:03, 327.80it/s]

html_to_table error 01266.jpg
html_to_table error 02008.jpg
html_to_table error 02404.jpg


 56%|█████▌    | 1446/2593 [00:04<00:03, 290.23it/s]

html_to_table error 02407.jpg


 59%|█████▉    | 1536/2593 [00:04<00:04, 262.66it/s]

html_to_table error 00991.jpg
html_to_table error 02607.jpg


 70%|███████   | 1827/2593 [00:05<00:02, 288.54it/s]

html_to_table error 01100.jpg
html_to_table error 02106.jpg


 74%|███████▍  | 1919/2593 [00:06<00:02, 272.90it/s]

html_to_table error 00389.jpg


 76%|███████▌  | 1975/2593 [00:06<00:02, 253.62it/s]

html_to_table error 00207.jpg
html_to_table error 00660.jpg
html_to_table error 02307.jpg


 83%|████████▎ | 2146/2593 [00:06<00:01, 312.31it/s]

html_to_table error 02092.jpg


 86%|████████▌ | 2234/2593 [00:07<00:00, 378.13it/s]

html_to_table error 01305.jpg
html_to_table error 00602.jpg


 88%|████████▊ | 2272/2593 [00:07<00:01, 318.69it/s]

html_to_table error 02708.jpg
html_to_table error 02406.jpg


 91%|█████████▏| 2370/2593 [00:07<00:00, 240.06it/s]

html_to_table error 01514.jpg
html_to_table error 00112.jpg


 96%|█████████▌| 2483/2593 [00:08<00:00, 262.01it/s]

html_to_table error 01556.jpg
html_to_table error 01663.jpg
html_to_table error 01653.jpg
html_to_table error 01466.jpg
html_to_table error 00311.jpg


100%|██████████| 2593/2593 [00:08<00:00, 302.12it/s]
  3%|▎         | 83/2594 [00:00<00:05, 422.59it/s]

html_to_table error 01157.jpg
html_to_table error 00592.jpg
html_to_table error 01304.jpg


  8%|▊         | 217/2594 [00:00<00:09, 250.06it/s]

html_to_table error 01654.jpg


 11%|█         | 282/2594 [00:01<00:08, 277.77it/s]

html_to_table error 01646.jpg
html_to_table error 00591.jpg


 13%|█▎        | 350/2594 [00:01<00:08, 272.28it/s]

html_to_table error 01302.jpg


 18%|█▊        | 457/2594 [00:01<00:09, 235.26it/s]

html_to_table error 00533.jpg


 21%|██        | 538/2594 [00:02<00:08, 238.11it/s]

html_to_table error 00531.jpg


 25%|██▍       | 640/2594 [00:02<00:06, 287.28it/s]

html_to_table error 01102.jpg
html_to_table error 01662.jpg


 27%|██▋       | 707/2594 [00:02<00:06, 271.38it/s]

html_to_table error 00400.jpg


 30%|███       | 785/2594 [00:02<00:05, 317.59it/s]

html_to_table error 01180.jpg
html_to_table error 02608.jpg
html_to_table error 00435.jpg


 35%|███▌      | 919/2594 [00:03<00:06, 275.80it/s]

html_to_table error 02177.jpg
html_to_table error 00603.jpg


 39%|███▉      | 1007/2594 [00:03<00:04, 332.92it/s]

html_to_table error 00411.jpg


 41%|████▏     | 1074/2594 [00:03<00:04, 308.51it/s]

html_to_table error 00262.jpg
html_to_table error 02081.jpg


 48%|████▊     | 1239/2594 [00:04<00:04, 281.87it/s]

html_to_table error 02086.jpg
html_to_table error 00025.jpg


 50%|████▉     | 1296/2594 [00:04<00:05, 258.54it/s]

html_to_table error 00208.jpg


 53%|█████▎    | 1369/2594 [00:04<00:03, 310.36it/s]

html_to_table error 00028.jpg
html_to_table error 00862.jpg


 55%|█████▌    | 1429/2594 [00:05<00:04, 266.13it/s]

html_to_table error 00221.jpg
html_to_table error 01106.jpg


 60%|█████▉    | 1556/2594 [00:05<00:03, 286.09it/s]

html_to_table error 00582.jpg


 63%|██████▎   | 1624/2594 [00:05<00:03, 266.33it/s]

html_to_table error 02137.jpg
html_to_table error 01623.jpg
html_to_table error 02321.jpg
html_to_table error 01992.jpg


 66%|██████▌   | 1702/2594 [00:06<00:02, 314.28it/s]

html_to_table error 01939.jpg
html_to_table error 02794.jpg
html_to_table error 01158.jpg


 69%|██████▊   | 1781/2594 [00:06<00:02, 323.29it/s]

html_to_table error 00150.jpg
html_to_table error 00022.jpg
html_to_table error 00783.jpg


 71%|███████▏  | 1849/2594 [00:06<00:02, 262.11it/s]

html_to_table error 00189.jpg


 81%|████████  | 2105/2594 [00:07<00:01, 278.04it/s]

html_to_table error 00605.jpg
html_to_table error 01012.jpg


 82%|████████▏ | 2134/2594 [00:07<00:01, 272.57it/s]

html_to_table error 00598.jpg
html_to_table error 02327.jpg


 84%|████████▍ | 2191/2594 [00:07<00:01, 247.30it/s]

html_to_table error 00649.jpg
html_to_table error 00179.jpg


 87%|████████▋ | 2259/2594 [00:08<00:01, 276.81it/s]

html_to_table error 02085.jpg


 92%|█████████▏| 2378/2594 [00:08<00:00, 264.12it/s]

html_to_table error 00614.jpg
html_to_table error 00611.jpg


 95%|█████████▍| 2464/2594 [00:08<00:00, 344.60it/s]

html_to_table error 01220.jpg
html_to_table error 01402.jpg


100%|█████████▉| 2587/2594 [00:09<00:00, 368.15it/s]

html_to_table error 01169.jpg
html_to_table error 01956.jpg


100%|██████████| 2594/2594 [00:09<00:00, 283.41it/s]

html_to_table_error_cnt 97
format_table_error_cnt 0



