In [5]:
import os
import sys
import pickle
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image
from tqdm import tqdm
from utils.teds_utils import html_to_table, format_table, format_tokens, format_layout, \
    remove_empty_bboxes, get_html

def format_table_1(table):
    """Fill unassigned (-1) layout slots with explicit empty cells.

    Scans ``table['layout']`` row by row. Every slot holding ``-1`` receives a
    fresh cell id (starting at ``layout.max() + 1``) and a placeholder 1x1
    cell with an empty transcript and an all-zero bbox/segmentation. Every
    other id contributes its entry from ``table['cells']`` the first time it
    is encountered, so ``new_cells`` is ordered by first appearance in the scan.

    The patched layout is then passed through ``format_layout`` (imported from
    ``utils.teds_utils``; presumably it renumbers ids to match that scan
    order -- TODO confirm), and the result must satisfy
    ``len(new_cells) == new_layout.max() + 1``.

    Args:
        table: dict with a 2-D ``'layout'`` array of cell ids (``-1`` marks an
            unassigned slot) and a ``'cells'`` sequence indexable by those ids.

    Returns:
        A new dict with keys ``'layout'`` and ``'cells'``.

    Raises:
        AssertionError: if the number of collected cells does not match the
            number of ids in the formatted layout.
    """
    # Work on a copy: the original wrote the replacement ids straight into
    # table['layout'], silently modifying the caller's array.
    layout = table['layout'].copy()
    next_id = layout.max() + 1  # first id that no existing cell uses
    new_cells = []
    seen_ids = set()
    for i, row in enumerate(layout):
        for j, cell_id in enumerate(row):
            if cell_id == -1:
                layout[i, j] = next_id
                next_id += 1
                # Placeholder cell covering exactly this one grid slot.
                new_cells.append(dict(
                    col_start_idx=j,
                    row_start_idx=i,
                    col_end_idx=j,
                    row_end_idx=i,
                    transcript='',
                    bbox=[0, 0, 0, 0],
                    segmentation=[[[0, 0], [0, 0], [0, 0], [0, 0]]]
                ))
            elif cell_id not in seen_ids:
                # A spanning cell occupies several slots; keep it only once.
                seen_ids.add(cell_id)
                new_cells.append(table['cells'][cell_id])

    new_layout = format_layout(layout)
    assert len(new_cells) == new_layout.max() + 1

    return dict(
        layout=new_layout,
        cells=new_cells
    )


In [6]:
import pickle
import os
import numpy as np
import json
from pathlib import Path
from glob import glob
import shutil

# Name of the dataset split whose predictions are converted in this cell.
DATASET = "test_A_jpg480max_wire"

# Directory that is globbed for the pickled structure-recognition results.
# NOTE(review): this path is relative to the parent directory ("../") while the
# output path below is relative to the current directory ("./") -- confirm
# that the difference is intended.
PRED_PKL_DIR = f"../output/structure_result/{DATASET}"

# Output path. Despite the "-pred.json" suffix this is a directory: one
# "<img_id>-pred.json" file per image is written into it.
PRED_SAVE_DIR = f"./output/structure_result/{DATASET}-pred.json"

# Always start from an empty output directory: delete the results of any
# previous run, then (re)create the directory together with its parents.
if os.path.exists(PRED_SAVE_DIR):
    shutil.rmtree(PRED_SAVE_DIR)
os.makedirs(PRED_SAVE_DIR, exist_ok=True)


def check_1(table):
    """Assert that a small table's layout has no unassigned (-1) slots.

    The check is skipped (the function returns ``None`` immediately) when
    ``layout.max() + 1`` exceeds 16; otherwise an ``AssertionError`` is raised
    if any layout entry equals ``-1``.
    """
    grid = table['layout']
    if grid.max() + 1 > 16:
        return
    assert not np.any(grid == -1)

# Convert every pickled prediction file under PRED_PKL_DIR into one JSON table
# file per image inside PRED_SAVE_DIR.
pkls = sorted(glob(os.path.join(PRED_PKL_DIR, "structure_master_results_*.pkl")))
for result_file in pkls:
    # NOTE(security): pickle.load can execute arbitrary code while loading;
    # only point PRED_PKL_DIR at result files produced by a trusted run.
    with open(result_file, 'rb') as f:
        result_data = pickle.load(f)

    for imgname, result in tqdm(result_data.items()):
        bboxes = remove_empty_bboxes(result['bbox'])
        tokens_list = format_tokens(result['text'])

        # Each stage is best-effort: report the image and move on. Catch
        # Exception (not a bare except) so that KeyboardInterrupt/SystemExit
        # still stop the run instead of being reported as a conversion error.
        try:
            html = get_html(tokens_list, bboxes)
        except Exception:
            print("get_html error", imgname)
            continue

        try:
            table = html_to_table(html)
        except Exception:
            print("html_to_table error", imgname)
            continue

        try:
            table_new = format_table(table)
        except Exception:
            print("format_table error", imgname)
            print(result['text'])
            # Fallback: replace the -1 entries in the layout, then retry.
            # A failure here is not caught and aborts the run.
            table = format_table_1(table)
            table_new = format_table(table)

        img_id = imgname.split(".")[0]
        save_path = os.path.join(PRED_SAVE_DIR, f'{img_id}-pred.json')

        # ensure_ascii=False writes non-ASCII text verbatim, so pin the file
        # encoding to UTF-8 rather than the locale default; the with-block
        # guarantees the handle is flushed and closed for every image.
        with open(save_path, 'w', encoding='utf-8') as f_out:
            json.dump(table_new, f_out, indent=4, ensure_ascii=False)



 26%|██▌       | 305/1165 [00:02<00:06, 142.26it/s]

format_table error 04941.jpg
<thead>,</thead>,<tbody>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<eb></eb>,<eb></eb>,<eb></eb>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td, colspan="4",>,</td>,<eb></eb>,<eb></eb>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td, colspan="4",>,</td>,<eb></eb>,<eb></eb>,<td></td>,</tr>,<tr>,<td, colspan="2",>,</td>,<td></td>,<td></td>,<td></td>,<td, colspan="2",>,</td>,<td,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td, colspan="2",>,</td>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,</tr>,<tr>,<td, rowspan="2",>,</td>,<td, rowspan="2",>,<

 83%|████████▎ | 971/1165 [00:08<00:02, 95.00it/s] 

format_table error 03175.jpg
<thead>,</thead>,<tbody>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<eb></eb>,<eb></eb>,<eb></eb>,

 87%|████████▋ | 1010/1165 [00:08<00:01, 114.23it/s]

format_table error 03812.jpg
<thead>,</thead>,<tbody>,<tr>,<td, colspan="3",>,</td>,<td, colspan="6",>,</td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<eb></eb>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<eb></eb>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<eb></eb>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<eb></eb>,</tr>,<tr>,<td></td>,<td></td>,<eb></eb>,<td></td>,<td></td>,<eb></eb>,<td></td>,<td></td>,<eb></eb>,</tr>,<tr>,<td></td>,<td></td>,<eb></eb>,<td></td>,<td></td>,<eb></eb>,<td></td>,<td></td>,<eb></eb>,</tr>,<tr>,<td></td>,<td></td>,<eb></eb>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<eb></eb>,</tr>,<tr>,<td></td>,<td></td>,<eb></eb>,<td></td>,<td></td>,<td></td>,<td></td>,<t

 92%|█████████▏| 1072/1165 [00:09<00:01, 89.39it/s] 

format_table error 03866.jpg
<thead>,</thead>,<tbody>,<tr>,<td, rowspan="3",>,</td>,<td, rowspan="3",>,</td>,<td, rowspan="3",>,</td>,<td, rowspan="3",>,</td>,<td, colspan="9",>,</td>,</tr>,<tr>,<td, rowspan="2",>,</td>,<td, rowspan="2",>,</td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td, rowspan="2",>,</td>,<td, rowspan="2",>,</td>,<td></td>,<td></td>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,</t

100%|██████████| 1165/1165 [00:10<00:00, 115.04it/s]
 23%|██▎       | 268/1165 [00:02<00:10, 85.82it/s] 

format_table error 04254.jpg
<thead>,</thead>,<tbody>,<tr>,<td, rowspan="2",>,</td>,<td, rowspan="2",>,</td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<eb></eb>,<td></td>,<td></td>,<td></td>,<td></td>,<eb></eb>,<eb></eb>,<eb></eb>,</tr>,<tr>,<td></td>,<td></td>,<eb></eb>,<td></td>,<td></td>,<td></td>,<td></td>,<eb></eb>,<eb></eb>,<eb></eb>,</tr>,<tr>,<td></td>,<td></td>,<eb></eb>,<td></td>,<td></td>,<td></td>,<td></td>,<eb></eb>,<eb></eb>,<eb></eb>,</tr>,<tr>,<td></td>,<td></td>,<eb></eb>,<td></td>,<td></td>,<td></td>,<td></td>,<eb></eb>,<eb></eb>,<eb></eb>,</tr>,<tr>,<td></td>,<td></td>,<eb></eb>,<td></td>,<td></td>,<td></td>,<td></td>,<eb></eb>,<eb></eb>,<eb></eb>,</tr>,<tr>,<td></td>,<td></td>,<eb></eb>,<td></td>,<td></td>,<td></td>,<td></td>,<eb></eb>,<eb></eb>,<eb></eb>,</tr>,<tr>,<td></td>,<td></td>,<eb></eb>,<td></td>,<td></td>,<td></td>,<td></td>,<eb></eb>,<eb></eb>,<eb></eb>,</tr>,<tr>,<td></td>,<td></td>,<eb></eb>,<td

 94%|█████████▍| 1095/1165 [00:09<00:00, 125.49it/s]

format_table error 04399.jpg
<thead>,</thead>,<tbody>,<tr>,<td, rowspan="2",>,</td>,<td, rowspan="2",>,</td>,<td, rowspan="2",>,</td>,<td, rowspan="2",>,</td>,<td, rowspan="2",>,</td>,<td, rowspan="2",>,</td>,<td, colspan="3",>,</td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<eb></eb>,<eb></eb>,<td></td>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<td></td>,<eb></eb>,</tr>,<tr>,<eb></eb>,<eb></eb>,<td></td>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<td></td>,<eb></eb>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<eb></eb>,</tr>,<tr>,<td,<tr>,<td,<tr>,<td, rowspan="5",>,</td>,<td, rowspan="2",>,</td>,<td, rowspan="2",>,</td>,<td, rowspan="2",>,</td>,<td, rowspan="2",>,</td>,<td, rowspan="2",>,</td>,<td, rowspan="2",>,</td>,<eb></eb>,<eb></eb>,</tr>,<tr>,</tr>,<tr>,</tr>,<tr>,<td, colspan="3",>,</td>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,</tr>,<tr>,<td, colspan="3",>,</td>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,</tr

100%|██████████| 1165/1165 [00:09<00:00, 116.74it/s]
