In [1]:
import os
import sys
import pickle
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image
from tqdm import tqdm
from utils.teds_utils import html_to_table, format_table, format_tokens, format_layout, \
    remove_empty_bboxes, get_html

def format_table_1(table):
    """Fill every ``-1`` slot of a table layout with a new empty cell.

    Fallback used when ``format_table`` rejects a table whose layout still
    contains ``-1`` entries. Each such slot receives a fresh id (starting at
    ``layout.max() + 1``) and a 1x1 placeholder cell with an empty transcript
    and an all-zero bbox/segmentation. Existing cells are kept once each, in
    the order their id is first met while scanning the layout row by row.

    Args:
        table: dict with a ``'layout'`` entry (2-D numpy integer array of cell
            ids, ``-1`` marking an unassigned slot) and a ``'cells'`` entry
            (sequence indexable by cell id).

    Returns:
        A new dict with ``layout`` (the filled layout passed through
        ``format_layout``) and ``cells`` (the re-ordered cell list).

    Raises:
        AssertionError: if the number of collected cells does not equal
            ``new_layout.max() + 1``.
    """
    # Work on a copy: the original wrote the new ids straight into
    # table['layout'], silently modifying the caller's array.
    layout = table['layout'].copy()
    next_id = layout.max() + 1
    new_cells = []
    seen_ids = set()
    for i, row in enumerate(layout):
        for j, cell_id in enumerate(row):
            if cell_id == -1:
                # Unassigned slot: give it its own id and a placeholder cell.
                layout[i, j] = next_id
                next_id += 1
                empty_cell = dict(
                    col_start_idx=j,
                    row_start_idx=i,
                    col_end_idx=j,
                    row_end_idx=i,
                    transcript='',
                    bbox=[0, 0, 0, 0],
                    segmentation=[[[0, 0], [0, 0], [0, 0], [0, 0]]]
                )
                new_cells.append(empty_cell)
            elif cell_id not in seen_ids:
                # A spanning cell shows up in several slots; keep it only once.
                seen_ids.add(cell_id)
                new_cells.append(table['cells'][cell_id])

    # NOTE(review): format_layout presumably renumbers ids in scan order so
    # they line up with new_cells -- confirm in utils.teds_utils.
    new_layout = format_layout(layout)
    assert len(new_cells) == new_layout.max() + 1, \
        "cell count does not match the ids present in the formatted layout"

    return dict(
        layout=new_layout,
        cells=new_cells
    )


In [2]:
import pickle
import os
import numpy as np
import json
from pathlib import Path
from glob import glob
import shutil

# result_file = "./output/structure_result/test_A/structure_master_results_0.pkl"
# result_file_dir = os.path.dirname(result_file)

# Dataset split whose predictions are converted by this notebook.
DATASET = "test_A_jpg480max"
JPG_JSON_PATH = f"/media/ubuntu/Date12/TableStruct/new_data/{DATASET}.json"
# Load through a context manager so the file handle is closed right away
# (the original `json.load(open(...))` left it open until garbage collection).
with open(JPG_JSON_PATH, 'r', encoding='utf-8') as jpg_json_file:
    JPG_INFO = json.load(jpg_json_file)
# Directory holding the pickled structure predictions.
PRED_PKL_DIR = f"../output/structure_result/{DATASET}"

# Output path. Despite the "-pred.json" suffix this is a DIRECTORY: one JSON
# file per image is written into it.
PRED_SAVE_DIR = f"./output/structure_result/{DATASET}-pred.json"
# Start from a clean directory: results of any previous run are deleted.
if os.path.exists(PRED_SAVE_DIR):
    shutil.rmtree(PRED_SAVE_DIR)
Path(PRED_SAVE_DIR).mkdir(parents=True, exist_ok=True)


def check_1(table):
    """Assert that a small table layout has no ``-1`` entries.

    Layouts whose largest id plus one exceeds 16 are not checked at all.

    Args:
        table: dict whose ``'layout'`` entry is a numpy array of cell ids.

    Returns:
        None.

    Raises:
        AssertionError: if a checked layout contains ``-1``.
    """
    grid = table['layout']
    if grid.max() + 1 > 16:
        return None
    has_unassigned = (grid == -1).any()
    assert not has_unassigned

# Convert every pickled structure prediction into one "<img_id>-pred.json"
# file inside PRED_SAVE_DIR.
pkls = sorted(glob(os.path.join(PRED_PKL_DIR, "structure_master_results_*.pkl")))
for result_file in pkls:
    # SECURITY: pickle.load can execute arbitrary code -- only point this at
    # result files produced by a trusted run.
    with open(result_file, 'rb') as f:
        result_data = pickle.load(f)

    for imgname, result in tqdm(result_data.items()):
        # Samples previously singled out while debugging: 02859.jpg, 04485.jpg,
        # 03669.jpg, 00022.jpg, 03187.jpg, 04219.jpg.
        bboxes = remove_empty_bboxes(result['bbox'])
        # Do NOT strip the "<eb></eb>" tokens: they occupy bbox slots!!
        # result['text'] = result['text'].replace("<eb></eb>, rowspan", "<td, rowspan")
        # result['text'] = result['text'].replace("<eb></eb>, colspan", "<td, colspan")
        tokens_list = format_tokens(result['text'])

        # `except Exception` (not a bare `except:`) so that KeyboardInterrupt /
        # SystemExit still stop the run instead of being reported as a
        # conversion error.
        try:
            html = get_html(tokens_list, bboxes)
        except Exception:
            print("get_html error", imgname)
            continue

        try:
            table = html_to_table(html)
        except Exception:
            print("html_to_table error", imgname)
            # print(html)
            continue

        try:
            table_new = format_table(table)
        except Exception:
            print("format_table error", imgname)
            print(result['text'])
            # Remove the -1 entries from the layout, then retry. A failure of
            # the retry is deliberately left to propagate.
            table = format_table_1(table)
            table_new = format_table(table)

        img_id = imgname.split(".")[0]
        save_path = os.path.join(PRED_SAVE_DIR, f'{img_id}-pred.json')

        # Context manager closes (and flushes) each output file; explicit UTF-8
        # because ensure_ascii=False writes non-ASCII characters verbatim.
        with open(save_path, 'w', encoding='utf-8') as out_file:
            json.dump(table_new, out_file, indent=4, ensure_ascii=False)



 50%|████▉     | 1295/2593 [00:09<00:07, 170.40it/s]

format_table error 04882.jpg
<thead>,</thead>,<tbody>,<tr>,<td, rowspan="3",>,</td>,<td></td>,<td, colspan="2",>,</td>,<td, colspan="2",>,</td>,<td, colspan="2",>,</td>,<td, colspan="3",>,</td>,</tr>,<tr>,<td></td>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>, colspan="3",>,</td>,</tr>,<tr>,<td></td>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>,<eb></eb>, colspan="3",>,</td>,</tr>,<tr>,<td, rowspan="4",>,</td>,<td, rowspan="2",>,</td>,<td, rowspan="2", colspan="4",>,</td>,<td, rowspan="2", colspan="2",>,</td>,<td, rowspan="2",>,</td>,<td, colspan="2",>,</td>,<td, rowspan="2",>,</td>,</tr>,<tr>,<td, colspan="4",>,</td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td, colspan="4",>,</td>,<td, colspan="3",>,</td>,<td></td>,</tr>,<tr>,<td></td>,<td, colspan="8",>,</td>,<td></td>,</tr>,<tr>,<td,</thead>,<tbody>,<tr>,<td, rowspan="3",>,</td>,<td, rowspan="2",>,</td>,<td, colspan="4",>,</td>,<td, colspan="6",>,</td>,<td, rowspan="2",>,</td>,<td, rowspan="2

 98%|█████████▊| 2548/2593 [00:18<00:00, 131.74it/s]

format_table error 03669.jpg
<thead>,</thead>,<tbody>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<eb></eb>, rowspan="3",>,</td>,<td, rowspan="3",>,</td>,<td, rowspan="3",>,</td>,<td, rowspan="3",>,</td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,</tr>,<tr>,<td, rowspan="6",>,</td>,<td


100%|██████████| 2593/2593 [00:18<00:00, 138.39it/s]
 15%|█▌        | 394/2594 [00:03<00:21, 104.18it/s]

format_table error 01657.jpg
<thead>,</thead>,<tbody>,<tr>,<td, colspan="6",>,</td>,</tr>,<tr>,<td, colspan="6",>,</td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<eb></eb>,<eb></eb>,<eb></eb>,<td></td>,<eb></eb>,<eb></eb>,</tr>,<tr>,<eb></eb>,<td></td>,<td></td>,<td></td>,<eb></eb>,<eb></eb>,</tr>,<tr>,<eb></eb>,<td></td>,<td></td>,<td></td>,<eb></eb>,<eb></eb>,</tr>,<tr>,<eb></eb>,<td></td>,<td></td>,<td></td>,<eb></eb>,<eb></eb>,</tr>,<tr>,<eb></eb>,<td></td>,<td></td>,<td></td>,<eb></eb>,<eb></eb>,</tr>,<tr>,<eb></eb>,<td></td>,<td></td>,<td></td>,<eb></eb>,<eb></eb>,</tr>,<tr>,<eb></eb>,<td></td>,<td></td>,<td></td>,<eb></eb>,<eb></eb>,</tr>,<tr>,<eb></eb>,<td></td>,<td></td>,<td></td>,<eb></eb>,<eb></eb>,</tr>,<tr>,<eb></eb>,<td></td>,<td></td>,<td></td>,<eb></eb>,<eb></eb>,</tr>,<tr>,<eb></eb>,<td></td>,<td></td>,<td></td>,<eb></eb>,<eb></eb>,</tr>,<tr>,<eb></eb>,<td></td>,<td></td>,<td></td>,<eb></eb>,<eb></eb>,</tr>,<tr>,<eb></eb>,<td></t

100%|██████████| 2594/2594 [00:19<00:00, 136.53it/s]

format_table error 03289.jpg
<thead>,</thead>,<tbody>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td, rowspan="5",>,</td>,<td></td>,<td></td>,<td, rowspan="2",>,</td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td, rowspan="2",>,</td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td, rowspan="2",>,</td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td, rowspan="3",>,</td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td, rowspan="2",>,</td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td, rowspan="4",>,</td>,<td></td>,<td></td>,<td, rowspan="2",>,</td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td, rowspan="2",>,</td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td, rowspan=


