In [5]:
import os
import sys
import pickle
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image
from tqdm import tqdm
from utils.format_translate import html_to_table
from utils.utils import format_table, format_table_1, format_layout, format_tokens, remove_empty_bboxes, get_html
import json
from pathlib import Path
from glob import glob
import shutil


def format_layout(layout):
    """Renumber the cell ids of a 2-D layout grid to consecutive integers.

    Cells are visited in row-major order; the first time an id is met, every
    position carrying that id receives the next free number (0, 1, 2, ...).
    Positions equal to -1 are skipped and stay -1 in the result.

    Note: this notebook-local definition shadows the ``format_layout`` name
    imported from ``utils.utils`` at the top of the cell.

    Args:
        layout: 2-D numpy array of cell ids, with -1 marking unassigned slots.

    Returns:
        A new array of the same shape and dtype; the input is not modified.
    """
    relabelled = np.full_like(layout, -1)
    n_rows, n_cols = layout.shape
    next_id = 0
    for pos in np.ndindex(n_rows, n_cols):
        old_id = layout[pos]
        # Workaround kept from the original: -1 slots are tolerated and
        # skipped, so the output may still contain -1 (hence no assertion
        # that the minimum is non-negative).
        if old_id == -1 or relabelled[pos] != -1:
            continue
        relabelled[layout == old_id] = next_id
        next_id += 1
    return relabelled

def fill_first_row_minus1(table):
    """Replace -1 entries in the first row of ``table['layout']``.

    Two sweeps are made over row 0: left-to-right, then right-to-left. In
    each sweep a -1 entry is overwritten with the most recently seen
    non--1 value; the second sweep therefore fills any leading -1 run that
    the first sweep could not reach.

    Args:
        table: dict whose ``'layout'`` entry is a 2-D numpy array.

    Returns:
        The same ``table`` object; its layout array is modified in place.
    """
    grid = table['layout']
    _, n_cols = grid.shape

    forward = range(n_cols)
    backward = range(n_cols - 1, -1, -1)
    for sweep in (forward, backward):
        carried = -1  # last real cell id seen in this sweep
        for col in sweep:
            if grid[0, col] == -1 and carried != -1:
                grid[0, col] = carried
            else:
                carried = grid[0, col]

    table['layout'] = grid
    return table

def fill_last_row_minus1(table):
    """Replace -1 entries in the last row of ``table['layout']``.

    The last row is swept left-to-right and then right-to-left; during a
    sweep each -1 entry takes the most recently seen non--1 value.

    Args:
        table: dict whose ``'layout'`` entry is a 2-D numpy array.

    Returns:
        The same ``table`` object; its layout array is modified in place.
    """
    grid = table['layout']
    _, n_cols = grid.shape

    def _propagate(columns):
        # Carry the last real cell id along the bottom row.
        carried = -1
        for col in columns:
            current = grid[-1, col]
            if current == -1 and carried != -1:
                grid[-1, col] = carried
            else:
                carried = current

    _propagate(range(n_cols))
    _propagate(reversed(range(n_cols)))

    table['layout'] = grid
    return table

def fill_first_col_minus1(table):
    """Replace -1 entries in the first column of ``table['layout']``.

    Column 0 is swept top-to-bottom and then bottom-to-top; during a sweep
    each -1 entry takes the most recently seen non--1 value. The second
    sweep fills a leading run of -1 that the first sweep cannot reach.

    The previous implementation's second sweep was a copy of the last-row
    sweep (it walked ``layout[-1, :]`` right-to-left), so it never filled
    leading -1s in the first column and could alter the last row instead.

    Args:
        table: dict whose ``'layout'`` entry is a 2-D numpy array.

    Returns:
        The same ``table`` object; its layout array is modified in place.
    """
    layout = table['layout']
    layout_row, _ = layout.shape

    top_down = range(layout_row)
    bottom_up = range(layout_row - 1, -1, -1)
    for sweep in (top_down, bottom_up):
        pre_cell_idx = -1  # last real cell id seen in this sweep
        for row in sweep:
            if layout[row, 0] == -1 and pre_cell_idx != -1:
                layout[row, 0] = pre_cell_idx
            else:
                pre_cell_idx = layout[row, 0]

    table['layout'] = layout
    return table
    


In [6]:
# result_file = "./output/structure_result/test_A/structure_master_results_0.pkl"
# result_file_dir = os.path.dirname(result_file)

# Dataset tag; it only feeds the name of the output directory below.
# Alternative value previously assigned here: "test_A_jpg480max_wireless"
DATASET = "test_A_jpg480max_wire"

# Directory searched for the prediction pickles.
# Alternative values previously assigned here:
#   "../output/structure_result/test_A_jpg480max_wireless_10fold0_valid94.01"
#   "../output/structure_result/test_A_jpg480max_wireless01_10fold1"
PRED_PKL_DIR = "../output/structure_result/test_A_jpg480max_wire_10fold1_valid97.58"


# Output path. Despite the ".json" suffix this is a directory: any existing
# tree at that path is deleted and an empty directory is created in its place.
pred_save_dir = f"./output/structure_result/{DATASET}-pred.json"
if os.path.exists(pred_save_dir):
    shutil.rmtree(pred_save_dir)
os.makedirs(pred_save_dir, exist_ok=True)


def check_1(table):
    """Assert that a small layout contains no -1 entries.

    The check is skipped when ``layout.max() + 1`` exceeds 16; otherwise an
    ``AssertionError`` is raised if any entry of the layout equals -1.

    Args:
        table: dict whose ``'layout'`` entry is a numpy array of cell ids.

    Returns:
        None.
    """
    grid = table['layout']
    cell_count = grid.max() + 1
    # Layouts with more than 16 cells are exempt from the check.
    assert cell_count > 16 or np.all(grid != -1)

# Convert every prediction pickle under PRED_PKL_DIR into one JSON file per
# image inside pred_save_dir, counting the two kinds of conversion failure.
#
# NOTE(security): pickle.load can execute arbitrary code; only point
# PRED_PKL_DIR at result files you produced yourself.
pkls = sorted(glob(os.path.join(PRED_PKL_DIR, "structure_master_results_*.pkl")))
html_to_table_error_cnt = 0
format_table_error_cnt = 0
for result_file in pkls:
    with open(result_file, 'rb') as f:
        result_data = pickle.load(f)

    for imgname, result in tqdm(result_data.items()):
        # Everything before the first "." of the image name is used as the id.
        img_id = imgname.split(".")[0]

        bboxes = remove_empty_bboxes(result['bbox'])
        tokens_list = format_tokens(result['text'])

        html = get_html(tokens_list, bboxes) # use pred data -> html format

        # Catch Exception rather than using a bare except, so Ctrl-C
        # (KeyboardInterrupt) and SystemExit still stop the loop.
        try:
            table = html_to_table(html, check=False)
        except Exception:
            html_to_table_error_cnt += 1
            print("html_to_table error", imgname)
            continue

        # Optional border repairs, currently disabled:
        # fill_first_row_minus1(table)
        # fill_last_row_minus1(table)
        # fill_first_col_minus1(table)

        try:
            table_new = format_table(table)
        except Exception:
            format_table_error_cnt += 1
            print("format_table error", imgname)
            print(result['text'])
            # Fallback: per the original note, format_table_1 removes the -1
            # entries from the layout; then formatting is retried. A failure
            # of the retry is deliberately not caught.
            table = format_table_1(table)
            table_new = format_table(table)

        save_path = os.path.join(pred_save_dir, f'{img_id}-pred.json')

        # ensure_ascii=False writes non-ASCII characters verbatim, so the
        # encoding is pinned to UTF-8; the context manager closes the handle.
        with open(save_path, 'w', encoding='utf-8') as out_f:
            json.dump(table_new, out_f, indent=4, ensure_ascii=False)

print("html_to_table_error_cnt", html_to_table_error_cnt)
print("format_table_error_cnt", format_table_error_cnt)

100%|██████████| 1183/1183 [00:08<00:00, 137.28it/s]
 50%|████▉     | 590/1184 [00:04<00:04, 145.62it/s]

format_table error 03299.jpg
<thead>,</thead>,<tbody>,<tr>,<td, rowspan="2",>,</td>,<td, rowspan="2",>,</td>,<td, rowspan="2",>,</td>,<td, rowspan="2",>,</td>,<td, colspan="6",>,</td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td,<UKN>,>,</td>,<td,<UKN>,>,</td>,<td,<UKN>,>,</td>,<td,<UKN>,>,</td>,<td></td>,<td></td>,<td, colspan="2",>,</td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<eb></eb>,<eb></eb>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<eb></eb>,<eb></eb>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,<td></td>,</tr>,<tr>,<td></td>

100%|██████████| 1184/1184 [00:08<00:00, 140.81it/s]

html_to_table_error_cnt 0
format_table_error_cnt 1



