In [1]:
import os
import sys
import cv2
import json
import shutil
from glob import glob
from tqdm import tqdm
from pathlib import Path

from utils.table2label import table2layout, fuse_gt_info, judge_error
from utils.table_helper import correct_table



def table2label(table_dir, label_dir, error_file_path):
    """Convert wireless-table annotation JSONs into ground-truth label files.

    Every ``*.json`` file directly inside ``table_dir`` is loaded in sorted
    filename order. Tables whose ``is_wireless`` entry is falsy are skipped.
    Each remaining table is passed through ``table2layout``, then
    ``fuse_gt_info``, then ``judge_error``. A table that passes all three
    stages is written to ``{label_dir}/{json_id}-gt.json``; a table that fails
    any stage is recorded in an error report instead.

    Args:
        table_dir: Directory containing the input ``*.json`` table files.
        label_dir: Existing directory that receives the ``*-gt.json`` outputs.
        error_file_path: Path of the JSON file that receives the
            ``{json_id: error message}`` report for all failed tables.

    Returns:
        None. Results are written to disk and the number of failed tables is
        printed at the end.

    Raises:
        KeyError: If a table JSON has no ``is_wireless`` key.
    """
    table_error = {}
    json_files = sorted(glob(os.path.join(table_dir, '*.json')))
    for json_path in tqdm(json_files, total=len(json_files)):
        json_name = os.path.basename(json_path)
        # The id is everything before the first dot of the file name.
        json_id = json_name.split('.')[0]

        # Use a context manager so the input handle is closed immediately.
        with open(json_path, 'r') as table_file:
            table = json.load(table_file)

        if not table['is_wireless']:
            continue

        # Catch Exception (not a bare except) so Ctrl-C / SystemExit still
        # abort the run instead of being logged as a per-table failure.
        try:
            gt_label = table2layout(table)
        except Exception:
            table_error[json_id] = 'table2layout error'
            continue

        # Author's note: for wired tables the bbox obtained here is still the
        # cell box, not the text box.
        try:
            gt_label = fuse_gt_info(gt_label, table)
        except Exception:
            # Author's note: wired tables with only one cell are filtered out.
            table_error[json_id] = "fuse_gt_info error"
            continue

        valid, msg = judge_error(table, gt_label)
        if not valid:
            print(json_name, msg)
            table_error[json_id] = msg
            continue

        gt_json_path = os.path.join(label_dir, f'{json_id}-gt.json')
        # Closing the handle guarantees the label is flushed to disk.
        with open(gt_json_path, 'w') as gt_file:
            json.dump(gt_label, gt_file, indent=4)

    with open(error_file_path, 'w') as error_file:
        json.dump(table_error, error_file, indent=4)

    print('table error: {}'.format(len(table_error)))

## STEP.1 gen_gt_labels

In [2]:
'''
STEP 1: generate ground-truth labels for the wireless tables of DATASET.

Input
    table annotation directory: {DATASET_ROOT}/{DATASET}
Output
    label directory: {DATASET_ROOT}/{DATASET}_wireless_gt_json/
    error report:    {DATASET_ROOT}/{DATASET}_wireless_error.json

Error counts noted by the author (the run they refer to is not recorded here):
    before using fix_gt_table: 684
    after using fix_gt_table:  665
        + removing area==0 data: 664
'''
DATASET = "train_jpg480max"
DATASET_ROOT = '/media/ubuntu/Date12/TableStruct/new_data'

TABLE_DIR = os.path.join(DATASET_ROOT, DATASET)
LABEL_DIR = os.path.join(DATASET_ROOT, f'{DATASET}_wireless_gt_json')
TABLE_ERROR_PATH = os.path.join(DATASET_ROOT, f'{DATASET}_wireless_error.json')

# Start from an empty label directory so labels from a previous run cannot
# linger. NOTE: this deletes everything currently under LABEL_DIR.
if os.path.exists(LABEL_DIR):
    shutil.rmtree(LABEL_DIR)
Path(LABEL_DIR).mkdir(parents=True)

table2label(TABLE_DIR, LABEL_DIR, TABLE_ERROR_PATH)


  0%|          | 36/12104 [00:01<08:01, 25.08it/s]

00029.json line idx:[[6], [12], [113]] not find


  1%|          | 61/12104 [00:02<09:21, 21.45it/s]

00057.json line idx:[[34]] not find


  1%|          | 93/12104 [00:03<05:51, 34.21it/s]

00084.json line idx:[[20], [61], [62], [103], [104]] not find


  1%|▏         | 176/12104 [00:05<02:58, 66.90it/s]

00169.json line idx:[[0], [2], [9], [10], [16], [17]] not find


  2%|▏         | 259/12104 [00:06<03:30, 56.21it/s]

00250.json line idx:[[6]] not find


  2%|▏         | 280/12104 [00:07<03:24, 57.74it/s]

00273.json line idx:[[2], [8]] not find
00280.json line idx:[[35]] not find


  3%|▎         | 317/12104 [00:07<02:49, 69.50it/s]

00301.json line idx:[[17], [18], [20]] not find
00303.json line idx:[[9], [17], [18]] not find
00307.json line idx:[[4], [18], [20]] not find
00314.json line idx:[[32], [35]] not find


  3%|▎         | 332/12104 [00:08<03:39, 53.60it/s]

00325.json line idx:[[19], [20]] not find


  3%|▎         | 338/12104 [00:08<04:19, 45.32it/s]

00334.json line idx:[[0], [1], [2], [3]] not find


  3%|▎         | 352/12104 [00:08<05:42, 34.31it/s]

00353.json line idx:[[13], [23], [47], [48], [49], [50], [72], [73], [74], [75]] not find


  3%|▎         | 367/12104 [00:09<09:14, 21.18it/s]

00358.json line idx:[[0], [13], [18], [23]] not find


  3%|▎         | 399/12104 [00:10<03:07, 62.42it/s]

00374.json line idx:[[18]] not find


  4%|▍         | 493/12104 [00:14<16:53, 11.46it/s] 

00489.json line idx:[[26], [27], [36], [37]] not find


  5%|▍         | 589/12104 [00:15<02:21, 81.25it/s]

00573.json line idx:[[0], [2]] not find


  5%|▍         | 598/12104 [00:15<02:39, 72.04it/s]

00600.json line idx:[[0], [1], [35], [36]] not find
00602.json line idx:[[4], [11], [12]] not find


  5%|▌         | 621/12104 [00:16<03:33, 53.87it/s]

00611.json line idx:[[3]] not find


  5%|▌         | 648/12104 [00:17<04:17, 44.44it/s]

00641.json line idx:[[13], [14], [15], [16]] not find


  6%|▌         | 678/12104 [00:17<03:46, 50.40it/s]

00668.json line idx:[[4], [6]] not find
00674.json line idx:[[34]] not find


  6%|▌         | 738/12104 [00:18<04:36, 41.16it/s]

00731.json line idx:[[0], [1], [2], [3]] not find
00741.json line idx:[[14], [33], [34], [49]] not find


  6%|▌         | 750/12104 [00:19<04:28, 42.35it/s]

00745.json line idx:[[24]] not find


  6%|▋         | 767/12104 [00:19<05:17, 35.69it/s]

00759.json line idx:[[102]] not find


  6%|▋         | 779/12104 [00:20<06:17, 30.01it/s]

00773.json line idx:[[39]] not find


  7%|▋         | 788/12104 [00:20<06:26, 29.28it/s]

00782.json line idx:[[111], [127]] not find


  7%|▋         | 907/12104 [00:22<05:18, 35.17it/s] 

00902.json line idx:[[0], [1], [2], [3], [7], [8], [9], [10], [11], [12], [13], [14]] not find


  9%|▉         | 1123/12104 [00:28<02:36, 70.32it/s]

01119.json line idx:[[5], [40]] not find


  9%|▉         | 1138/12104 [00:28<02:54, 62.70it/s]

01131.json line idx:[[11], [61]] not find
01136.json line idx:[[87]] not find
01138.json line idx:[[16], [19], [40], [41]] not find


 10%|▉         | 1202/12104 [00:29<02:49, 64.13it/s]

01203.json line idx:[[28], [31]] not find


 11%|█▏        | 1365/12104 [00:33<06:54, 25.94it/s]

01361.json line idx:[[0], [4], [5], [6]] not find


 11%|█▏        | 1383/12104 [00:34<05:50, 30.60it/s]

01378.json line idx:[[63], [64]] not find


 12%|█▏        | 1484/12104 [00:35<02:35, 68.24it/s]

01475.json line idx:[[9]] not find


 12%|█▏        | 1506/12104 [00:36<03:35, 49.15it/s]

01498.json line idx:[[0]] not find
01499.json line idx:[[37]] not find


 13%|█▎        | 1599/12104 [00:38<02:38, 66.20it/s]

01586.json line idx:[[6], [7], [8], [9], [10], [14], [15], [16], [17], [18], [21], [22], [23], [24], [25], [26], [31], [32]] not find


 14%|█▍        | 1675/12104 [00:40<04:42, 36.89it/s]

01668.json line idx:[[57], [73], [114]] not find


 15%|█▍        | 1772/12104 [00:41<02:48, 61.26it/s] 

01768.json line idx:[[84], [88]] not find


 15%|█▍        | 1800/12104 [00:43<10:09, 16.91it/s]

01797.json line idx:[[137], [198]] not find


 16%|█▌        | 1952/12104 [00:47<02:45, 61.41it/s]

01947.json line idx:[[83], [84], [85], [86], [87], [88], [89], [90], [91], [92], [93], [94], [95]] not find
01948.json line idx:[[12]] not find


 16%|█▋        | 1992/12104 [00:48<03:10, 53.07it/s]

01990.json line idx:[[0]] not find


 17%|█▋        | 2003/12104 [00:48<05:27, 30.81it/s]

01997.json line idx:[[0], [1], [2], [3]] not find


 17%|█▋        | 2071/12104 [00:50<03:17, 50.79it/s]

02065.json line idx:[[25], [26], [27], [29], [30], [31], [32], [33], [38]] not find
02077.json line idx:[[0]] not find


 18%|█▊        | 2134/12104 [00:52<04:02, 41.06it/s]

02125.json line idx:[[16], [18], [19]] not find


 18%|█▊        | 2216/12104 [00:53<04:00, 41.08it/s] 

02209.json line idx:[[2], [3]] not find


 18%|█▊        | 2229/12104 [00:53<03:56, 41.82it/s]

02223.json line idx:[[1]] not find


 19%|█▉        | 2296/12104 [00:54<02:49, 58.00it/s]

02284.json line idx:[[26]] not find


 19%|█▉        | 2316/12104 [00:55<04:36, 35.39it/s]

02315.json line idx:[[67], [68], [69], [70], [127], [128]] not find


 20%|██        | 2473/12104 [00:58<02:27, 65.37it/s]

02465.json line idx:[[101]] not find


 21%|██        | 2485/12104 [00:58<02:05, 76.87it/s]

02478.json line idx:[[16]] not find
02489.json line idx:[[6], [32]] not find


 21%|██        | 2559/12104 [00:59<01:54, 83.40it/s]

02561.json line idx:[[224], [225], [226], [227], [228]] not find
02565.json line idx:[[1], [2], [3], [5], [7], [9]] not find


 21%|██        | 2569/12104 [01:00<04:02, 39.33it/s]

02569.json line idx:[[0], [1]] not find


 21%|██▏       | 2577/12104 [01:00<06:39, 23.86it/s]

02572.json line idx:[[0], [1], [2], [3], [4], [20], [21], [22], [23], [24], [25], [26], [27], [28], [29], [30]] not find
02575.json line idx:[[6], [8]] not find


 21%|██▏       | 2589/12104 [01:01<05:22, 29.51it/s]

02584.json line idx:[[19]] not find


 22%|██▏       | 2625/12104 [01:01<03:10, 49.74it/s]

02619.json line idx:[[56], [57]] not find


 22%|██▏       | 2660/12104 [01:02<04:05, 38.39it/s]

02655.json line idx:[[41], [42]] not find


 22%|██▏       | 2669/12104 [01:03<04:54, 31.99it/s]

02663.json line idx:[[18], [19], [20], [22], [23], [24], [25]] not find


 25%|██▍       | 2976/12104 [01:07<03:25, 44.45it/s] 

02967.json line idx:[[2], [6]] not find


 25%|██▍       | 3014/12104 [01:07<03:02, 49.89it/s]

03007.json line idx:[[5], [6], [7], [8]] not find


 25%|██▌       | 3049/12104 [01:08<03:48, 39.64it/s]

03051.json line idx:[[56], [62], [68]] not find


 25%|██▌       | 3062/12104 [01:09<05:50, 25.76it/s]

03056.json layout error: cell_idx: 0, row_span: [0, 12], col_span: [0, 9]


 25%|██▌       | 3074/12104 [01:09<05:23, 27.92it/s]

03068.json line idx:[[2], [3], [4]] not find


 26%|██▌       | 3155/12104 [01:11<02:09, 69.00it/s]

03141.json line idx:[[5], [9], [10]] not find


 27%|██▋       | 3298/12104 [01:13<04:14, 34.56it/s]

03296.json line idx:[[87]] not find


 28%|██▊       | 3411/12104 [01:16<02:14, 64.83it/s]

03395.json line idx:[[24], [25], [26], [27], [28], [29], [30], [31], [32], [33], [34], [35], [36], [37], [40], [41]] not find


 31%|███       | 3768/12104 [01:21<01:59, 69.76it/s] 

03766.json line idx:[[32], [33], [34]] not find


 33%|███▎      | 3970/12104 [01:25<02:40, 50.69it/s]

03952.json line idx:[[38], [40], [41], [47], [55], [73], [166], [180], [184]] not find
03956.json line idx:[[67], [68], [69]] not find


 33%|███▎      | 4032/12104 [01:27<03:42, 36.24it/s]

04023.json line idx:[[0], [1], [2], [3], [4], [5], [6], [7]] not find
04024.json line idx:[[37]] not find


 33%|███▎      | 4042/12104 [01:27<03:48, 35.33it/s]

04037.json line idx:[[24], [48], [49]] not find
04043.json line idx:[[30], [32], [39], [40]] not find


 34%|███▎      | 4072/12104 [01:28<02:30, 53.33it/s]

04067.json line idx:[[3]] not find


 34%|███▍      | 4091/12104 [01:28<02:37, 51.02it/s]

04081.json line idx:[[14]] not find


 36%|███▋      | 4404/12104 [01:33<02:03, 62.23it/s]

04399.json line idx:[[5], [18]] not find
04400.json line idx:[[21], [24], [25], [26], [27], [28], [29], [30], [35]] not find


 37%|███▋      | 4503/12104 [01:35<02:02, 62.15it/s]

04493.json line idx:[[0]] not find
04503.json layout error: cell_idx: 5, row_span: [1, 6], col_span: [0, 5]


 38%|███▊      | 4540/12104 [01:36<03:01, 41.74it/s]

04533.json line idx:[[70], [90]] not find


 39%|███▊      | 4669/12104 [01:38<02:28, 49.94it/s]

04668.json layout error: cell_idx: 2, row_span: [0, 2], col_span: [6, 9]


 39%|███▉      | 4704/12104 [01:39<02:16, 54.30it/s]

04701.json line idx:[[9], [10]] not find
04705.json line idx:[[0], [1]] not find


 39%|███▉      | 4748/12104 [01:39<01:30, 81.35it/s]

04753.json line idx:[[14]] not find
04755.json line idx:[[51], [52], [53], [54], [55], [56], [57], [58], [59]] not find


 40%|███▉      | 4783/12104 [01:40<02:09, 56.34it/s]

04778.json line idx:[[6], [8]] not find


 40%|███▉      | 4797/12104 [01:41<02:31, 48.17it/s]

04789.json layout error: cell_idx: 9, row_span: [0, 1], col_span: [9, 12]


 40%|███▉      | 4830/12104 [01:41<03:30, 34.59it/s]

04822.json line idx:[[0]] not find
04826.json line idx:[[47]] not find
04828.json line idx:[[0], [1]] not find


 40%|███▉      | 4840/12104 [01:42<03:36, 33.55it/s]

04835.json line idx:[[12]] not find


 40%|████      | 4857/12104 [01:42<03:50, 31.49it/s]

04849.json line idx:[[185]] not find


 42%|████▏     | 5046/12104 [01:45<01:37, 72.70it/s] 

05027.json line idx:[[4], [9], [10], [11], [25], [26], [27], [28], [40], [41], [42], [43], [63], [64], [65], [66], [67], [68], [69], [72]] not find


 43%|████▎     | 5253/12104 [01:48<04:03, 28.09it/s]

05250.json line idx:[[5]] not find


 45%|████▌     | 5476/12104 [01:52<01:32, 71.51it/s]

05456.json line idx:[[23]] not find


 45%|████▌     | 5491/12104 [01:53<01:14, 89.16it/s]

05491.json line idx:[[1], [4]] not find
05494.json line idx:[[1], [2], [3], [4], [5], [6], [7], [11], [12], [16], [17], [18], [22], [23], [24], [25], [26], [27], [28], [29], [30], [31], [32], [39]] not find
05495.json line idx:[[1]] not find


 45%|████▌     | 5502/12104 [01:53<02:05, 52.64it/s]

05500.json line idx:[[1], [4], [5], [6], [7], [8], [9], [10], [44], [46], [47], [48], [49]] not find


 46%|████▌     | 5550/12104 [01:54<01:51, 58.83it/s]

05539.json line idx:[[2], [36], [37], [38], [61], [105], [106], [107], [108], [127], [128], [129]] not find
05540.json line idx:[[2], [133], [134]] not find
05547.json line idx:[[2], [3]] not find


 46%|████▌     | 5574/12104 [01:54<02:36, 41.65it/s]

05568.json line idx:[[24]] not find


 46%|████▋     | 5604/12104 [01:55<03:03, 35.48it/s]

05600.json line idx:[[45]] not find


 48%|████▊     | 5785/12104 [01:58<00:52, 120.21it/s]

05769.json line idx:[[5], [8], [10], [19], [24], [27]] not find


 50%|████▉     | 5994/12104 [02:01<02:12, 46.16it/s] 

05993.json line idx:[[1]] not find


 50%|████▉     | 6026/12104 [02:02<02:08, 47.34it/s]

06020.json line idx:[[0], [1], [6], [9], [10], [104]] not find


 50%|████▉     | 6040/12104 [02:02<01:49, 55.27it/s]

06035.json line idx:[[40], [42], [43], [44], [57]] not find


 50%|█████     | 6057/12104 [02:02<02:21, 42.62it/s]

06048.json line idx:[[0], [1], [4], [8], [13], [96], [97], [111], [112], [113], [114]] not find


 50%|█████     | 6079/12104 [02:03<03:11, 31.49it/s]

06077.json line idx:[[157]] not find
06079.json line idx:[[7], [8], [12], [15], [16], [17], [20], [23], [24], [25], [26], [27], [32], [34], [36], [37], [43], [44], [45], [46], [48], [50], [51], [103]] not find


 50%|█████     | 6109/12104 [02:04<02:10, 45.97it/s]

06098.json line idx:[[1], [7], [8], [11]] not find


 51%|█████     | 6128/12104 [02:04<01:55, 51.66it/s]

06120.json line idx:[[3], [4], [5]] not find


 51%|█████     | 6153/12104 [02:05<01:57, 50.80it/s]

06142.json line idx:[[0], [1], [2], [3], [4], [5]] not find


 51%|█████     | 6180/12104 [02:05<01:44, 56.52it/s]

06175.json line idx:[[2], [9]] not find


 51%|█████     | 6197/12104 [02:06<02:17, 42.87it/s]

06193.json line idx:[[5], [80]] not find


 52%|█████▏    | 6273/12104 [02:08<01:28, 66.02it/s]

06267.json line idx:[[4], [5]] not find


 52%|█████▏    | 6297/12104 [02:08<02:39, 36.34it/s]

06287.json line idx:[[19], [21], [23], [24], [25]] not find


 52%|█████▏    | 6321/12104 [02:09<02:27, 39.25it/s]

06316.json line idx:[[5], [6], [17]] not find


 53%|█████▎    | 6364/12104 [02:10<02:36, 36.79it/s]

06360.json line idx:[[19], [81]] not find


 53%|█████▎    | 6375/12104 [02:10<02:57, 32.37it/s]

06365.json line idx:[[46], [48], [52], [53]] not find


 54%|█████▍    | 6545/12104 [02:14<02:26, 37.96it/s]

06543.json line idx:[[6], [8]] not find


 54%|█████▍    | 6589/12104 [02:16<02:55, 31.46it/s]

06584.json line idx:[[15]] not find


100%|██████████| 12104/12104 [02:34<00:00, 78.27it/s] 

table error: 133



