# 数据集可视化分析

In [27]:
from glob import glob
from tqdm import tqdm
import os
import json
import cv2
import numpy as np
from matplotlib import pyplot as plt
from collections import defaultdict

from utils.table2label import table2layout, fuse_gt_info, judge_error

# Dataset split to analyse, and the directory that holds the splits.
DATASET = "train_jpg480max"
DATASER_ROOT = "/media/ubuntu/Date12/TableStruct/new_data"

# Every JSON annotation file of the selected split, in sorted order so that
# repeated runs visit the files in the same sequence.
annotation_pattern = os.path.join(DATASER_ROOT, DATASET, "*.json")
json_paths = glob(annotation_pattern)
json_paths.sort()

# Per-cell histograms: span length -> number of cells with that span.
row_span_count = defaultdict(int)
col_span_count = defaultdict(int)
# Per-file histograms: largest span in a file -> number of files.
row_span_file_count = defaultdict(int)
col_span_file_count = defaultdict(int)

# Number of annotation files that passed validation and were counted.
valid_json_num = 0

for json_path in tqdm(json_paths):
    # Use a context manager so the file handle is closed on every iteration
    # instead of being left to the garbage collector.
    with open(json_path, 'r') as json_file:
        raw_label = json.load(json_file)
    try:
        struct_label = table2layout(raw_label)
    except Exception:
        # Two JSON files are known to be broken; skip them.
        # `Exception` (rather than a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate, so the long-running loop can still be aborted.
        continue

    struct_label = fuse_gt_info(struct_label, raw_label)
    valid, msg = judge_error(raw_label, struct_label)
    if not valid:
        continue

    cell_num = len(struct_label['cells'])
    if cell_num == 0:
        # A file without cells has no spans to report; the max() calls below
        # would raise ValueError on it, so leave it out of the statistics.
        continue

    # The layout grid does not change per cell, so convert it only once.
    layout = np.array(struct_label['layout'])

    # Span lengths seen in this file (keys only; the value is a marker).
    file_row_span = defaultdict(int)
    file_col_span = defaultdict(int)
    for cell_idx in range(cell_num):
        # Grid positions whose entry equals this cell's index.
        cell_positions = np.argwhere(layout == cell_idx)
        # Half-open [start, end) extent along axis 0 (rows) and axis 1 (cols).
        row_span = [np.min(cell_positions[:, 0]), np.max(cell_positions[:, 0]) + 1]
        col_span = [np.min(cell_positions[:, 1]), np.max(cell_positions[:, 1]) + 1]

        cell_row_span = row_span[1] - row_span[0]
        cell_col_span = col_span[1] - col_span[0]

        row_span_count[cell_row_span] += 1
        col_span_count[cell_col_span] += 1

        file_row_span[cell_row_span] = 1
        file_col_span[cell_col_span] = 1

    # Each file is counted once, under the largest span it contains, so the
    # per-file buckets partition the set of valid files.
    row_span_file_count[max(file_row_span)] += 1
    col_span_file_count[max(file_col_span)] += 1

    valid_json_num += 1


# Zero out the span-1 bucket of the per-cell histograms so that only cells
# spanning more than one row/column contribute to the percentages.
# The per-file histograms keep their span-1 bucket.
for cell_hist in (row_span_count, col_span_count):
    cell_hist[1] = 0

# Turn every histogram into a list of (span, count) pairs ordered by span.
row_span_count = sorted(row_span_count.items(), key=lambda entry: entry[0])
col_span_count = sorted(col_span_count.items(), key=lambda entry: entry[0])
row_span_file_count = sorted(row_span_file_count.items(), key=lambda entry: entry[0])
col_span_file_count = sorted(col_span_file_count.items(), key=lambda entry: entry[0])

# Total count of each histogram.
sum_row_span = sum(cnt for _, cnt in row_span_count)
sum_col_span = sum(cnt for _, cnt in col_span_count)
sum_row_span_file = sum(cnt for _, cnt in row_span_file_count)
sum_col_span_file = sum(cnt for _, cnt in col_span_file_count)

def _print_span_distribution(title, label, histogram, total):
    """Print one span histogram with per-bucket and cumulative percentages.

    Args:
        title: Text shown inside the dashed section header.
        label: Span name printed in the cumulative column.
        histogram: Iterable of (span, count) pairs, already ordered by span.
        total: Denominator used for both percentages.
    """
    print("-"*10 + title + ":" + "-"*10)
    accumulated = 0
    for span, cnt in histogram:
        accumulated += cnt
        # The total is 0 when no file was counted or no cell spans more than
        # one row/column; report 0% instead of raising ZeroDivisionError.
        cnt_percent = cnt / total if total else 0.0
        accumulated_percent = accumulated / total if total else 0.0
        print(f"{span:2d}: {cnt:3d}, {cnt_percent*100:3.2f}%,"
              f"        {label} <={span:3d}: {accumulated_percent*100:3.2f}%")


# Per-cell distributions, relative to the number of counted cells.
_print_span_distribution(
    "row_span_count_precent", "row_span", row_span_count, sum_row_span)
_print_span_distribution(
    "col_span_count_precent", "col_span", col_span_count, sum_col_span)

# Per-file distributions, relative to the number of valid annotation files.
_print_span_distribution(
    "row_span_file_count_precent", "row_span", row_span_file_count, valid_json_num)
_print_span_distribution(
    "col_span_file_count_precent", "col_span", col_span_file_count, valid_json_num)


  0%|          | 0/12104 [00:00<?, ?it/s]

100%|██████████| 12104/12104 [09:23<00:00, 21.49it/s]

----------row_span_count_precent:----------
 1:   0, 0.00%,        row_span <=  1: 0.00%
 2: 8908, 64.53%,        row_span <=  2: 64.53%
 3: 2075, 15.03%,        row_span <=  3: 79.56%
 4: 1016, 7.36%,        row_span <=  4: 86.92%
 5: 623, 4.51%,        row_span <=  5: 91.44%
 6: 385, 2.79%,        row_span <=  6: 94.23%
 7: 194, 1.41%,        row_span <=  7: 95.63%
 8: 161, 1.17%,        row_span <=  8: 96.80%
 9: 130, 0.94%,        row_span <=  9: 97.74%
10:  68, 0.49%,        row_span <= 10: 98.23%
11:  40, 0.29%,        row_span <= 11: 98.52%
12:  40, 0.29%,        row_span <= 12: 98.81%
13:  27, 0.20%,        row_span <= 13: 99.01%
14:  38, 0.28%,        row_span <= 14: 99.28%
15:  14, 0.10%,        row_span <= 15: 99.38%
16:  18, 0.13%,        row_span <= 16: 99.51%
17:  16, 0.12%,        row_span <= 17: 99.63%
18:   9, 0.07%,        row_span <= 18: 99.70%
19:   2, 0.01%,        row_span <= 19: 99.71%
20:   7, 0.05%,        row_span <= 20: 99.76%
21:   1, 0.01%,        row_span 


