# Создание датасета

In [20]:
from bs4 import BeautifulSoup
import os
from matplotlib.image import imread
from matplotlib import pyplot as plt
import numpy as np

In [23]:
def hist_features(bits, dh, dv):
    """Histogram of normalised run lengths of set pixels, scanned row by row.

    A "run" is a maximal sequence of consecutive truthy values inside a single
    row of ``bits``. Runs never continue from the end of one row into the
    start of the next.

    Args:
        bits: 1-D or 2-D boolean (or 0/1) mask. A 1-D mask is treated as a
            single row.
        dh: Unused; kept so existing callers keep working.
        dv: Normaliser. Run lengths are divided by it before binning and the
            bin counts are divided by it again. Callers in this file pass the
            row length of ``bits``.

    Returns:
        Array of 10 floats: counts of runs whose normalised length falls into
        each of the ten equal-width bins covering [0, 1], divided by ``dv``.
        Runs whose normalised length exceeds 1 are not counted.
    """
    rows = np.atleast_2d(np.asarray(bits)).astype(int)

    # Pad every row with a zero on both sides so that each run has an explicit
    # rising and falling edge inside its own row. After flattening, rows are
    # separated by two zeros, so runs from neighbouring rows cannot merge.
    padded = np.pad(rows, pad_width=((0, 0), (1, 1)),
                    mode='constant', constant_values=0)
    difs = np.diff(padded.ravel())
    run_starts, = np.where(difs > 0)
    run_ends, = np.where(difs < 0)

    bins = np.linspace(0, 1, 11)
    lens = (run_ends - run_starts) / dv
    binned, _ = np.histogram(lens, bins)
    # NOTE(review): counts are scaled by ``dv`` while ``dh`` is never used;
    # confirm whether the intended divisor here was ``dh`` (number of rows).
    return binned / dv

In [24]:
def get_additional_features(img, x, y, w, h, features):
    """Compute derived features for one bounding box.

    Args:
        img: 2-D image array; the box is cut out as ``img[y:y+h, x:x+w]``.
        x, y: Column and row of the box's top-left corner.
        w, h: Box width and height in pixels.
        features: Dict that must contain ``'n_black'``, ``'n_h_strokes'`` and
            ``'n_v_strokes'``.

    Returns:
        Dict with the ratio features ``feat1h``, ``feat1v``, ``feat2``,
        ``feat3`` and the run-length histogram entries ``histv0..histv9`` and
        ``histh0..histh9``.

    Raises:
        ZeroDivisionError: If both stroke counts are zero (``feat3``).
    """
    n_black = features['n_black']
    n_h = features['n_h_strokes']
    n_v = features['n_v_strokes']
    non_black = w * h - n_black

    extra = {
        'feat1h': non_black / ((n_h + h) * h),
        'feat1v': non_black / ((n_v + w) * w),
        'feat2': (n_h + n_v) / max(w, h),
        'feat3': n_black / (n_v + n_h),
    }

    # Pixels with value below 30 form the mask whose runs are histogrammed.
    # NOTE(review): this presumes a 0-255 intensity scale; if the image was
    # loaded as floats in [0, 1] every pixel would pass - confirm with caller.
    mask = img[y:y + h, x:x + w] < 30
    rows_hist = hist_features(mask, h, w)
    cols_hist = hist_features(mask.T, w, h)

    # Keys are interleaved (histv0, histh0, histv1, ...) to keep column order.
    for k in range(10):
        extra[f'histv{k}'] = cols_hist[k]
        extra[f'histh{k}'] = rows_hist[k]

    return extra

In [28]:
def process_file(img_filename, xml_filename, text=True):
    """Extract one feature dict per ``WordFragment`` listed in an XML file.

    Args:
        img_filename: Path of the image the annotation refers to.
        xml_filename: Path of the XML annotation; it must contain an ``Image``
            element with ``WordFragment`` children, each having a ``Rect``.
        text: Label stored under ``'target'`` for every fragment of this file.

    Returns:
        List of dicts, one per fragment, holding the attributes read from the
        XML plus the values returned by ``get_additional_features``.
    """
    # Image reading -------------------------

    img = imread(img_filename)
    if img.ndim > 2:
        # Multi-channel image: keep only the first channel.
        img = img[:, :, 0]
    # NOTE(review): depending on the file format the loader may return floats
    # in [0, 1] instead of 0-255 integers, which matters for the fixed pixel
    # threshold applied downstream - confirm for the formats in the dataset.

    print('Image shape:', img.shape)

    # xml reading ---------------------------

    # Read raw bytes so the XML parser can honour the encoding declared in the
    # document instead of decoding with the platform default encoding.
    with open(xml_filename, 'rb') as xfile:
        soup = BeautifulSoup(xfile.read(), 'xml')
    fragments = soup.find("Image").find_all('WordFragment')

    ans = []

    for box in fragments:
        features = {
            'target': text,
            'max_h_stroke_len': int(box.get('MaxHorzStrokeLength')),
            'n_v_strokes': int(box.get('VertStrokesCount')),
            'n_h_strokes': int(box.get('HorzStrokesCount')),
            'n_black': int(box.get('BlackCount')),
            'n_holes': int(box.get('WhiteHolesCount')),
        }

        rect = box.find('Rect')
        x, y = int(rect.get('Left')), int(rect.get('Top'))
        right, bottom = int(rect.get('Right')), int(rect.get('Bottom'))
        # Right/Bottom are treated as inclusive coordinates, hence the +1.
        w, h = right - x + 1, bottom - y + 1

        features.update(get_additional_features(img, x, y, w, h, features))
        ans.append(features)

    return ans

In [30]:
from tqdm import tqdm_notebook as tqdm

In [31]:
# Parallel lists: image path, matching XML annotation path, and text label.
files = []
xmlfiles = []
mask = []

# Every non-XML file under ./Text is labelled True, under ./Nontext False.
for root_dir, is_text in (('./Text', True), ('./Nontext', False)):
    for r, d, f in os.walk(root_dir):
        for fname in f:
            stem, ext = os.path.splitext(fname)
            if ext.lower() == '.xml':
                continue
            files.append(os.path.join(r, fname))
            # Swap the real extension for .xml; slicing off three characters
            # would break for extensions such as .jpeg or .tiff.
            xmlfiles.append(os.path.join(r, stem + '.xml'))
            mask.append(is_text)

# Collect the per-fragment feature dicts of every image into one flat list.
data = []
for img_path, xml_path, is_text in tqdm(zip(files, xmlfiles, mask),
                                        total=len(files)):
    data.extend(process_file(img_path, xml_path, is_text))

HBox(children=(IntProgress(value=0, max=34), HTML(value='')))

Image shape: (1819, 1178)
Image shape: (2844, 1264)
Image shape: (3443, 3048)
Image shape: (2202, 2889)
Image shape: (2900, 2181)
Image shape: (1656, 1432)
Image shape: (2134, 681)
Image shape: (1344, 1008)
Image shape: (825, 592)
Image shape: (920, 2208)
Image shape: (629, 512)
Image shape: (935, 1824)
Image shape: (3515, 2491)
Image shape: (1300, 1261)
Image shape: (564, 564)
Image shape: (601, 2416)
Image shape: (196, 480)
Image shape: (69, 480)
Image shape: (185, 480)
Image shape: (1682, 1300)
Image shape: (215, 215)
Image shape: (170, 170)
Image shape: (1964, 2400)
Image shape: (898, 1200)
Image shape: (346, 357)
Image shape: (1024, 1024)
Image shape: (977, 2359)
Image shape: (2985, 3307)
Image shape: (1300, 1214)
Image shape: (1600, 1500)
Image shape: (1250, 2500)
Image shape: (1200, 1200)
Image shape: (944, 1487)
Image shape: (1048, 762)



In [32]:
# Number of collected feature rows (one per WordFragment over all files).
len(data)

43009

In [33]:
import pandas as pd

In [34]:
# Turn the list of per-fragment feature dicts into a table (rebinding `data`)
# and save it tab-separated, without the index column, to 'data.csv'.
data = pd.DataFrame(data)
data.to_csv('data.csv', sep='\t', index=False)