In [1]:
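# Collect the character (grapheme) inventory and its frequency statistics.
# Split utterances into train/val by (series, page) block, so all lines of a page stay in the same subset.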

In [2]:
from pathlib import Path
import pickle
import numpy as np
from collections import Counter
from collections import defaultdict
import random

In [3]:
# Seed both the stdlib and the NumPy global generators with the same value.
SEED = 678
random.seed(SEED)
np.random.seed(SEED)

In [4]:
# Root of the training data; the transcripts are read from its "words" subdirectory.
base_dir = Path("data") / "train"

In [5]:
# Collect utterance ids (file stems) and per-character frequencies from the
# transcripts in `base_dir / "words"`.
#
# Files are visited in sorted-stem order rather than raw glob order: directory
# listing order is filesystem-dependent, and a Counter remembers first-seen
# order, which decides how `most_common()` orders equal counts and how the
# counter is pickled. Sorting first makes those outputs reproducible.
transcript_paths = sorted((base_dir / "words").glob("*.txt"), key=lambda p: p.stem)
uttids = [path.stem for path in transcript_paths]  # already sorted by stem

char_counter = Counter()
for path in transcript_paths:
    # Leading/trailing whitespace (e.g. the final newline) is not counted.
    char_counter.update(path.read_text(encoding="utf-8").strip())

In [6]:
# Bucket line numbers by the (series, page) pair they belong to. Each id is
# split on "_" into exactly three integer fields: series, page, line.
uttids_blocks = defaultdict(list)
for utt in uttids:
    series_no, page_no, line_no = (int(part) for part in utt.split("_"))
    uttids_blocks[series_no, page_no].append(line_no)

In [7]:
# Number of distinct (series, page) blocks found among the utterance ids.
len(uttids_blocks)

677

In [8]:
# Materialise the (series, page) block keys as a list so they can be shuffled and sliced.
keys = list(uttids_blocks)

In [9]:
# Keep 14/15 of the blocks (rounded down) for training; the remainder is validation.
num_train_keys = (14 * len(keys)) // 15

In [10]:
# Shuffle the (series, page) blocks with a dedicated, explicitly seeded
# generator. The list is rebuilt from `uttids_blocks` first, so re-running this
# cell always produces the same order instead of reshuffling an already
# shuffled list with whatever state the global NumPy RNG happens to be in.
# The seed matches the global np.random.seed(678) call, so a fresh
# top-to-bottom run that draws no other NumPy random numbers beforehand should
# give the same permutation as the previous global-RNG shuffle.
keys = list(uttids_blocks.keys())
np.random.RandomState(678).shuffle(keys)

In [13]:
# The first `num_train_keys` shuffled blocks go to training, the rest to validation.
train_keys, val_keys = keys[:num_train_keys], keys[num_train_keys:]

In [15]:
# Expand the selected (series, page) blocks back into utterance ids.
#
# The ids are taken from the original strings in `uttids` instead of being
# re-formatted from the parsed integers: int() drops zero padding, so
# f"{series}_{page}_{line}" would no longer match the file stem for an id such
# as "01_002_03". For unpadded ids the result is identical.
uttids_by_block = defaultdict(list)
for uttid in uttids:
    series, page, line = map(int, uttid.split("_"))
    # Store the numeric line next to the id so blocks can be ordered by line number.
    uttids_by_block[(series, page)].append((line, uttid))


def collect_uttids(block_keys):
    """Return the utterance ids of every block in `block_keys`.

    Blocks are emitted in the order given; within a block the ids are
    ordered by their numeric line number.
    """
    return [
        block_uttid
        for key in block_keys
        for _, block_uttid in sorted(uttids_by_block[key])
    ]


train_uttids = collect_uttids(train_keys)
val_uttids = collect_uttids(val_keys)

In [16]:
# Sizes of the two subsets and their total.
n_train, n_val = len(train_uttids), len(val_uttids)
(n_train, n_val, n_train + n_val)

(5754, 442, 6196)

In [18]:
# Persist each subset as a pickled set of utterance ids.
for pkl_path, subset_ids in (
    ("data/train_uttids_set.pkl", train_uttids),
    ("data/val_uttids_set.pkl", val_uttids),
):
    with open(pkl_path, "wb") as f:
        pickle.dump(set(subset_ids), f)

In [19]:
# Write each subset as a plain-text list: one id per line, sorted as strings.
for list_path, subset_ids in (
    ("data/train_uttids.txt", train_uttids),
    ("data/val_uttids.txt", val_uttids),
):
    with open(list_path, "w", encoding="utf-8") as f:
        f.write("\n".join(sorted(subset_ids)) + "\n")

In [20]:
# Save the character inventory three ways: sorted text list, pickled set,
# and the full pickled Counter with frequencies.
chars = set(char_counter.keys())
with open("data/chars.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(sorted(chars)) + "\n")
with open("data/chars_set.pkl", "wb") as f:
    pickle.dump(chars, f)
with open("data/chars_counter.pkl", "wb") as f:
    pickle.dump(char_counter, f)

In [21]:
# Tab-separated "<char>\t<count>" lines, most frequent character first.
with open("data/chars_stats.txt", "w", encoding="utf-8") as f:
    f.writelines(
        f"{char}\t{cnt}\n" for char, cnt in char_counter.most_common()
    )