In [1]:
import codecs
import os
import re

from typing import List, Tuple

import pandas as pd

from keras import layers
from keras.models import Sequential

Using TensorFlow backend.


### settings

In [6]:
# Directory that holds the corpus files, and the encoding used to read them.
DATA_PATH, ENCODING = "data", "utf-8"

# File names of the train and test splits inside DATA_PATH.
TRAIN_FPATH, TEST_FPATH = "train-bel.txt", "test-bel.txt"

# Paths of both splits, relative to the notebook's working directory.
TRAIN_RELPATH = os.path.join(DATA_PATH, TRAIN_FPATH)
TEST_RELPATH = os.path.join(DATA_PATH, TEST_FPATH)

In [25]:
def read_data(fpath, encoding=ENCODING):
    # type: (str, str) -> List[str]
    """Read a text file and return its non-empty lines.

    Args:
        fpath: Path of the file to read.
        encoding: Text encoding passed to ``codecs.open``.

    Returns:
        The lines of the file with surrounding whitespace stripped; lines
        that are empty after stripping are left out.
    """
    # The context manager closes the handle even if decoding fails part-way
    # through the file (the handle used to be left open).
    with codecs.open(fpath, "r", encoding=encoding) as source:
        stripped_lines = (line.strip() for line in source)
        return [line for line in stripped_lines if line]

# Load both splits as lists of stripped, non-empty lines.
train = read_data(TRAIN_RELPATH)
test = read_data(TEST_RELPATH)

In [26]:
# Peek at one raw training line (the index is an arbitrary sample).
train[42]

'Заведзеная крымінальная справа па артыкуле «Забойства, здзейсненае агульнанебяспечным спосабам». Падазраванаму можа пагражаць турэмны тэрмін аж да пажыццёвага пазбаўлення волі.'

In [27]:
# Number of non-empty lines in the train and test splits.
len(train), len(test)

(800000, 200000)

In [31]:
def _unspace_sentence(sentence):
    # type: (str) -> str
    
    return re.sub(" ", "", sentence)


def _transform_sentence(sentence):
    # type: (str) -> str
    
    tokens = []
    for s in sentence:
        if s.isspace():
            tokens[-1] = 1
        else:
            tokens.append(0)
    
    tokens = "".join(map(str, tokens))
    unspace_sent = _unspace_sentence(sentence)
    
    assert(len(unspace_sent) == len(tokens))
    
    return unspace_sent, tokens


def transform_data(dataset):
    # type: (List[str]) -> List[Tuple[str, str]]
    """Apply ``_transform_sentence`` to every sentence of ``dataset``.

    Args:
        dataset: The sentences to transform.

    Returns:
        One ``_transform_sentence`` result per input sentence, in input
        order.
    """
    return [_transform_sentence(sentence) for sentence in dataset]

In [33]:
# Replace every raw training sentence with its transform_data result.
# NOTE: this rebinds `train`, so the cell is not idempotent - running it a
# second time without reloading the data feeds the already transformed
# items back into transform_data.
train = transform_data(train)

In [34]:
# Inspect the first item of `train` after the transformation.
train[0]

('Першаязгэтыхпраблемаўсфармуляванаятак:',
 '00000110000100000000100000000000010000')

In [15]:
# Collect the distinct characters of each raw corpus file. The whole file is
# read, so line breaks and other whitespace are part of the sets.
# Paths and encoding come from the settings cell instead of repeated
# literals, and the `with` blocks close both handles after reading.
with codecs.open(TRAIN_RELPATH, "r", encoding=ENCODING) as train_file:
    train_symbol_set = set(train_file.read())
with codecs.open(TEST_RELPATH, "r", encoding=ENCODING) as test_file:
    test_symbol_set = set(test_file.read())

In [16]:
# Number of distinct characters found in the train and test files.
len(train_symbol_set), len(test_symbol_set)

(403, 322)

In [17]:
# Sizes of the intersection and of the union of the two character sets.
len(train_symbol_set & test_symbol_set), len(train_symbol_set | test_symbol_set)

(296, 429)

In [19]:
# Characters that occur in the test file but not in the train file.
test_symbol_set - train_symbol_set

{'\x97',
 '÷',
 'ǝ',
 'ɔ',
 'ʁ',
 'ʏ',
 '̀',
 'Ѳ',
 'י',
 'כ',
 'ל',
 'מ',
 'ר',
 'ת',
 'ب',
 'ت',
 'د',
 'ر',
 'م',
 'و',
 'ي',
 '‐',
 '‒',
 '≠',
 '⬇',
 '￼'}