In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import os
import json
import pickle
import torch
import numpy as np
import re
from tqdm.notebook import tqdm
from sklearn.utils import shuffle
from transformers import AutoTokenizer
import warnings
warnings.filterwarnings("ignore")

In [2]:
model_type = 'SZTAKI-HLT/hubert-base-cc' # bert-base-multilingual-cased


data_path = f"D:/Data/neural-punctuator/"
os.makedirs(data_path + model_type, exist_ok=True)
file_path = data_path + "szeged.txt"

with open(file_path, 'r', encoding='utf-8') as f:
    text = f.readlines()
    
len(text), text[1]

(82100,
 ' A szállásunk egy Balaton melletti kis üdülőfaluban, Zamárdiban volt, a Postának az üdülőházában.\n')

In [3]:
from collections import OrderedDict

text = list(OrderedDict.fromkeys(text))
len(text)

80875

In [4]:
from sklearn.utils import shuffle

text = shuffle(text, random_state=0)

text[1]

' Ez jutott az eszébe önkéntelenül.\n'

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_type)

In [6]:
def clean_text(text):
    text = text.replace('!', '.')
    text = text.replace(':', ',')
    text = text.replace('--', ',')
    text = text.replace('-', ',')
    text = text.replace(';', '.')
    text = text.replace(' ,', ',')
    text = text.replace('♫', '')

    text = re.sub(r'--\s?--', '', text)
    text = re.sub(r'\s+', ' ', text)

    text = re.sub(r'\s+', ' ', text)
    
    text = re.sub(r',\s?,', ', ', text)
    text = re.sub(r'\.[\s+.]+', '. ', text)
    
    text = re.sub(r',\s?,', ',', text)
    
    remove_space_before = [',', '?', '.', '!', '\n']
    for c in remove_space_before:
        text = text.replace(' ' + c, c)
    
    text = re.sub(r',\s+[\.\s]+', ', ', text)
    text = re.sub(r',\s+[,\s]+', ', ', text)
    text = re.sub(r'\.\s+[,\s]+', '. ', text)
    text = text.lstrip('.,?')
    
    return text.strip().lower()

In [7]:
text = [clean_text(t) for t in text]
text = [t for t in text if len(t) > 0]
text[1]

'ez jutott az eszébe önkéntelenül.'

In [8]:
text[29545]

'nem lennék bunkó,., begyöpösödött agyú, naiv ember.'

In [9]:
for i, t in enumerate(text):
    if 'állományok visszaállításához' in t:
        print(i, t)

29544 újabb párbeszéddoboz jelenik meg, amelyben az ok parancsgombra kell kattintani az állományok visszaállításához.


In [10]:
tokenizer.encode(".?,")

[2, 4575, 8308, 3576, 3]

In [11]:
target_token2id = {t: tokenizer.encode(t)[-2] for t in ".?,"}
target_token2id

{'.': 4575, '?': 8308, ',': 3576}

In [12]:
target_ids = list(target_token2id.values())
target_ids

[4575, 8308, 3576]

In [13]:
id2target = {
    0: 0,
    -1: -1,
}
for i, ti in enumerate(target_ids):
    id2target[ti] = i+1
target2id = {value: key for key, value in id2target.items()}

def create_target(text):
    encoded_words, targets = [], []
    
    words = text.split(' ')

    for word in tqdm(words):
        target = 0
        for target_token, target_id in target_token2id.items():
            if word.endswith(target_token):
                word = word.rstrip(target_token)
                target = id2target[target_id]

        encoded_word = tokenizer.encode(word, add_special_tokens=False)
        
        for w in encoded_word:
            encoded_words.append(w)
        for _ in range(len(encoded_word)-1):
            targets.append(-1)
        targets.append(target)
#         print([tokenizer._convert_id_to_token(ew) for ew in encoded_word], target)
        assert(len(encoded_word)>0)

    return encoded_words, targets

In [14]:
train_n = 70_000
valid_n = 8_000

train_text = ' '.join(text[:train_n])
valid_text = ' '.join(text[train_n:train_n+valid_n])
test_text = ' '.join(text[train_n+valid_n:])

len(train_text.split(' ')), len(valid_text.split(' ')), len(test_text.split(' '))

(1076033, 123448, 44033)

In [15]:
len(text) - train_n - valid_n

2873

In [16]:
train_text.split(' ')[455735-5:455735+5]

['is',
 'történtek',
 'a',
 'ferencvárosban.',
 'megfürödtünk',
 'és',
 'megszárítottuk',
 'a',
 'ruháinkat.',
 'idén']

In [17]:
train_tokens, train_targets = create_target(train_text)
valid_tokens, valid_targets = create_target(valid_text)
test_tokens, test_targets = create_target(test_text)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1076033.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=123448.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=44033.0), HTML(value='')))




In [18]:
# For backward campatibility
train_tokens, train_targets = [train_tokens], [train_targets]
valid_tokens, valid_targets = [valid_tokens], [valid_targets]
test_tokens, test_targets = [test_tokens], [test_targets]

In [19]:
os.makedirs(data_path + model_type, exist_ok=True)
with open(data_path + f'{model_type}/train_data.pkl', 'wb') as f:
    pickle.dump((train_tokens, train_targets), f)
with open(data_path + f'{model_type}/valid_data.pkl', 'wb') as f:
    pickle.dump((valid_tokens, valid_targets), f)
with open(data_path + f'{model_type}/test_data.pkl', 'wb') as f:
    pickle.dump((test_tokens, test_targets), f)

In [20]:
from collections import Counter

for ds in (train_targets, valid_targets, test_targets):
    c = Counter((t for targets in ds for t in targets))
    print('\t'.join([str(c[i]) for i in (1,2,3,0,-1)]))

82375	1849	104632	887177	492591
9355	201	11991	101901	56059
3438	76	4232	36287	20047


In [21]:
for ds in (train_targets, valid_targets, test_targets):
    c = Counter((t for targets in ds for t in targets))
    print('\t'.join([str(c[i]) for i in (1,2,3,0,-1)]))

82375	1849	104632	887177	492591
9355	201	11991	101901	56059
3438	76	4232	36287	20047
