In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import os
import json
import pickle
from transformers import BertTokenizer
import torch
import numpy as np
import re

import warnings
warnings.filterwarnings("ignore")

from tqdm.notebook import tqdm

In [16]:
# Directory holding the Szeged corpus and the pickles written further down.
# It can be overridden with the NEURAL_PUNCTUATOR_DATA environment variable so
# the notebook also runs on machines that do not have this exact local path.
# os.path.join(..., "") guarantees a trailing separator, because the notebook
# builds file names by plain string concatenation with data_path.
data_path = os.path.join(
    os.environ.get("NEURAL_PUNCTUATOR_DATA", "D:/Data/neural-punctuator/szeged/"),
    "",
)
file_path = data_path + "szeged.txt"

In [17]:
# Load the raw corpus; every list element is one line of the file.
with open(file_path, mode='r', encoding='utf-8') as corpus_file:
    text = list(corpus_file)

In [18]:
# Sanity check: number of lines read and one sample line.
len(text), text[1]

(82100,
 ' A szállásunk egy Balaton melletti kis üdülőfaluban, Zamárdiban volt, a Postának az üdülőházában.\n')

In [19]:
from collections import OrderedDict

# Remove duplicate lines, keeping the first occurrence of each in file order.
unique_lines = OrderedDict.fromkeys(text)
text = list(unique_lines)
len(text)

80875

In [20]:
from sklearn.utils import shuffle

# Fixed seed so the shuffled order is the same on every run.
shuffle_seed = 0
text = shuffle(text, random_state=shuffle_seed)

text[1]

' Ez jutott az eszébe önkéntelenül.\n'

In [21]:
# Trim leading/trailing whitespace (including the newline) from every line.
text = list(map(str.strip, text))
text[1]

'Ez jutott az eszébe önkéntelenül.'

In [22]:
# Space-delimited word count of the whole corpus before cleaning.
stripped_corpus = ' '.join(line.strip() for line in text)
len(stripped_corpus.split(' '))

1265372

In [23]:
# tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
# Load the tokenizer through torch.hub instead of the direct import above.
hub_repo = 'huggingface/pytorch-transformers'
pretrained_name = 'bert-base-multilingual-uncased'
tokenizer = torch.hub.load(hub_repo, 'tokenizer', pretrained_name)

Using cache found in C:\Users\gbenc/.cache\torch\hub\huggingface_pytorch-transformers_master


In [24]:
def clean_text(text):
    """Map the punctuation of one line onto the marks '.', '?' and ','.

    '!' and ';' become '.', ':' and dashes become ',', the '♫' symbol is
    dropped, runs of whitespace collapse to a single space, whitespace in
    front of a comma is removed and doubled commas are merged.

    Args:
        text: The raw line.

    Returns:
        The cleaned line with surrounding whitespace stripped.
    """
    # Remove doubled dashes ("-- --") first. This substitution used to run
    # after every '-' had already been turned into ',', so it never matched.
    text = re.sub(r'--\s?--', '', text)

    text = text.replace('!', '.')
    text = text.replace(':', ',')
    text = text.replace('--', ',')
    # NOTE(review): this also rewrites hyphens inside words
    # ("Nagy-Britanniának" -> "Nagy,Britanniának"), which produces a comma
    # that is not punctuation in the source text - confirm this is intended.
    text = text.replace('-', ',')
    text = text.replace(';', '.')
    text = text.replace(' ,', ',')
    text = text.replace('♫', '')

    text = re.sub(r'\s+', ' ', text)

    # Collapsing whitespace can leave a new " ," behind, so repeat the fix.
    text = text.replace(' ,', ',')

    # Merge commas that became adjacent after the replacements above.
    text = re.sub(r',\s?,', ',', text)

    return text.strip()

In [25]:
# cleaned_text = [clean_text(t) for t in text]

# for t, tc in zip(' '.join(text).split(' ')[:100], ' '.join(cleaned_text).split(' ')):
#     print(f"{t:20}\t{tc}")

In [26]:
# Apply the punctuation normalization to every line.
text = list(map(clean_text, text))
text[1]

'Ez jutott az eszébe önkéntelenül.'

In [27]:
# Look up the token ids of '.', '?' and ','; these ids are the keys of
# id2target further down.
tokenizer.encode(".?,")

[101, 119, 136, 117, 102]

In [28]:
# Word count after cleaning: the number of single-space separators plus one
# equals the length of ' '.join(text).split(' ').
' '.join(text).count(' ') + 1

1248861

In [29]:
# Tokenize the whole cleaned corpus as one long string.
cleaned_corpus = ' '.join(text)
tokenized = tokenizer.tokenize(cleaned_corpus)

In [30]:
# Peek at the first few tokens produced by the tokenizer.
tokenized[:5]

['az', 'egyes', '_', 'lesz', '##allo']

In [31]:
# Total number of tokens in the corpus.
len(tokenized)

2771074

In [32]:
# Join the tokens back into one string to compare it with the original text.
text2 = tokenizer.convert_tokens_to_string(tokenized)

In [33]:
# Drop the stand-alone punctuation marks from the round-tripped text.
punctuation_marks = [',', '.', '?']
text2 = [piece for piece in text2.split(' ') if piece not in punctuation_marks]
len(text2)

1332371

In [34]:
# Side-by-side view of the first original words and the tokenizer round-trip.
original_words = ' '.join(text).split(' ')[:1000]
for original_word, roundtrip_word in zip(original_words, text2):
    print(f'{original_word:20}{roundtrip_word}')

Az                  az
Egyes_Leszállópályánakegyes
például             _
nem                 leszallopalyanak
ez                  peldaul
volt                nem
a                   ez
neve                volt
akkoriban,          a
Angliának           neve
vagy                akkoriban
Nagy,Britanniának   anglianak
nevezték,           vagy
bár                 nagy
Londont,            britannianak
ebben               neveztek
meglehetősen        bar
bizonyos            londont
volt                ebben
Winston,            meglehetosen
mindig              bizonyos
Londonnak           volt
hívták.             winston
Ez                  mindig
jutott              londonnak
az                  hivtak
eszébe              ez
önkéntelenül.       jutott
Könyvtár            az
megosztása          eszebe
közben              onkentelenul
(                   konyvtar
a                   megosztasa
tulajdonságlap      kozben
Sharing,            (
Megosztás           a
fülén               tulajdonsa

haza,               es
hogy                novekedeshordozo
gyorsabban          szegmenserol
haladjanak,         a
vagy                neuer
akik                _
rendszergazdaként   marktrol
az                  jelentette
otthoni             be
gépről              penteken
távfelügyeleti      a
pillantást          tozsdet
szeretnének         mukodteto
vetni               deutsche
rendszerük          _
valamely            borse
elemére.            _
A                   ag
napokban            az
utaznak.            uj
Az                  szabalyozas
AOL                 2001
az                  oktober
elmúlt              1
év                  jen
karácsonyát         lep
megelőző            eletbe
időszakban          hiszen
2,5_milliárd        ma
dolláros            az
forgalmat           otthonukban
bonyolított.        szamitogepet
Hasonló             mukodtetok
mértékben           nagy
1,25                resze
százalékra          mar
csökkent            nem
a                   azoknak

In [35]:
# Maps the pending marker used inside create_target to the stored class label.
id2target = {-1: 0,    # no punctuation after the token
             119: 1,   # .
             136: 2,   # ?
             117: 3,   # ,
             -2: -1,   # will be masked
             }
target2id = {value: key for key, value in id2target.items()}


def _is_numeric_piece(token):
    """Return True if the token is numeric once sub-word markers are removed."""
    if token.startswith('##'):
        # WordPiece continuation, e.g. '##45' as the tail of a longer number.
        token = token[2:]
    return token.lstrip('▁').isnumeric()


def create_target(encoded, tok=None):
    """Split an encoded sequence into input ids and per-token punctuation labels.

    Punctuation ids ('.', '?', ',') are removed from the sequence and become
    the label of the token in front of them. A '.' or ',' that sits between
    two numeric tokens is kept in the sequence, and both it and the number in
    front of it are masked. A token that is followed by a '##' continuation
    piece is masked as well, and so is the first token.

    Args:
        encoded: List of token ids; the first id is copied through unchanged.
        tok: Optional tokenizer exposing ``convert_ids_to_tokens``. Defaults
            to the notebook-level ``tokenizer``.

    Returns:
        A ``(text, targets)`` pair of equally long lists. ``targets[i]`` is
        the label following ``text[i]``: 0 none, 1 '.', 2 '?', 3 ',' and
        -1 masked.
    """
    if tok is None:
        tok = tokenizer
    if not encoded:
        return [], []

    def piece(token_id):
        return tok.convert_ids_to_tokens(token_id)

    targets = []
    text = [encoded[0]]
    target = -2  # Always mask after [CLS] token

    last = len(encoded) - 1
    for idx in range(1, len(encoded)):
        word = encoded[idx]
        if word in id2target:
            # The idx < last guard keeps the look-ahead inside the list when
            # the sequence ends with a punctuation id.
            inside_number = (
                word in (119, 117)
                and idx < last
                and _is_numeric_piece(piece(encoded[idx - 1]))
                and piece(encoded[idx + 1]).isnumeric()
            )
            if inside_number:
                targets.append(-2)  # label of the number in front: masked
                text.append(word)   # the separator stays part of the input
                target = -2         # label of the separator itself: masked
            else:
                target = word       # becomes the label of the previous token
        else:
            if piece(word).startswith('##'):
                # The previous token is not the end of a word.
                target = -2
            targets.append(target)
            target = -1
            text.append(word)

    # Label of the final token.
    targets.append(target)

    return text, [id2target[t] for t in targets]

In [36]:
train_n = 70_000
valid_n = 8_000
valid_end = train_n + valid_n

# Contiguous slices of the line list form the three splits; each split is
# joined into one long string.
train_text = ' '.join(text[:train_n])
valid_text = ' '.join(text[train_n:valid_end])
test_text = ' '.join(text[valid_end:])

tuple(len(split_text.split(' ')) for split_text in (train_text, valid_text, test_text))

(1080644, 123975, 44242)

In [37]:
# Number of lines left over for the test split.
len(text) - train_n - valid_n

2875

In [38]:
# Encode every split as one long id sequence.
train_tokens, valid_tokens, test_tokens = [
    tokenizer.encode(split_text)
    for split_text in (train_text, valid_text, test_text)
]

Token indices sequence length is longer than the specified maximum sequence length for this model (2398794 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (274493 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (97793 > 512). Running this sequence through the model will result in indexing errors


In [39]:
# Replace each encoded split by its (token ids, labels) pair.
(
    (train_tokens, train_targets),
    (valid_tokens, valid_targets),
    (test_tokens, test_targets),
) = [
    create_target(split_tokens)
    for split_tokens in (train_tokens, valid_tokens, test_tokens)
]

In [40]:
# For backward compatibility: wrap every split in a one-element list.
train_tokens = [train_tokens]
train_targets = [train_targets]
valid_tokens = [valid_tokens]
valid_targets = [valid_targets]
test_tokens = [test_tokens]
test_targets = [test_targets]

In [41]:
# Persist each split as a (tokens, targets) tuple inside the data directory.
splits_to_save = (
    ('train_data.pkl', (train_tokens, train_targets)),
    ('valid_data.pkl', (valid_tokens, valid_targets)),
    ('test_data.pkl', (test_tokens, test_targets)),
)
for pickle_name, split_data in splits_to_save:
    with open(data_path + pickle_name, 'wb') as f:
        pickle.dump(split_data, f)

In [42]:
# Number of unmasked positions (label >= 0) in the training split.
(np.array(train_targets) >= 0).sum()

1146447

In [43]:
# Unmasked positions (label >= 0) in the train, valid and test splits.
tuple(
    (np.array(split_targets) >= 0).sum()
    for split_targets in (train_targets, valid_targets, test_targets)
)

(1146447, 131452, 46717)

In [44]:
# Unmasked positions (label >= 0) summed over all three splits.
sum(
    (np.array(split_targets) >= 0).sum()
    for split_targets in (train_targets, valid_targets, test_targets)
)

1324616

In [45]:
from collections import Counter

# One row per split with the label counts in the order
# '.', '?', ',', no punctuation, masked.
label_order = (1, 2, 3, 0, -1)
for split_targets in (train_targets, valid_targets, test_targets):
    label_counts = Counter()
    for document_targets in split_targets:
        label_counts.update(document_targets)
    print('\t'.join(str(label_counts[label]) for label in label_order))

80367	1681	120986	943413	1041397
9129	178	13811	108334	118997
3347	70	4888	38412	42440


In [46]:
# Positions whose label is not the mask value -1, per split.
tuple(
    sum((np.array(document_targets) != -1).sum() for document_targets in split_targets)
    for split_targets in (train_targets, valid_targets, test_targets)
)

(1146447, 131452, 46717)

In [47]:
# Number of question-mark labels (class 2) in the test split.
sum((np.array(vt) == 2).sum() for vt in test_targets)

70

In [48]:
# Total number of label positions (masked ones included) per split.
tuple(
    sum(len(document_targets) for document_targets in split_targets)
    for split_targets in (train_targets, valid_targets, test_targets)
)

(2187844, 250449, 89157)