In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import os
import json
import pickle
from transformers import BertTokenizer
import torch
import numpy as np
import re

import warnings
warnings.filterwarnings("ignore")

from tqdm.notebook import tqdm

In [2]:
# Root directory of the Szeged corpus. The original location stays the default;
# set the NEURAL_PUNCTUATOR_DATA environment variable to use another directory
# without editing the notebook.
# os.path.join(..., "") guarantees a trailing separator, which matters because
# paths are built by plain concatenation (`data_path + <file name>`).
data_path = os.path.join(
    os.environ.get("NEURAL_PUNCTUATOR_DATA", "D:/Data/neural-punctuator/szeged/"),
    "",
)
file_path = data_path + "szeged.txt"

In [3]:
# Load the raw corpus as a list with one entry per line (line endings kept).
with open(file_path, mode='r', encoding='utf-8') as f:
    text = list(f)

In [4]:
# Sanity check: how many lines were read, plus one sample line
# (at this point each line still carries its trailing newline).
len(text), text[1]

(82100,
 ' A szállásunk egy Balaton melletti kis üdülőfaluban, Zamárdiban volt, a Postának az üdülőházában.\n')

In [5]:
from collections import OrderedDict

# Remove repeated lines; using the lines as ordered-dict keys keeps only the
# first occurrence of each and preserves the original order.
unique_lines = OrderedDict.fromkeys(text)
text = list(unique_lines)

len(text)

80875

In [6]:
# Trim leading/trailing whitespace (including the newline) from every line.
text = list(map(str.strip, text))
text[1]

'A szállásunk egy Balaton melletti kis üdülőfaluban, Zamárdiban volt, a Postának az üdülőházában.'

In [7]:
# Number of space-separated words in the corpus before cleaning.
len(' '.join(t.strip() for t in text).split(' '))

1265372

In [8]:
# Alternative (cased) tokenizer, kept for reference:
# tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Fetch the uncased multilingual BERT tokenizer through torch.hub.
tokenizer = torch.hub.load(
    'huggingface/pytorch-transformers',
    'tokenizer',
    'bert-base-multilingual-uncased',
)

Using cache found in C:\Users\gbenc/.cache\torch\hub\huggingface_pytorch-transformers_master


In [9]:
def clean_text(text):
    """Normalise punctuation and whitespace in a single line of text.

    The punctuation inventory is reduced to '.', ',' and '?':

    * '!' and ';' become '.'.
    * ':' and every hyphen (double or single) become ','.
    * the '♫' symbol is removed.
    * every whitespace run collapses to one space and the space in front of
      a comma is dropped.
    * any run of commas (optionally separated by a whitespace character)
      collapses to a single comma.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string with leading and trailing whitespace stripped.
    """
    text = text.replace('!', '.')
    text = text.replace(':', ',')
    text = text.replace('--', ',')
    text = text.replace('-', ',')
    text = text.replace(';', '.')
    text = text.replace('♫', '')

    # No hyphen survives the replacements above, so a later substitution on
    # r'--\s?--' could never match; that dead step is intentionally omitted.

    # Collapse whitespace first so one pass is enough to remove the space
    # that may precede a comma.
    text = re.sub(r'\s+', ' ', text)
    text = text.replace(' ,', ',')

    # Merge a whole run of commas at once. A pairwise pattern (',\s?,') leaves
    # ',,' behind when three or more commas are adjacent, e.g. after "- - -".
    text = re.sub(r',(?:\s?,)+', ',', text)

    return text.strip()

In [10]:
# cleaned_text = [clean_text(t) for t in text]

# for t, tc in zip(' '.join(text).split(' ')[:100], ' '.join(cleaned_text).split(' ')):
#     print(f"{t:20}\t{tc}")

In [11]:
# Run the punctuation/whitespace normalisation over every line.
text = list(map(clean_text, text))
text[1]

'A szállásunk egy Balaton melletti kis üdülőfaluban, Zamárdiban volt, a Postának az üdülőházában.'

In [12]:
# Look up the vocabulary ids of '.', '?' and ','; these ids are the keys used
# by the punctuation label mapping (id2target) further down.
tokenizer.encode(".?,")

[101, 119, 136, 117, 102]

In [13]:
# Number of space-separated words in the corpus after cleaning.
len(' '.join(text).split(' '))

1248861

In [14]:
# Tokenize the whole cleaned corpus as one space-joined string.
tokenized = tokenizer.tokenize(' '.join(text))

In [15]:
# Peek at the first few tokens ('##' marks a word-piece continuation).
tokenized[:5]

['mult', 'ev', 'szeptember', '##eben', 'az']

In [16]:
# Total number of tokens produced for the corpus.
len(tokenized)

2771074

In [17]:
# Join the tokens back into a single string so the result can be compared
# word by word with the cleaned input text.
text2 = tokenizer.convert_tokens_to_string(tokenized)

In [18]:
# Split the detokenized string into words and discard the stand-alone
# punctuation marks, leaving only word tokens.
punctuation_marks = (',', '.', '?')
text2 = [word for word in text2.split(' ') if word not in punctuation_marks]
len(text2)

1332371

In [19]:
# Print the first 1000 original words next to the detokenized words to
# eyeball how the two sequences line up.
first_words = ' '.join(text).split(' ')[:1000]
for t1, t2 in zip(first_words, text2):
    print(f'{t1:20}{t2}')

Múlt                mult
év                  ev
szeptemberében      szeptembereben
az                  az
osztállyal          osztallyal
elmentünk           elmentunk
kirándulni          kirandulni
a                   a
Balatonra.          balatonra
A                   a
szállásunk          szallasunk
egy                 egy
Balaton             balaton
melletti            melletti
kis                 kis
üdülőfaluban,       udulofaluban
Zamárdiban          zamardiban
volt,               volt
a                   a
Postának            postanak
az                  az
üdülőházában.       udulohazaban
Pénteki             penteki
napon               napon
indultunk           indultunk
kora                kora
hajnalban.          hajnalban
Mivel               mivel
a                   a
barátnőm,           baratnom
Kata                kata
nem                 nem
barcsi,             barcsi
ezért               ezert
ott                 ott
aludt               aludt
nálunk.             nalunk
A

a                   alltunk
fülébe.             igy
S                   megrokonyodve
úgy                 s
láttam,             csak
mintha              neztuk
kezdene             neztuk
megnyugodni.        egymast
Lassan              majd
kinyújtottam        elkezdtem
felé                egy
a                   indian
kezemet,            szoveget
s                   mormolni
megérintettem       a
a                   fulebe
nyakánál.           s
A                   ugy
teste               lattam
forró               mintha
volt                kezdene
s                   megnyugodni
izzadt.             lassan
A                   kinyujtottam
szája               fele
habzott.            a
Megpróbáltam        kezemet
a                   s
háta                megerintettem
mögé                a
kerülni,            nyakanal
mikor               a
a                   teste
box                 forro
égébe               volt
értem,              s
megpillantottam     izzadt
valami              a


In [31]:
# Maps a "raw" target (a punctuation token id or an internal marker) to the
# class label stored in the dataset. 119 / 136 / 117 are the ids the tokenizer
# assigns to '.', '?' and ','.
id2target = {-1: 0,   # no punctuation after the token
              119: 1,  # .
              136: 2,  # ?
              117: 3,  # ,
              -2: -1,  # will be masked
             }
# Inverse mapping: class label -> raw target.
target2id = {value: key for key, value in id2target.items()}


def create_target(encoded):
    """Split an encoded sequence into input ids and per-token punctuation labels.

    Punctuation ids ('.', '?', ',') are removed from the sequence and turned
    into the label of the token that precedes them. The two returned lists
    have the same length and `targets[i]` is the label of `text[i]`:

    * 0     - no punctuation follows the token
    * 1/2/3 - the token is followed by '.', '?' or ','
    * -1    - masked position: the first id of the sequence, any token that is
              followed by a '##' continuation piece, and the digits/separator
              of a number such as "3.14" (the separator stays in `text`).

    Args:
        encoded: List of token ids; the first id is always kept and masked
            (it is expected to be the [CLS] id).

    Returns:
        Tuple `(text, targets)`: the kept token ids and their labels. An empty
        input yields `([], [])`.
    """
    if not encoded:
        return [], []

    targets = []
    text = [encoded[0]]

    target = -2  # Always mask after [CLS] token

    n_tokens = len(encoded)
    for idx in range(1, n_tokens):
        word = encoded[idx]
        if word in id2target:
            # A '.' or ',' between two numeric tokens is part of a number, not
            # sentence punctuation. The bounds check keeps a trailing
            # separator (nothing after it) from indexing past the end.
            is_number_separator = (
                word in (119, 117)
                and idx + 1 < n_tokens
                and tokenizer._convert_id_to_token(encoded[idx - 1]).lstrip('▁').isnumeric()
                and tokenizer._convert_id_to_token(encoded[idx + 1]).isnumeric()
            )
            if is_number_separator:
                targets.append(-2)  # mask the number in front of the separator
                text.append(word)   # the separator stays in the input
                target = -2         # ... and is masked as well
            else:
                # Real punctuation: becomes the label of the previous token.
                target = word
        else:
            # A continuation piece means the previous token did not end a
            # word, so the previous position must not be scored.
            if tokenizer._convert_id_to_token(word).startswith('##'):
                target = -2
            # `target` is the label of the previously kept token.
            targets.append(target)

            target = -1
            text.append(word)

    # Label of the last kept token.
    targets.append(target)

    targets = [id2target[t] for t in targets]

    return text, targets

In [27]:
# Split sizes in lines; whatever is left after train + valid is the test set.
train_n = 72_000
valid_n = 8_000

# The splits are contiguous slices of the corpus, each joined into one string.
split_end = train_n + valid_n
train_text = ' '.join(text[:train_n])
valid_text = ' '.join(text[train_n:split_end])
test_text = ' '.join(text[split_end:])

# Word counts (space-separated) per split.
tuple(len(part.split(' ')) for part in (train_text, valid_text, test_text))

(1114487, 118467, 15907)

In [28]:
# Number of lines that remain for the test split.
len(text) - train_n - valid_n

875

In [29]:
# Encode each split as one long id sequence (train, then valid, then test).
train_tokens, valid_tokens, test_tokens = [
    tokenizer.encode(split_text)
    for split_text in (train_text, valid_text, test_text)
]

Token indices sequence length is longer than the specified maximum sequence length for this model (2474430 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (258942 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (37708 > 512). Running this sequence through the model will result in indexing errors


In [32]:
# Replace each encoded split by its (input ids, punctuation labels) pair.
(
    (train_tokens, train_targets),
    (valid_tokens, valid_targets),
    (test_tokens, test_targets),
) = [create_target(split) for split in (train_tokens, valid_tokens, test_tokens)]

In [33]:
# For backward compatibility: wrap every split in a one-element list.
# NOTE(review): presumably the consumers of these pickles expect a list of
# documents per split -- confirm against the data loader.
train_tokens, train_targets = [train_tokens], [train_targets]
valid_tokens, valid_targets = [valid_tokens], [valid_targets]
test_tokens, test_targets = [test_tokens], [test_targets]

In [34]:
# Persist every split as a pickled (tokens, targets) tuple next to the corpus.
for file_name, payload in (
    ('train_data.pkl', (train_tokens, train_targets)),
    ('valid_data.pkl', (valid_tokens, valid_targets)),
    ('test_data.pkl', (test_tokens, test_targets)),
):
    with open(data_path + file_name, 'wb') as f:
        pickle.dump(payload, f)

In [35]:
# Number of unmasked positions (label >= 0) in the training split.
(np.array(train_targets) >= 0).sum()

1183071

In [36]:
# Unmasked positions (label >= 0) per split: train, valid, test.
tuple(
    (np.array(split_targets) >= 0).sum()
    for split_targets in (train_targets, valid_targets, test_targets)
)

(1183071, 123608, 17938)

In [37]:
# Unmasked positions (label >= 0) summed over all three splits.
sum(
    (np.array(split_targets) >= 0).sum()
    for split_targets in (train_targets, valid_targets, test_targets)
)

1324617

In [40]:
from collections import Counter

# Label histogram of the test split, reported in the order
# '.', '?', ',', no punctuation, masked.
c = Counter()
for split_targets in test_targets:
    c.update(split_targets)
[c[label] for label in (1, 2, 3, 0, -1)]

[953, 7, 2137, 14841, 16649]

In [41]:
# Positions whose label is not the mask value (-1), per split.
tuple(
    sum((np.array(vt) != -1).sum() for vt in split_targets)
    for split_targets in (train_targets, valid_targets, test_targets)
)

(1183071, 123608, 17938)

In [42]:
# Total number of positions (masked or not), per split.
tuple(
    sum(len(vt) for vt in split_targets)
    for split_targets in (train_targets, valid_targets, test_targets)
)

(2260516, 232346, 34587)