This notebook was run on [kaggle](https://www.kaggle.com/).

In [1]:
!wget https://www.statmt.org/wmt12/un.es-en.tgz

--2025-03-04 05:26:29--  https://www.statmt.org/wmt12/un.es-en.tgz
Resolving www.statmt.org (www.statmt.org)... 129.215.32.28
Connecting to www.statmt.org (www.statmt.org)|129.215.32.28|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1103180390 (1.0G) [application/x-gzip]
Saving to: ‘un.es-en.tgz’


2025-03-04 05:27:46 (13.9 MB/s) - ‘un.es-en.tgz’ saved [1103180390/1103180390]



In [2]:
!tar -xvzf un.es-en.tgz

MultiUN4WMT12/README
MultiUN4WMT12/undoc.2000.es-en.en
MultiUN4WMT12/undoc.2000.es-en.es


In [3]:
!rm un.es-en.tgz

In [4]:
from typing import List
from tqdm import tqdm
import random
import json

In [5]:
random.seed(7)

In [6]:
train_f = 5e-4

In [7]:
ens = []
with open("/kaggle/working/MultiUN4WMT12/undoc.2000.es-en.en", "r") as f:
    while l := f.readline().strip().replace("\xa0", " "):
        ens.append(l)

In [8]:
en_last = ens[-1]
en_last

'Property transferred to other missions'

In [9]:
train_count = round(len(ens)*train_f)
train_count, len(ens)

(5598, 11196913)

In [10]:
ens_train = random.choices(ens, k=train_count)

In [11]:
del ens

In [12]:
ess = []
with open("/kaggle/working/MultiUN4WMT12/undoc.2000.es-en.es", "r") as f:
    while l := f.readline().strip().replace("\xa0", " "):
        ess.append(l)

In [13]:
es_last = ess[-1]
es_last

'Bienes transferidos a otras misiones'

In [14]:
ess_train = random.choices(ess, k=train_count)

In [15]:
del ess

In [16]:
def preprocess_sentence(s: str, bos: str = "<bos>", eos: str = "<eos>") -> List[str]:
    return [bos] + list(s.replace("  ", " ")) + [eos]

In [17]:
# preprocess_sentence(en_last), preprocess_sentence(es_last)

In [18]:
len(ens_train), len(ess_train)

(5598, 5598)

In [19]:
dataset = ens_train + ess_train

In [20]:
del ens_train

In [21]:
del ess_train

In [22]:
len(dataset), dataset[0], dataset[-1]

(11196,
 'The Prisons Bill prohibits the use of solitary confinement except in the case of extremely violent offenders.',
 'La evidencia de estas dificultades ha suscitado también la necesidad de trabajar en el futuro en el desarrollo de un sistema de información como el que se ha aludido en los párrafos 128 a 132.')

In [23]:
def preprocess_dataset(dataset: List[str]):
    preprocessed_dataset = []
    for s in dataset:
        preprocessed_dataset.append(preprocess_sentence(s))
    return preprocessed_dataset

In [24]:
preprocessed_dataset = preprocess_dataset(dataset)

In [25]:
def bpe_train(dataset: List[str], num_target_rules: int = 32_000):
    vocab = set()
    rules = []
    
    for d in tqdm(dataset, desc="create initial vocab"):
        vocab = vocab.union(set(d))
    
    for _ in tqdm(range(num_target_rules), desc="train"):
        bigram_counter = {}
        for d in dataset:
            # print(d)
            for j in range(len(d)-1):
                unite = tuple(d[j:j+2])
                if unite in bigram_counter:
                    bigram_counter[unite] += 1
                else:
                    bigram_counter[unite] = 1

        mx_cnt = 0
        target = ()
        for unite, cnt in bigram_counter.items():
            if cnt > mx_cnt:
                mx_cnt = cnt
                target = unite
        # print(target_lst)

        vocab.add("".join(target))
        rules.append(target)

        # y = 0
        for i in range(len(dataset)):
            while True:
                d = dataset[i]
                left = -1
                for j in range(len(d)-1):
                    if d[j:j+2] == list(target):
                        left = j
                        break
                if left >= 0:
                    dataset[i] = d[:left] + [''.join(d[left:left+2])] + d[left+2:]
                else:
                    break
            # y += len(dataset[i])
        
    return vocab, rules

In [26]:
vocab, rules = bpe_train(preprocessed_dataset, num_target_rules=10_000)

create initial vocab: 100%|██████████| 11196/11196 [00:00<00:00, 152519.65it/s]
train: 100%|██████████| 10000/10000 [1:54:06<00:00,  1.46it/s] 


In [27]:
rules[:20]

[('e', ' '),
 ('s', ' '),
 ('n', ' '),
 ('a', ' '),
 ('o', ' '),
 ('e', 'r'),
 ('e', 'n'),
 ('o', 'n'),
 ('t', 'i'),
 ('c', 'i'),
 ('l', ' '),
 ('t', 'h'),
 ('r', 'e'),
 ('d', 'e '),
 ('a', 'r'),
 (',', ' '),
 ('a', 'n'),
 ('d', ' '),
 ('i', 'n'),
 ('o', 'r')]

In [28]:
len(rules), len(vocab)

(10000, 10146)

In [29]:
vocab_map = {e : i for i, e in enumerate(vocab)}

In [30]:
with open("bpe-en-es-10k-vocab.json", "w") as f:
    json.dump(vocab_map, f)

In [31]:
with open("bpe-en-es-10k-rules.txt", "w") as f:
    for rule in rules:
        f.write(f"{vocab_map[rule[0]]} {vocab_map[rule[1]]}\n")

In [None]:
rules[:10]

In [32]:
rules[1000:1010]

[('<bos>L', 'os '),
 ('al', 'es, '),
 ('el', 'y '),
 ('particul', 'ar '),
 ('200', '1'),
 ('ed ', 'by '),
 ('ad', 'ministr'),
 ('Or', 'ganiz'),
 ('us', 't '),
 ('e ', 'of the ')]

In [36]:
preprocessed_dataset[:1]

[['<bos>The ',
  'Pri',
  'sons ',
  'B',
  'ill ',
  'prohib',
  'its ',
  'the ',
  'use of ',
  'sol',
  'it',
  'ary ',
  'con',
  'fin',
  'ement ',
  'excep',
  't ',
  'in the ',
  'case of ',
  'extrem',
  'ely ',
  'viol',
  'ent ',
  'off',
  'ender',
  's.<eos>']]

In [40]:
def apply_bpe_rules(dataset, rules, preprocessed=False):
    if not preprocessed:
        p_dataset = preprocess_dataset(dataset)
    else:
        p_dataset = dataset.copy()

    for rule in rules:
        for i in range(len(p_dataset)):
            while True:
                d = p_dataset[i]
                left = -1
                for j in range(len(d)-1):
                    if d[j:j+2] == list(rule):
                        left = j
                        break
                if left >= 0:
                    p_dataset[i] = d[:left] + [''.join(d[left:left+2])] + d[left+2:]
                else:
                    break
    return p_dataset

In [41]:
test_dataset = [
    "Hello, where are you?",
    "Is this making any kind of sense?",
    "This is an imaginary situation.",
    "It is such a shame!"
]

pretokenized_test_dataset = apply_bpe_rules(test_dataset, rules)

In [42]:
pretokenized_test_dataset

[['<bos>H', 'ello, ', 'where ', 'are ', 'you', '?<eos>'],
 ['<bos>I',
  's ',
  'this ',
  'making ',
  'any ',
  'kind ',
  'of ',
  'sen',
  'se',
  '?<eos>'],
 ['<bos>This ', 'is ', 'an ', 'im', 'ag', 'in', 'ary ', 'situ', 'ation.<eos>'],
 ['<bos>It is ', 'such a ', 'sh', 'ame', '!', '<eos>']]

In [46]:
def tokenize_dataset(pre_tokenized_dataset, vocab):
    tokenized_dataset = []
    for d in pre_tokenized_dataset:
        tokenized_dataset.append(list(map(lambda tok: vocab[tok], d)))
    return tokenized_dataset

In [49]:
tokenized_test_dataset = tokenize_dataset(pretokenized_test_dataset, vocab_map)

In [50]:
tokenized_test_dataset

[[5454, 9591, 1728, 7392, 3576, 1837],
 [4574, 2886, 8910, 4509, 6015, 7776, 9384, 8519, 9397, 1837],
 [3655, 3224, 4806, 5785, 5439, 4973, 2234, 2577, 8097],
 [8233, 9158, 2761, 4421, 418, 2992]]