In [1]:
import yaml
import torch
import torch.nn as nn
from argparse import Namespace
from collections import defaultdict, Counter
import onmt
from onmt.inputters.inputter import _load_vocab, _build_fields_vocab, get_fields, IterOnDevice
from onmt.inputters.corpus import ParallelCorpus
from onmt.inputters.dynamic_iterator import DynamicDatasetIter
from onmt.translate import GNMTGlobalScorer, Translator, TranslationBuilder
from onmt.utils.misc import set_random_seed

from onmt.utils.logging import init_logger, logger
init_logger()

<RootLogger root (INFO)>

In [2]:
is_cuda = torch.cuda.is_available()
set_random_seed(1111, is_cuda)

In [3]:
yaml_config = """
src_vocab_size: 128
tgt_vocab_size: 128

save_data: run/samples
src_vocab: vocabs/vocab.en
tgt_vocab: vocabs/vocab.hu

# Corpus opts:
data:
    hunglish:
        path_src: E://Data/hu-nmt/combined-en-hu/hunglish2-tiny-train.en
        path_tgt: E://Data/hu-nmt/combined-en-hu/hunglish2-tiny-train.hu
        transforms: [sentencepiece]
        weight: 1
    valid:
        path_src: E://Data/hu-nmt/combined-en-hu/hunglish2-tiny-valid.en
        path_tgt: E://Data/hu-nmt/combined-en-hu/hunglish2-tiny-valid.hu
        transforms: [sentencepiece]

#### Subword
src_subword_model: sp_models/bpe_en.model
tgt_subword_model: sp_models/bpe_hu.model
src_subword_nbest: 1
src_subword_alpha: 0.0
tgt_subword_nbest: 1
tgt_subword_alpha: 0.0

src_seq_length: 16  # maximum source sequence length
tgt_seq_length: 16  # maximum target sequence length

# TRAIN

# Train on a single GPU
world_size: 1
gpu_ranks: [0]

# Batching
batch_size: 96
#queue_size: 16
#accum_count: [3]

# General opts
save_model: run/model_no_qoutes
keep_checkpoint: 10
save_checkpoint_steps: 10000
average_decay: 0.0005
seed: 1234
report_every: 100
train_steps: 400000
valid_steps: 10000 
single_pass: False
early_stopping: 5 
early_stopping_criteria: ppl

# Optimization
model_dtype: "fp16"
optim: "adam"
learning_rate: 2.0
warmup_steps: 8000
decay_method: "noam"
adam_beta2: 0.998
max_grad_norm: 0
label_smoothing: 0.1
param_init: 0
param_init_glorot: true
normalization: "tokens"

# Model
encoder_type: transformer
decoder_type: transformer
enc_layers: 2
dec_layers: 2
heads: 8
rnn_size: 512
word_vec_size: 512
transformer_ff: 2048
dropout_steps: [0]
dropout: [0.1]
attention_dropout: [0.1]
#share_decoder_embeddings: true

# Logging
log_file: run/logs_no_qoutes
"""
config = yaml.safe_load(yaml_config)
with open("config.yaml", "w") as f:
    f.write(yaml_config)

In [4]:
from onmt.utils.parse import ArgumentParser
parser = ArgumentParser(description='build_vocab.py')

In [5]:
from onmt.opts import dynamic_prepare_opts
dynamic_prepare_opts(parser, build_vocab_only=True) #build_vocab_only=True

In [6]:
data_path = config['data']['hunglish']['path_src']
vocab_size = config['src_vocab_size']
subword_model = config['src_subword_model']

!python utils/spm_train.py -d $data_path -p $subword_model --vocab-size $vocab_size

sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: E://Data/hu-nmt/combined-en-hu/hunglish2-tiny-train.en
  input_format: 
  model_prefix: sp_models/bpe_en.model
  model_type: BPE
  vocab_size: 128
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  â�‡ 
}
normalizer_spec {
  name: nmt_nfkc
  add_dummy_prefix: 1
  remove_extra_whitespaces

In [7]:
!move sp_models\bpe_en.model.model sp_models\bpe_en.model
!move sp_models\bpe_en.model.vocab sp_models\bpe_en.vocab

        1 file(s) moved.
        1 file(s) moved.


In [8]:
data_path = config['data']['hunglish']['path_tgt']
vocab_size = config['tgt_vocab_size']
subword_model = config['tgt_subword_model']

!python utils/spm_train.py -d $data_path -p $subword_model --vocab-size $vocab_size
!move sp_models\bpe_hu.model.model sp_models\bpe_hu.model
!move sp_models\bpe_hu.model.vocab sp_models\bpe_hu.vocab

sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: E://Data/hu-nmt/combined-en-hu/hunglish2-tiny-train.hu
  input_format: 
  model_prefix: sp_models/bpe_hu.model
  model_type: BPE
  vocab_size: 128
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  â�‡ 
}
normalizer_spec {
  name: nmt_nfkc
  add_dummy_prefix: 1
  remove_extra_whitespaces

        1 file(s) moved.
        1 file(s) moved.


In [9]:
base_args = (["-config", "config.yaml", "-n_sample", "-1"])
opts, unknown = parser.parse_known_args(base_args)

In [10]:
opts



In [11]:
from onmt.bin.build_vocab import build_vocab_main
build_vocab_main(opts)

[2021-04-06 10:50:45,110 INFO] Parsed 2 corpora from -data.
[2021-04-06 10:50:45,118 INFO] Counter vocab from -1 samples.
[2021-04-06 10:50:45,120 INFO] n_sample=-1: Build vocab on full datasets.
[2021-04-06 10:50:46,537 INFO] Counters src:124
[2021-04-06 10:50:46,538 INFO] Counters tgt:124


In [12]:
src_vocab_path = opts.src_vocab
tgt_vocab_path = opts.tgt_vocab

In [13]:
counters = defaultdict(Counter)
# load source vocab
_src_vocab, _src_vocab_size = _load_vocab(
    src_vocab_path,
    'src',
    counters)
# load target vocab
_tgt_vocab, _tgt_vocab_size = _load_vocab(
    tgt_vocab_path,
    'tgt',
    counters)

[2021-04-06 10:53:18,120 INFO] Loading src vocabulary from vocabs/vocab.en
[2021-04-06 10:53:18,122 INFO] Loaded src vocab has 124 tokens.
[2021-04-06 10:53:18,123 INFO] Loading tgt vocabulary from vocabs/vocab.hu
[2021-04-06 10:53:18,125 INFO] Loaded tgt vocab has 124 tokens.


In [14]:
# initialize fields
src_nfeats, tgt_nfeats = 0, 0 # do not support word features for now
fields = get_fields(
    'text', src_nfeats, tgt_nfeats)

In [15]:
fields

{'src': <onmt.inputters.text_dataset.TextMultiField at 0x23c7487ae50>,
 'tgt': <onmt.inputters.text_dataset.TextMultiField at 0x23c7487a190>,
 'indices': <torchtext.data.field.Field at 0x23c7487a2b0>}

In [21]:
# build fields vocab
share_vocab = False
vocab_size_multiple = 1
src_vocab_size = config['src_vocab_size']
tgt_vocab_size = config['tgt_vocab_size']
src_words_min_frequency = 1
tgt_words_min_frequency = 1
vocab_fields = _build_fields_vocab(
    fields, counters, 'text', share_vocab,
    vocab_size_multiple,
    src_vocab_size, src_words_min_frequency,
    tgt_vocab_size, tgt_words_min_frequency)

[2021-04-06 10:56:05,239 INFO]  * tgt vocab size: 128.
[2021-04-06 10:56:05,241 INFO]  * src vocab size: 126.


In [22]:
src_text_field = vocab_fields["src"].base_field
src_vocab = src_text_field.vocab
src_padding = src_vocab.stoi[src_text_field.pad_token]

tgt_text_field = vocab_fields['tgt'].base_field
tgt_vocab = tgt_text_field.vocab
tgt_padding = tgt_vocab.stoi[tgt_text_field.pad_token]

In [23]:
emb_size = 100
rnn_size = 500
# Specify the core model.

encoder_embeddings = onmt.modules.Embeddings(emb_size, len(src_vocab),
                                             word_padding_idx=src_padding)

encoder = onmt.encoders.RNNEncoder(hidden_size=rnn_size, num_layers=1,
                                   rnn_type="LSTM", bidirectional=True,
                                   embeddings=encoder_embeddings)

decoder_embeddings = onmt.modules.Embeddings(emb_size, len(tgt_vocab),
                                             word_padding_idx=tgt_padding)
decoder = onmt.decoders.decoder.InputFeedRNNDecoder(
    hidden_size=rnn_size, num_layers=1, bidirectional_encoder=True, 
    rnn_type="LSTM", embeddings=decoder_embeddings)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = onmt.models.model.NMTModel(encoder, decoder)
model.to(device)

# Specify the tgt word generator and loss computation module
model.generator = nn.Sequential(
    nn.Linear(rnn_size, len(tgt_vocab)),
    nn.LogSoftmax(dim=-1)).to(device)

loss = onmt.utils.loss.NMTLossCompute(
    criterion=nn.NLLLoss(ignore_index=tgt_padding, reduction="sum"),
    generator=model.generator)

In [24]:
lr = 1
torch_optimizer = torch.optim.SGD(model.parameters(), lr=lr)
optim = onmt.utils.optimizers.Optimizer(
    torch_optimizer, learning_rate=lr, max_grad_norm=2)

In [25]:
src_train = opts.data['hunglish']['path_src']
tgt_train =opts.data['hunglish']['path_tgt']
src_val = opts.data['valid']['path_src']
tgt_val = opts.data['valid']['path_tgt']

# build the ParallelCorpus
corpus = ParallelCorpus("corpus", src_train, tgt_train)
valid = ParallelCorpus("valid", src_val, tgt_val)

In [26]:
import sentencepiece

# build the training iterator
train_iter = DynamicDatasetIter(
    corpora={"corpus": corpus},
    corpora_info={"corpus": {"weight": 1}},
    transforms={sentencepiece},
    fields=vocab_fields,
    is_train=True,
    batch_type="sents",
    batch_size=8,
    batch_size_multiple=1,
    data_type="text")

In [27]:
# make sure the iteration happens on GPU 0 (-1 for CPU, N for GPU N)
train_iter = iter(IterOnDevice(train_iter, 0))

In [28]:
# build the validation iterator
valid_iter = DynamicDatasetIter(
    corpora={"valid": valid},
    corpora_info={"valid": {"weight": 1}},
    transforms={},
    fields=vocab_fields,
    is_train=False,
    batch_type="tokens",
    batch_size=8*16,
    batch_size_multiple=1,
    data_type="text")

valid_iter = IterOnDevice(valid_iter, 0)

In [36]:
report_manager = onmt.utils.ReportMgr(
    report_every=1, start_time=None, tensorboard_writer=None)

trainer = onmt.Trainer(model=model,
                       train_loss=loss,
                       valid_loss=loss,
                       optim=optim,
                       report_manager=report_manager,
                       dropout=[0.1])

trainer.train(train_iter=train_iter,
              train_steps=30,
              valid_iter=valid_iter,
              valid_steps=5)

[2021-04-06 11:04:03,829 INFO] Start training loop and validate every 5 steps...
[2021-04-06 11:04:03,873 INFO] Step 23/   30; acc:  10.00; ppl: 218.68; xent: 5.39; lr: 1.00000; 1904/1904 tok/s;      0 sec
[2021-04-06 11:04:03,904 INFO] Step 24/   30; acc:  50.00; ppl: 89.15; xent: 4.49; lr: 1.00000; 2482/1930 tok/s;      0 sec
[2021-04-06 11:04:03,946 INFO] Step 25/   30; acc:  81.82; ppl:  2.47; xent: 0.90; lr: 1.00000; 2800/2200 tok/s;      0 sec
[2021-04-06 11:04:03,949 INFO] Loading ParallelCorpus(E://Data/hu-nmt/combined-en-hu/hunglish2-tiny-valid.en, E://Data/hu-nmt/combined-en-hu/hunglish2-tiny-valid.hu, align=None)...
[2021-04-06 11:04:04,248 INFO] Validation perplexity: 5.25635
[2021-04-06 11:04:04,249 INFO] Validation accuracy: 78.0462
[2021-04-06 11:04:04,287 INFO] Step 26/   30; acc:  76.25; ppl:  6.90; xent: 1.93; lr: 1.00000; 260/236 tok/s;      0 sec
[2021-04-06 11:04:04,331 INFO] Step 27/   30; acc:  18.18; ppl: 19.91; xent: 2.99; lr: 1.00000; 1523/2095 tok/s;      0 s

<onmt.utils.statistics.Statistics at 0x23c2afdbca0>

In [47]:
opts.data['valid']['path_src']

'E://Data/hu-nmt/combined-en-hu/hunglish2-tiny-valid.en'

In [70]:
??onmt.encoders.RNNEncoder

In [49]:
from transformers import AutoModel


base = AutoModel.from_pretrained('bert-base-cased')

In [72]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

[2021-04-06 16:42:54,232 INFO] Lock 2464992055056 acquired on C:\Users\gbenc/.cache\huggingface\transformers\6508e60ab3c1200bffa26c95f4b58ac6b6d95fba4db1f195f632fa3cd7bc64cc.437aa611e89f6fc6675a049d2b5545390adbc617e7d655286421c191d2be2791.lock


Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

[2021-04-06 16:42:54,970 INFO] Lock 2464992055056 released on C:\Users\gbenc/.cache\huggingface\transformers\6508e60ab3c1200bffa26c95f4b58ac6b6d95fba4db1f195f632fa3cd7bc64cc.437aa611e89f6fc6675a049d2b5545390adbc617e7d655286421c191d2be2791.lock
[2021-04-06 16:42:55,351 INFO] Lock 2464992054240 acquired on C:\Users\gbenc/.cache\huggingface\transformers\226a307193a9f4344264cdc76a12988448a25345ba172f2c7421f3b6810fddad.3dab63143af66769bbb35e3811f75f7e16b2320e12b7935e216bd6159ce6d9a6.lock


Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

[2021-04-06 16:42:56,409 INFO] Lock 2464992054240 released on C:\Users\gbenc/.cache\huggingface\transformers\226a307193a9f4344264cdc76a12988448a25345ba172f2c7421f3b6810fddad.3dab63143af66769bbb35e3811f75f7e16b2320e12b7935e216bd6159ce6d9a6.lock
[2021-04-06 16:42:57,568 INFO] Lock 2459986215696 acquired on C:\Users\gbenc/.cache\huggingface\transformers\ec84e86ee39bfe112543192cf981deebf7e6cbe8c91b8f7f8f63c9be44366158.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f.lock


Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

[2021-04-06 16:42:57,987 INFO] Lock 2459986215696 released on C:\Users\gbenc/.cache\huggingface\transformers\ec84e86ee39bfe112543192cf981deebf7e6cbe8c91b8f7f8f63c9be44366158.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f.lock


In [73]:
??tokenizer

In [82]:
tokenizer.save_vocabulary("tokenizer")

('tokenizer\\vocab.txt',)

In [76]:
tokenizer.save_pretrained("tokenizer/")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json')

In [117]:
tokenizer.

{'Catholic': 2336,
 'Dodge': 14205,
 'refugees': 8940,
 'Yi': 14141,
 'Lands': 17854,
 'communion': 27782,
 'spoken': 4606,
 'Finland': 5776,
 'mystery': 8069,
 'competitive': 6591,
 'association': 3852,
 'Fellowship': 9508,
 'souls': 11191,
 'Source': 5313,
 'XV': 16925,
 'elder': 8110,
 'adulthood': 22777,
 'stamped': 23245,
 'rescued': 10043,
 'Levy': 16809,
 'Enforcement': 22990,
 'Lublin': 20588,
 '##ogel': 27732,
 'succeed': 9381,
 'Confederation': 13052,
 'omitted': 17852,
 'nut': 22664,
 'nutrients': 22667,
 'launching': 12611,
 'vintage': 17787,
 'box': 2884,
 'flicked': 12988,
 'rejoined': 14944,
 '##ruba': 24325,
 '##ía': 7171,
 'Mix': 7347,
 'salaries': 23343,
 'See': 3969,
 'ordering': 13649,
 'shrug': 13786,
 'wines': 16728,
 'uk': 26006,
 '##ector': 20302,
 'Chester': 8459,
 '##leaf': 21407,
 'accordion': 22827,
 'Alive': 15907,
 'examinations': 17865,
 'ể': 747,
 'grass': 5282,
 '்': 682,
 'desperately': 9600,
 'drainage': 12779,
 '90s': 18476,
 'Gorge': 26496,
 'gaze':

In [115]:
tokenizer.encode("Catholic Yi")

[101, 2336, 14141, 102]

In [93]:
import pickle

with open("sp_models/bpe_en.model", 'rb') as f:
    tokenizer_model = pickle.load(f)

UnpicklingError: invalid load key, '\x0a'.

In [99]:
import sentencepiece as spm

processor = spm.SentencePieceProcessor()

In [104]:
processor.load("sp_models/bpe_en.model")

True

In [112]:
processor.load_vocabulary("sp_models/bpe_en.vocab", threshold = 100)

True

In [113]:
processor.encode_as_pieces("I am really sexy")

['▁',
 'I',
 '▁',
 'a',
 'm',
 '▁',
 'r',
 'e',
 'a',
 'l',
 'l',
 'y',
 '▁',
 's',
 'e',
 'x',
 'y']

In [100]:
??processor