In [1]:
import yaml
import torch
import torch.nn as nn
from argparse import Namespace
from collections import defaultdict, Counter
import onmt
from onmt.inputters.inputter import _load_vocab, _build_fields_vocab, get_fields, IterOnDevice
from onmt.inputters.corpus import ParallelCorpus
from onmt.inputters.dynamic_iterator import DynamicDatasetIter
from onmt.translate import GNMTGlobalScorer, Translator, TranslationBuilder
from onmt.utils.misc import set_random_seed

In [2]:
from onmt.utils.logging import init_logger, logger
init_logger()

<RootLogger root (INFO)>

In [3]:
is_cuda = torch.cuda.is_available()
set_random_seed(1111, is_cuda)

In [17]:
yaml_config = """
# src_vocab_size: 30000
# tgt_vocab_size: 30000

save_data: run/samples
src_vocab: vocabs/vocab.en
tgt_vocab: vocabs/vocab.hu

# Corpus opts:
data:
    hunglish:
        path_src: /home1/hu-nmt/hu-nmt/data/ftp.mokk.bme.hu/Hunglish2/combined-en-hu/hunglish2-short-no-qoutes-train.en
        path_tgt: /home1/hu-nmt/hu-nmt/data/ftp.mokk.bme.hu/Hunglish2/combined-en-hu/hunglish2-short-no-qoutes-train.hu
        transforms: [sentencepiece]
        weight: 1
    valid:
        path_src: /home1/hu-nmt/hu-nmt/data/ftp.mokk.bme.hu/Hunglish2/combined-en-hu/hunglish2-short-no-qoutes-valid.en
        path_tgt: /home1/hu-nmt/hu-nmt/data/ftp.mokk.bme.hu/Hunglish2/combined-en-hu/hunglish2-short-no-qoutes-valid.hu
        transforms: [sentencepiece]

#### Subword
src_subword_model: /home1/hu-nmt/hu-nmt/opennmt/experiments-en-hu/sp_models/bpe_en.model
tgt_subword_model: /home1/hu-nmt/hu-nmt/opennmt/experiments-en-hu/sp_models/bpe_hu.model
src_subword_nbest: 1
src_subword_alpha: 0.0
tgt_subword_nbest: 1
tgt_subword_alpha: 0.0

src_seq_length: 16  # maximum source sequence length
tgt_seq_length: 16  # maximum target sequence length

# TRAIN

# Train on a single GPU
world_size: 1
gpu_ranks: [0]

# Batching
batch_size: 96
#queue_size: 16
#accum_count: [3]

# General opts
save_model: run/model_no_qoutes
keep_checkpoint: 10
save_checkpoint_steps: 10000
average_decay: 0.0005
seed: 1234
report_every: 100
train_steps: 400000
valid_steps: 10000 
single_pass: False
early_stopping: 5 
early_stopping_criteria: ppl

# Optimization
model_dtype: "fp16"
optim: "adam"
learning_rate: 2.0
warmup_steps: 8000
decay_method: "noam"
adam_beta2: 0.998
max_grad_norm: 0
label_smoothing: 0.1
param_init: 0
param_init_glorot: true
normalization: "tokens"

# Model
encoder_type: transformer
decoder_type: transformer
enc_layers: 2
dec_layers: 2
heads: 8
rnn_size: 512
word_vec_size: 512
transformer_ff: 2048
dropout_steps: [0]
dropout: [0.1]
attention_dropout: [0.1]
#share_decoder_embeddings: true

# Logging
log_file: run/logs_no_qoutes
"""
config = yaml.safe_load(yaml_config)
with open("config.yaml", "w") as f:
    f.write(yaml_config)

In [18]:
from onmt.utils.parse import ArgumentParser
parser = ArgumentParser(description='build_vocab.py')

In [19]:
from onmt.opts import dynamic_prepare_opts
dynamic_prepare_opts(parser, build_vocab_only=True) #build_vocab_only=True

In [20]:
base_args = (["-config", "config.yaml", "-n_sample", "100000"])
opts, unknown = parser.parse_known_args(base_args)

In [21]:
opts



In [22]:
from onmt.bin.build_vocab import build_vocab_main
build_vocab_main(opts)

[2021-04-03 18:27:34,583 INFO] Parsed 2 corpora from -data.
[2021-04-03 18:27:34,631 INFO] Counter vocab from 100000 samples.
[2021-04-03 18:27:34,633 INFO] Build vocab on 100000 transformed examples/corpus.
[2021-04-03 18:27:34,703 INFO] hunglish's transforms: TransformPipe(SentencePieceTransform(share_vocab=False, src_subword_model=/home1/hu-nmt/hu-nmt/opennmt/experiments-en-hu/sp_models/bpe_en.model, tgt_subword_model=/home1/hu-nmt/hu-nmt/opennmt/experiments-en-hu/sp_models/bpe_hu.model, src_subword_alpha=0.0, tgt_subword_alpha=0.0, src_subword_vocab=, tgt_subword_vocab=, src_vocab_threshold=0, tgt_vocab_threshold=0, src_subword_nbest=1, tgt_subword_nbest=1))
[2021-04-03 18:27:34,709 INFO] Loading ParallelCorpus(/home1/hu-nmt/hu-nmt/data/ftp.mokk.bme.hu/Hunglish2/combined-en-hu/hunglish2-short-no-qoutes-train.en, /home1/hu-nmt/hu-nmt/data/ftp.mokk.bme.hu/Hunglish2/combined-en-hu/hunglish2-short-no-qoutes-train.hu, align=None)...
[2021-04-03 18:27:44,299 INFO] Counters src:22550
[202

OSError: path vocabs/vocab.en exists, stop.

In [23]:
src_vocab_path = opts.src_vocab
tgt_vocab_path = opts.tgt_vocab

In [24]:
counters = defaultdict(Counter)
# load source vocab
_src_vocab, _src_vocab_size = _load_vocab(
    src_vocab_path,
    'src',
    counters)
# load target vocab
_tgt_vocab, _tgt_vocab_size = _load_vocab(
    tgt_vocab_path,
    'tgt',
    counters)

[2021-04-03 18:27:47,667 INFO] Loading src vocabulary from vocabs/vocab.en
[2021-04-03 18:27:47,780 INFO] Loaded src vocab has 22550 tokens.
[2021-04-03 18:27:47,803 INFO] Loading tgt vocabulary from vocabs/vocab.hu
[2021-04-03 18:27:47,868 INFO] Loaded tgt vocab has 25364 tokens.


In [25]:
# initialize fields
src_nfeats, tgt_nfeats = 0, 0 # do not support word features for now
fields = get_fields(
    'text', src_nfeats, tgt_nfeats)

In [26]:
fields

{'src': <onmt.inputters.text_dataset.TextMultiField at 0x7fa323c7ee50>,
 'tgt': <onmt.inputters.text_dataset.TextMultiField at 0x7fa323c2aa00>,
 'indices': <torchtext.data.field.Field at 0x7fa323c2ab80>}

In [27]:
# build fields vocab
share_vocab = False
vocab_size_multiple = 1
src_vocab_size = 30000
tgt_vocab_size = 30000
src_words_min_frequency = 1
tgt_words_min_frequency = 1
vocab_fields = _build_fields_vocab(
    fields, counters, 'text', share_vocab,
    vocab_size_multiple,
    src_vocab_size, src_words_min_frequency,
    tgt_vocab_size, tgt_words_min_frequency)

[2021-04-03 18:27:50,861 INFO]  * tgt vocab size: 25368.
[2021-04-03 18:27:50,898 INFO]  * src vocab size: 22552.


In [28]:
src_text_field = vocab_fields["src"].base_field
src_vocab = src_text_field.vocab
src_padding = src_vocab.stoi[src_text_field.pad_token]

tgt_text_field = vocab_fields['tgt'].base_field
tgt_vocab = tgt_text_field.vocab
tgt_padding = tgt_vocab.stoi[tgt_text_field.pad_token]

In [29]:
emb_size = 100
rnn_size = 500
# Specify the core model.

encoder_embeddings = onmt.modules.Embeddings(emb_size, len(src_vocab),
                                             word_padding_idx=src_padding)

encoder = onmt.encoders.RNNEncoder(hidden_size=rnn_size, num_layers=1,
                                   rnn_type="LSTM", bidirectional=True,
                                   embeddings=encoder_embeddings)

decoder_embeddings = onmt.modules.Embeddings(emb_size, len(tgt_vocab),
                                             word_padding_idx=tgt_padding)
decoder = onmt.decoders.decoder.InputFeedRNNDecoder(
    hidden_size=rnn_size, num_layers=1, bidirectional_encoder=True, 
    rnn_type="LSTM", embeddings=decoder_embeddings)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = onmt.models.model.NMTModel(encoder, decoder)
model.to(device)

# Specify the tgt word generator and loss computation module
model.generator = nn.Sequential(
    nn.Linear(rnn_size, len(tgt_vocab)),
    nn.LogSoftmax(dim=-1)).to(device)

loss = onmt.utils.loss.NMTLossCompute(
    criterion=nn.NLLLoss(ignore_index=tgt_padding, reduction="sum"),
    generator=model.generator)

In [30]:
lr = 1
torch_optimizer = torch.optim.SGD(model.parameters(), lr=lr)
optim = onmt.utils.optimizers.Optimizer(
    torch_optimizer, learning_rate=lr, max_grad_norm=2)

In [31]:
opts.data['hunglish']['path_src']

'/home1/hu-nmt/hu-nmt/data/ftp.mokk.bme.hu/Hunglish2/combined-en-hu/hunglish2-short-no-qoutes-train.en'

In [32]:
src_train = opts.data['hunglish']['path_src']
tgt_train =opts.data['hunglish']['path_tgt']
src_val = opts.data['valid']['path_src']
tgt_val = opts.data['valid']['path_tgt']

# build the ParallelCorpus
corpus = ParallelCorpus("corpus", src_train, tgt_train)
valid = ParallelCorpus("valid", src_val, tgt_val)

In [33]:
import sentencepiece

# build the training iterator
train_iter = DynamicDatasetIter(
    corpora={"corpus": corpus},
    corpora_info={"corpus": {"weight": 1}},
    transforms={sentencepiece},
    fields=vocab_fields,
    is_train=True,
    batch_type="sents",
    batch_size=8,
    batch_size_multiple=1,
    data_type="text")

In [34]:
# make sure the iteration happens on GPU 0 (-1 for CPU, N for GPU N)
train_iter = iter(IterOnDevice(train_iter, -1))

In [35]:
# build the validation iterator
valid_iter = DynamicDatasetIter(
    corpora={"valid": valid},
    corpora_info={"valid": {"weight": 1}},
    transforms={},
    fields=vocab_fields,
    is_train=False,
    batch_type="tokens",
    batch_size=8*16,
    batch_size_multiple=1,
    data_type="text")

In [36]:
valid_iter = IterOnDevice(valid_iter, -1)

In [None]:
report_manager = onmt.utils.ReportMgr(
    report_every=10, start_time=None, tensorboard_writer=None)

trainer = onmt.Trainer(model=model,
                       train_loss=loss,
                       valid_loss=loss,
                       optim=optim,
                       report_manager=report_manager,
                       dropout=[0.1])

trainer.train(train_iter=train_iter,
              train_steps=100,
              valid_iter=valid_iter,
              valid_steps=50)

[2021-04-03 18:28:01,021 INFO] Start training loop and validate every 50 steps...
[2021-04-03 18:28:01,023 INFO] corpus's transforms: TransformPipe()
[2021-04-03 18:28:01,025 INFO] Loading ParallelCorpus(/home1/hu-nmt/hu-nmt/data/ftp.mokk.bme.hu/Hunglish2/combined-en-hu/hunglish2-short-no-qoutes-train.en, /home1/hu-nmt/hu-nmt/data/ftp.mokk.bme.hu/Hunglish2/combined-en-hu/hunglish2-short-no-qoutes-train.hu, align=None)...


# Translate