## Training a Transformer model with OpenNMT

This notebook was written for and run in the Google Colab environment.

In [None]:
# Mount Google Drive at /content/drive so that the code, data and model
# paths used later in this notebook (all under /content/drive) are reachable.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

In [None]:
# you may have to install these libraries
# if you use Google Colab
!pip install ConfigArgParse
!pip install torchtext==0.4

In [None]:
# Directory holding the ** ORIGINAL ** OpenNMT-py code.
# Download the code from https://github.com/OpenNMT/OpenNMT-py/ into this
# directory, or change the path to wherever you put it.
CODE_PATH = '/content/drive/My Drive/Colab Notebooks/Attention_w_OpenNMT/original'

# Scripts inside that directory that the shell cells below execute.
preprocess = f'{CODE_PATH}/preprocess.py'
train = f'{CODE_PATH}/train.py'

## Preprocess data

In [None]:
# Dataset location — you may want to change this section for your own data.
# This notebook uses the small_parallel_enja corpus:
#   https://github.com/odashi/small_parallel_enja
# Put the corpus files in DATA_PATH.
# (The stray double slash after "/content/drive" has been removed so this path
# is written the same way as the other paths in the notebook.)
DATA_PATH = '/content/drive/My Drive/data/small_parallel_enja'

# Training and validation text: the source side is .ja, the target side is .en.
src_train = f'{DATA_PATH}/train.ja'
tgt_train = f'{DATA_PATH}/train.en'
src_dev = f'{DATA_PATH}/dev.ja'
tgt_dev = f'{DATA_PATH}/dev.en'

# Existing vocabulary files for the source and target side.
src_voc = f'{DATA_PATH}/train.ja.vocab.4k'
tgt_voc = f'{DATA_PATH}/train.en.vocab.4k'

# Path prefix handed to preprocess.py as -save_data and to train.py as -data.
save_data = f'{DATA_PATH}/ja2en_preprocessed'

In [None]:
!python '$preprocess' \
        -train_src '$src_train' \
        -train_tgt '$tgt_train' \
        -valid_src '$src_dev' \
        -valid_tgt '$tgt_dev' \
        -src_vocab '$src_voc' \
        -tgt_vocab '$tgt_voc' \
        -save_data '$save_data'

[2020-11-01 02:22:31,657 INFO] Extracting features...
[2020-11-01 02:22:31,660 INFO]  * number of source features: 0.
[2020-11-01 02:22:31,660 INFO]  * number of target features: 0.
[2020-11-01 02:22:31,660 INFO] Building `Fields` object...
[2020-11-01 02:22:31,661 INFO] Building & saving training data...
[2020-11-01 02:22:31,661 INFO] Using existing vocabulary...
[2020-11-01 02:22:32,177 INFO] Building vocab from text file...
[2020-11-01 02:22:32,177 INFO] Loading src vocabulary from /content/drive//My Drive/data/small_parallel_enja/train.ja.vocab.4k
[2020-11-01 02:22:32,188 INFO] Loaded src vocab has 4096 tokens.
[2020-11-01 02:22:32,190 INFO] Loading tgt vocabulary from /content/drive//My Drive/data/small_parallel_enja/train.en.vocab.4k
[2020-11-01 02:22:32,726 INFO] Loaded tgt vocab has 4096 tokens.
[2020-11-01 02:22:32,847 INFO] Building shard 0.
[2020-11-01 02:22:34,339 INFO]  * saving 0th train data shard to /content/drive//My Drive/data/small_parallel_enja/ja2en_preprocessed.tr

## Train the Transformer (build the model)

In [None]:
# you may want to change this sectionn
MODEL_PATH = '/content/drive/My Drive/models/transformer'
!mkdir -p '$MODEL_PATH'
save_model = MODEL_PATH + '/small_parallel_ja2en_transformer'

In [None]:
!python '$train' -data '$save_data' -save_model '$save_model' \
        -layers 6 -rnn_size 512 -word_vec_size 512 -transformer_ff 2048 -heads 8  \
        -encoder_type transformer -decoder_type transformer -position_encoding \
        -train_steps 10000  -max_generator_batches 2 -dropout 0.1 \
        -batch_size 4096 -batch_type tokens -normalization tokens  -accum_count 2 \
        -optim adam -adam_beta2 0.998 -decay_method noam -warmup_steps 2000 -learning_rate 2 \
        -max_grad_norm 0 -param_init 0  -param_init_glorot \
        -label_smoothing 0.1 -valid_steps 2000 -save_checkpoint_steps 2000 \
        -world_size 1 -gpu_ranks 0

[2020-11-01 02:30:35,091 INFO]  * src vocab size = 4097
[2020-11-01 02:30:35,091 INFO]  * tgt vocab size = 4097
[2020-11-01 02:30:35,091 INFO] Building model...
[2020-11-01 02:30:45,771 INFO] NMTModel(
  (encoder): TransformerEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(4097, 512, padding_idx=1)
        )
        (pe): PositionalEncoding(
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (transformer): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiHeadedAttention(
          (linear_keys): Linear(in_features=512, out_features=512, bias=True)
          (linear_values): Linear(in_features=512, out_features=512, bias=True)
          (linear_query): Linear(in_features=512, out_features=512, bias=True)
          (softmax): Softmax(dim=-1)
          (dropout): Dropout(p=0.1, inplace=False)
          (final_linear): Linear(in_features=512, out_feature