In [None]:
# Connect your Google Drive so that you download config files and store models
# Change to your home directory on Google Drive
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive

In [None]:
# Download sample training, test, eval & config files needed for training model
!wget https://github.com/adaptNMT/mtdev/raw/main/transformer.zip

In [None]:
# Unpack the downloaded archive into the current directory to create the
# directory structure used below (later cells read from transformer/data).
# -o makes unzip overwrite existing files without asking, so the cell does not
# stall on a prompt when it is re-run.
!unzip -o transformer.zip

In [None]:
# Display GPU details provided by Google
gpu_info = !nvidia-smi
print(gpu_info)
# Check if version of python >= 3.8
!python --version
# Check if Pytorch > 2.0 is installed
!apt list | grep torch

In [None]:
# Install the chosen MT engine required for translation (OpenNMT used here)
!pip install OpenNMT-py

In [None]:
# We have to build SentencePiece in order to use the command line instructions
%cd /content/drive/MyDrive
!apt-get install cmake build-essential pkg-config libgoogle-perftools-dev
!git clone https://github.com/google/sentencepiece.git
%cd sentencepiece
%mkdir build
%cd build
!cmake ..
!make -j $(nproc)
!make install
! ldconfig -v
%cd ../../

In [None]:
# Combine the source and target training data for training SentencePiece model
%cd transformer/data
!cat src-train.txt tgt-train.txt> train.txt

/content/drive/MyDrive/mtdev/data


In [None]:
# Create the SentencePiece model with a vocab of 16k
!spm_train --input='train.txt' --model_prefix=spm \
      --vocab_size=16000 --character_coverage=1.0 --model_type=bpe

In [None]:
# Build the vocabulary using the hyperparameters set in transformer.yaml
%cd ../
!onmt_build_vocab -config data/transformer.yaml -n_sample=-1

In [None]:
# Train the model using the hyperparameters stored in transformer.yaml.
# The config path is relative, so this cell must run from the directory that
# contains data/ (the previous cell changes into it).
!onmt_train -config data/transformer.yaml

**TRANSLATION**

In [None]:
# OPTIONAL CELL:
# Download a sample model if you haven't completed the previous training step
!pip install -q gdown
import gdown
id = "1ZSsrxjNTfxwhD3LatwdFa1uBYRxE7XUL&export=download&confirm=t"
output="models/sample_transformer.pt"
gdown.download(id=id, output=output, quiet=False)

In [None]:
# Using the SentencePiece model, encode the source and target files
%cd /content/drive/MyDrive/transformer/data
!spm_encode --model=spm.model --output_format=piece \
          < src-test.txt > src-test.txt.sp
!spm_encode --model=spm.model --output_format=piece \
          < tgt-test.txt > tgt-test.txt.sp

In [None]:
# Using the OpenNMT command, translate the English Test file into Irish.
# The SentencePiece-encoded source file is passed via --src, the encoded
# reference via --tgt, and the predicted pieces are written to data/pred.sp.
# NOTE(review): the model path is models/model_step_10000.pt; presumably this is
# either a checkpoint written by the training cell or a model shipped in
# transformer.zip -- confirm, since the optional download cell saves its model
# as models/sample_transformer.pt instead.
%cd /content/drive/MyDrive/transformer/
!onmt_translate --model models/model_step_10000.pt \
          --src data/src-test.txt.sp --tgt data/tgt-test.txt.sp \
          --output data/pred.sp -replace_unk -verbose

**EVALUATION**

In [None]:
# Install the library which will be used for carrying out the evaluation
# The translations generated in pred.text are compared with the reference
# translations provided in tgt-test.txt
!pip3 install sacrebleu[ja]
%cd data
!spm_decode -model=spm.model \
-input_format=piece < pred.sp > pred.txt
!echo "++ using sacrebleu ++" | tee -a experiment_log.txt
!sacrebleu tgt-test.txt < pred.txt -m bleu --force