<a href="https://colab.research.google.com/github/atnafuatx/atx/blob/master/Eng_Wola.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; later cells copy the corpora and
# trained models to a folder underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# TODO: Set your source and target languages. Keep in mind, these traditionally use language codes as found here:
# These will also become the suffix's of all vocab and corpus files used throughout
import os
source_language = "en"
target_language = "wol"
tag = "baseline" # Give a unique name to your folder - this is to ensure you don't rewrite any models you've already submitted

os.environ["src"] = source_language # Sets them in bash as well, since we often use bash scripts
os.environ["tgt"] = target_language
os.environ["tag"] = tag

# This will save it to a folder in our gdrive instead!
!mkdir -p "/content/drive/My Drive/dawromodel/$src-$tgt"
os.environ["gdrive_path"] = "/content/drive/My Drive/wolmodel/%s-%s" % (source_language, target_language)

In [None]:
# Upload the parallel-corpus CSV from the local machine; the result is kept in
# `uploaded`, which the next cell indexes by file name.
from google.colab import files
uploaded = files.upload()

In [None]:
import io

import pandas as pd

# Parse the uploaded bytes (cp1252-encoded CSV) into a DataFrame.
csv_bytes = uploaded['eng-wolp.csv']
data = pd.read_csv(io.BytesIO(csv_bytes), encoding='cp1252')

In [None]:
# Preview the first rows to check that the CSV was parsed as expected.
data.head()

In [None]:
# Display the rows that are exact repeats of an earlier row.
duplicate_mask = data.duplicated()
data[duplicate_mask]

In [None]:
# Give the two text columns the generic names used by the rest of the notebook.
data = data.rename(columns={"English":"source_sentence", "Dawuro":"target_sentence"})

# DataFrame.rename silently ignores labels that are absent, so verify the result
# here rather than failing later with an unrelated KeyError. Checking the renamed
# columns (not the original ones) keeps this cell safe to re-run.
missing_columns = {"source_sentence", "target_sentence"} - set(data.columns)
assert not missing_columns, (
    "Expected CSV columns 'English' and 'Dawuro'; still missing after rename: %s. "
    "Columns found: %s" % (sorted(missing_columns), list(data.columns))
)

In [None]:
# Remove exact duplicate rows and report the row count on either side.
rows_before = len(data)
data = data.drop_duplicates()
rows_after = len(data)
print("Length of Data before Removing duplicate: ", rows_before)
print("Length of Data after Removing duplicate: ", rows_after)

In [None]:
# Do the split between dev/test/train and create parallel corpora
num_dev_patterns = 1000
num_test_patterns = 1000
held_out = num_dev_patterns + num_test_patterns

# Work on a copy so the loaded `data` frame is not modified by this cell.
df = data.copy()

# A pair with a missing side cannot be used and would misalign the two files.
df = df.dropna(subset=["source_sentence", "target_sentence"])

# Lower case the corpora and collapse internal whitespace (including embedded
# newlines) so that every sentence occupies exactly one line in the output files.
for column in ("source_sentence", "target_sentence"):
    df[column] = (
        df[column]
        .astype(str)
        .str.lower()
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )

assert len(df) > held_out, (
    "Only %d sentence pairs available; need more than %d to leave any training data"
    % (len(df), held_out)
)

# The last `held_out` rows are reserved: first the dev set, then the test set.
devtest = df.tail(held_out)
test = devtest.tail(num_test_patterns)
dev = devtest.head(num_dev_patterns)
stripped = df.iloc[:len(df) - held_out]


def write_sentences(sentences, path):
    """Write one sentence per line to `path`, with no CSV header and no quoting.

    DataFrame.to_csv would emit the column name as the first line and wrap any
    sentence containing a comma or quote in CSV quoting, both of which would end
    up inside the training/evaluation text.
    """
    with open(path, "w", encoding="utf-8") as handle:
        for sentence in sentences:
            handle.write(sentence + "\n")


# File suffixes follow the configured language codes, matching the
# train.$src / train.$tgt names that the shell cells read.
for split_name, split_frame in (("train", stripped), ("dev", dev), ("test", test)):
    write_sentences(split_frame["source_sentence"], "%s.%s" % (split_name, source_language))
    write_sentences(split_frame["target_sentence"], "%s.%s" % (split_name, target_language))


In [None]:

# Install JoeyNMT
! git clone https://github.com/joeynmt/joeynmt.git
! cd joeynmt; pip3 install .

In [None]:
# One of the huge boosts in NMT performance was to use a different method of tokenizing. 
# Usually, NMT would tokenize by words. However, using a method called BPE gave amazing boosts to performance

# Do subword NMT
! mkdir joeynmt/data/
! mkdir joeynmt/data/enwol/
! export data_path=joeynmt/data/$src$tgt/
! subword-nmt learn-joint-bpe-and-vocab --input train.$src train.$tgt -s 4000 -o bpe.codes.4000 --write-vocabulary vocab.$src vocab.$tgt

! subword-nmt apply-bpe -c bpe.codes.4000 --vocabulary vocab.$src < train.$src > train.bpe.$src
! subword-nmt apply-bpe -c bpe.codes.4000 --vocabulary vocab.$tgt < train.$tgt > train.bpe.$tgt

! subword-nmt apply-bpe -c bpe.codes.4000 --vocabulary vocab.$src < dev.$src > dev.bpe.$src
! subword-nmt apply-bpe -c bpe.codes.4000 --vocabulary vocab.$tgt < dev.$tgt > dev.bpe.$tgt
! subword-nmt apply-bpe -c bpe.codes.4000 --vocabulary vocab.$src < test.$src > test.bpe.$src
! subword-nmt apply-bpe -c bpe.codes.4000 --vocabulary vocab.$tgt < test.$tgt > test.bpe.$tgt

# Create directory, move everyone we care about to the correct location
#! mkdir -p $data_path
! cp train.* joeynmt/data/enwol/
! cp test.* joeynmt/data/enwol/
! cp dev.* joeynmt/data/enwol/
! cp bpe.codes.4000 $data_path
! ls $data_path

# Create that vocab using build_vocab
! sudo chmod 777 joeynmt/scripts/build_vocab.py
! joeynmt/scripts/build_vocab.py joeynmt/data/$src$tgt/train.bpe.$src joeynmt/data/$src$tgt/train.bpe.$tgt --output_path joeynmt/data/$src$tgt/vocab.txt

# Some output
! echo "BPE Wolaita Sentences"
! tail -n 5 test.bpe.$tgt
! echo "Combined BPE Vocab"
! tail -n 10 joeynmt/data/enwol/vocab.txt


In [None]:

# Also move everything we care about to a mounted location in google drive (relevant if running in colab) at gdrive_path
! cp train.* "$gdrive_path"
! cp test.* "$gdrive_path"
! cp dev.* "$gdrive_path"
! cp bpe.codes.4000 "$gdrive_path"
! ls "$gdrive_path"

In [None]:
# Build the YAML configuration for the JoeyNMT run and save it under joeynmt/configs/.
# The template below carries a few parameters you will most likely want to tune
# (feel free to experiment with all of them).
name = "{}{}".format(source_language, target_language)

config_template = """
name: "{name}_transformer"

data:
    src: "{source_language}"
    trg: "{target_language}"
    train: "data/{name}/train.bpe"
    dev:   "data/{name}/dev.bpe"
    test:  "data/{name}/test.bpe"
    level: "bpe"
    lowercase: False
    max_sent_length: 100
    src_vocab: "data/{name}/vocab.txt"
    trg_vocab: "data/{name}/vocab.txt"

testing:
    beam_size: 5
    alpha: 1.0

training:
    #load_model: "models/{name}_transformer/12000.ckpt" # if given, load a pre-trained model from this checkpoint
    random_seed: 42
    optimizer: "adam"
    normalization: "tokens"
    adam_betas: [0.9, 0.999] 
    scheduling: "plateau"            # Try switching from plateau to Noam scheduling
    learning_rate_factor: 0.5       # factor for Noam scheduler (used with Transformer)
    learning_rate_warmup: 1000      # warmup steps for Noam scheduler (used with Transformer)
    patience: 5   #8
    decrease_factor: 0.7
    loss: "crossentropy"
    learning_rate: 0.0003
    learning_rate_min: 0.00000001
    weight_decay: 0.0
    label_smoothing: 0.1
    batch_size: 4096
    batch_type: "token"
    eval_batch_size: 3600
    eval_batch_type: "token"
    batch_multiplier: 1
    early_stopping_metric: "ppl"
    epochs: 10 #14  TODO: Decrease for when playing around and checking of working. Around 30 is sufficient to check if its working at all
    validation_freq: 100 #400 Decrease this for testing
    logging_freq: 100
    eval_metric: "bleu"
    model_dir: "models/{name}_transformer"
    overwrite: True
    shuffle: True
    use_cuda: True
    max_output_length: 100
    print_valid_sents: [0, 1, 2, 3]
    keep_last_ckpts: 3

model:
    initializer: "xavier"
    bias_initializer: "zeros"
    init_gain: 1.0
    embed_initializer: "xavier"
    embed_init_gain: 1.0
    tied_embeddings: True
    tied_softmax: True
    encoder:
        type: "transformer"
        num_layers: 6
        num_heads: 4 #TODO: Increase to 8 for larger data.
        embeddings:
            embedding_dim: 256 # TODO: Increase to 512 for larger data.
            scale: True
            dropout: 0.
        # typically ff_size = 4 x hidden_size
        hidden_size: 256   # TODO: Increase to 512 for larger data.
        ff_size: 1024  # TODO: Increase to 2048 for larger data.
        dropout: 0.3
    decoder:
        type: "transformer"
        num_layers: 6
        num_heads: 4 #8 TODO: Increase to 8 for larger data.
        embeddings:
            embedding_dim: 256 #512
            scale: True
            dropout: 0.
        # typically ff_size = 4 x hidden_size
        hidden_size: 256 #512
        ff_size: 1024  #2048
        dropout: 0.3
"""

# Fill the language-pair placeholders into the template.
config = config_template.format(
    name=name,
    source_language=source_language,
    target_language=target_language,
)

# Persist the rendered config where the training cell looks for it.
config_path = "joeynmt/configs/transformer_{name}.yaml".format(name=name)
with open(config_path, 'w') as config_file:
    config_file.write(config)

In [None]:
!cd joeynmt; python3 -m joeynmt train configs/transformer_$src$tgt.yaml

In [None]:
! cat joeynmt/models/enom_transformer/validations.txt

In [None]:
# Copy the created models from the notebook storage to google drive for persistant storage 
!mkdir "$gdrive_path/models/"
!cp -r joeynmt/models/* "$gdrive_path/models/${src}${tgt}_transformer/"

In [None]:
# Print the validation log from the model copy stored under gdrive_path.
! cat "$gdrive_path/models/${src}${tgt}_transformer/validations.txt"

In [None]:
! cd joeynmt; python3 -m joeynmt test models/enom_transformer/config.yaml