<a href="https://colab.research.google.com/github/atnafuatx/atx/blob/master/Eng_Daw.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# TODO: Set your source and target languages. Keep in mind, these traditionally use language codes as found here:
# These will also become the suffix's of all vocab and corpus files used throughout
import os
source_language = "en"
target_language = "daw"
tag = "baseline" # Give a unique name to your folder - this is to ensure you don't rewrite any models you've already submitted

os.environ["src"] = source_language # Sets them in bash as well, since we often use bash scripts
os.environ["tgt"] = target_language
os.environ["tag"] = tag

# This will save it to a folder in our gdrive instead!
!mkdir -p "/content/drive/My Drive/dawromodel/$src-$tgt"
os.environ["gdrive_path"] = "/content/drive/My Drive/dawromodel/%s-%s" % (source_language, target_language)

In [3]:
from google.colab import files
uploaded = files.upload()

Saving eng-daw.csv to eng-daw (2).csv


In [4]:
import pandas as pd
import io
data = pd.read_csv(io.BytesIO(uploaded['eng-daw.csv']), encoding='cp1252')
# Dataset is now stored in a Pandas Dataframe

In [5]:
data.head()

Unnamed: 0,English,Dawuro
0,This is the family history of Jesus the Messia...,Hawe Daawita zariyaanne Abraahaame zare gideed...
1,Abraham was the father of Isaac. Isaac was the...,"Abraahaame, Yisaaqa yeleedda; Yisaaqi, Yayiqoo..."
2,Judah was the father of Perez and Zerah. (Thei...,Yihuday Ti7imaarippe Paareesanne Zaraahaa yele...
3,Ram was the father of Amminadab. Amminadab was...,"Raame Aminadaaba yeleedda; Aminadaabe, Na7asoo..."
4,Salmon was the father of Boaz. (His mother was...,"Selimoone, Ra7aabo geetettiyaa mishirattippe B..."


In [6]:
data[data.duplicated()]

Unnamed: 0,English,Dawuro
1575,"The whole world, earth and sky, will be destro...",Saluunne sa7ay aadhdhana; shin ta qaalay ubbak...
7460,Everyone who hears this should listen to what ...,‘Sisanaw haythay de7iyaa uray Geeshsha Ayyaana...


In [7]:
data = data.rename(columns={"English":"source_sentence", "Dawuro":"target_sentence"})

In [8]:
print("Length of Data before Removing duplicate: ",len(data))
data = data.drop_duplicates()
print("Length of Data after Removing duplicate: ",len(data))

Length of Data before Removing duplicate:  7804
Length of Data after Removing duplicate:  7802


In [9]:
# Do the split between dev/test/train and create parallel corpora
num_dev_patterns = 1000
num_test_patterns = 1000
df = data
# Lower case the corpora
df["source_sentence"] = df["source_sentence"].str.lower()
df["target_sentence"] = df["target_sentence"].str.lower()


devtest = df.tail(num_dev_patterns + num_test_patterns)
test = devtest.tail(num_test_patterns)
dev = devtest.head(num_dev_patterns)
stripped = df.drop(df.tail(num_dev_patterns + num_test_patterns).index)

stripped[["source_sentence"]].to_csv("train.en", index=False)
stripped[["target_sentence"]].to_csv("train.daw", index=False)

dev[["source_sentence"]].to_csv("dev.en", index=False)
dev[["target_sentence"]].to_csv("dev.daw", index=False)

test[["source_sentence"]].to_csv("test.en", index=False)
test[["target_sentence"]].to_csv("test.daw", index=False)


In [10]:

# Install JoeyNMT
! git clone https://github.com/joeynmt/joeynmt.git
! cd joeynmt; pip3 install .

fatal: destination path 'joeynmt' already exists and is not an empty directory.
Processing /content/joeynmt
Building wheels for collected packages: joeynmt
  Building wheel for joeynmt (setup.py) ... [?25l[?25hdone
  Created wheel for joeynmt: filename=joeynmt-1.0-cp36-none-any.whl size=80253 sha256=b4cc55bef2af283ca1b228ad41d9e3e59f954089e487e833916338ddb0c1f654
  Stored in directory: /tmp/pip-ephem-wheel-cache-n1enlju5/wheels/db/01/db/751cc9f3e7f6faec127c43644ba250a3ea7ad200594aeda70a
Successfully built joeynmt
Installing collected packages: joeynmt
  Found existing installation: joeynmt 1.0
    Uninstalling joeynmt-1.0:
      Successfully uninstalled joeynmt-1.0
Successfully installed joeynmt-1.0


In [11]:
# One of the huge boosts in NMT performance was to use a different method of tokenizing. 
# Usually, NMT would tokenize by words. However, using a method called BPE gave amazing boosts to performance

# Do subword NMT
! mkdir joeynmt/data/
! mkdir joeynmt/data/endaw/
! export data_path=joeynmt/data/$src$tgt/
! subword-nmt learn-joint-bpe-and-vocab --input train.$src train.$tgt -s 4000 -o bpe.codes.4000 --write-vocabulary vocab.$src vocab.$tgt

! subword-nmt apply-bpe -c bpe.codes.4000 --vocabulary vocab.$src < train.$src > train.bpe.$src
! subword-nmt apply-bpe -c bpe.codes.4000 --vocabulary vocab.$tgt < train.$tgt > train.bpe.$tgt

! subword-nmt apply-bpe -c bpe.codes.4000 --vocabulary vocab.$src < dev.$src > dev.bpe.$src
! subword-nmt apply-bpe -c bpe.codes.4000 --vocabulary vocab.$tgt < dev.$tgt > dev.bpe.$tgt
! subword-nmt apply-bpe -c bpe.codes.4000 --vocabulary vocab.$src < test.$src > test.bpe.$src
! subword-nmt apply-bpe -c bpe.codes.4000 --vocabulary vocab.$tgt < test.$tgt > test.bpe.$tgt

# Create directory, move everyone we care about to the correct location
#! mkdir -p $data_path
! cp train.* joeynmt/data/endaw/
! cp test.* joeynmt/data/endaw/
! cp dev.* joeynmt/data/endaw/
! cp bpe.codes.4000 $data_path
! ls $data_path

# Create that vocab using build_vocab
! sudo chmod 777 joeynmt/scripts/build_vocab.py
! joeynmt/scripts/build_vocab.py joeynmt/data/$src$tgt/train.bpe.$src joeynmt/data/$src$tgt/train.bpe.$tgt --output_path joeynmt/data/$src$tgt/vocab.txt

# Some output
! echo "BPE Dawro Sentences"
! tail -n 5 test.bpe.$tgt
! echo "Combined BPE Vocab"
! tail -n 10 joeynmt/data/endaw/vocab.txt


mkdir: cannot create directory ‘joeynmt/data/’: File exists
mkdir: cannot create directory ‘joeynmt/data/endaw/’: File exists
cp: missing destination file operand after 'bpe.codes.4000'
Try 'cp --help' for more information.
 bpe.codes.4000   drive		     sample_data    train.bpe.daw   vocab.en
 dev.bpe.daw	 'eng-daw (1).csv'   test.bpe.daw   train.bpe.en
 dev.bpe.en	 'eng-daw (2).csv'   test.bpe.en    train.daw
 dev.daw	  eng-daw.csv	     test.daw	    train.en
 dev.en		  joeynmt	     test.en	    vocab.daw
BPE Dawro Sentences
"@@ geeshsha ayyaan@@ aynne ge@@ le@@ w@@ un@@ na, “@@ haa y@@ a” yaag@@ iino. qassi hawaa sis@@ iya@@ we oon@@ inne, “@@ haa y@@ a” yaa@@ go. saa@@ mett@@ eedda asay ooninne haa yo@@ . de7@@ uwa haathaa koyyiyaa asay ooninne coo akk@@ o. "
"taani ha maxaaf@@ an xaaf@@ etteedda han@@ ana geedda qaalaa sis@@ iyaa oonanne zor@@ ay. ooninne ha qaalaa bolla itti@@ baa gujj@@ ooppe, xoossay ha maxaaf@@ an xaaf@@ etteedda bo@@ sh@@ aa a bolla gujj@@ ana. "
"ooninne maxaaf

In [12]:

# Also move everything we care about to a mounted location in google drive (relevant if running in colab) at gdrive_path
! cp train.* "$gdrive_path"
! cp test.* "$gdrive_path"
! cp dev.* "$gdrive_path"
! cp bpe.codes.4000 "$gdrive_path"
! ls "$gdrive_path"

bpe.codes.4000	dev.daw  models        test.daw  train.bpe.daw	train.en
dev.bpe.daw	dev.en	 test.bpe.daw  test.en	 train.bpe.en	train.om
dev.bpe.en	dev.om	 test.bpe.en   test.om	 train.daw


In [13]:
# This creates the config file for our JoeyNMT system. It might seem overwhelming so we've provided a couple of useful parameters you'll need to update
# (You can of course play with all the parameters if you'd like!)
name = '%s%s' % (source_language, target_language)

config = """
name: "{name}_transformer"

data:
    src: "{source_language}"
    trg: "{target_language}"
    train: "data/{name}/train.bpe"
    dev:   "data/{name}/dev.bpe"
    test:  "data/{name}/test.bpe"
    level: "bpe"
    lowercase: False
    max_sent_length: 100
    src_vocab: "data/{name}/vocab.txt"
    trg_vocab: "data/{name}/vocab.txt"

testing:
    beam_size: 5
    alpha: 1.0

training:
    #load_model: "models/{name}_transformer/12000.ckpt" # if given, load a pre-trained model from this checkpoint
    random_seed: 42
    optimizer: "adam"
    normalization: "tokens"
    adam_betas: [0.9, 0.999] 
    scheduling: "noam"            # Try switching from plateau to Noam scheduling
    learning_rate_factor: 0.5       # factor for Noam scheduler (used with Transformer)
    learning_rate_warmup: 1000      # warmup steps for Noam scheduler (used with Transformer)
    patience: 8
    decrease_factor: 0.7
    loss: "crossentropy"
    learning_rate: 0.0002
    learning_rate_min: 0.00000001
    weight_decay: 0.0
    label_smoothing: 0.1
    batch_size: 4096
    batch_type: "token"
    eval_batch_size: 3600
    eval_batch_type: "token"
    batch_multiplier: 1
    early_stopping_metric: "ppl"
    epochs: 10 #14  TODO: Decrease for when playing around and checking of working. Around 30 is sufficient to check if its working at all
    validation_freq: 400 # Decrease this for testing
    logging_freq: 100
    eval_metric: "bleu"
    model_dir: "models/{name}_transformer"
    overwrite: True
    shuffle: True
    use_cuda: True
    max_output_length: 100
    print_valid_sents: [0, 1, 2, 3]
    keep_last_ckpts: 3

model:
    initializer: "xavier"
    bias_initializer: "zeros"
    init_gain: 1.0
    embed_initializer: "xavier"
    embed_init_gain: 1.0
    tied_embeddings: True
    tied_softmax: True
    encoder:
        type: "transformer"
        num_layers: 6
        num_heads: 8
        embeddings:
            embedding_dim: 512
            scale: True
            dropout: 0.
        # typically ff_size = 4 x hidden_size
        hidden_size: 512
        ff_size: 2048
        dropout: 0.3
    decoder:
        type: "transformer"
        num_layers: 6
        num_heads: 8
        embeddings:
            embedding_dim: 512
            scale: True
            dropout: 0.
        # typically ff_size = 4 x hidden_size
        hidden_size: 512
        ff_size: 2048
        dropout: 0.3
""".format(name=name, source_language=source_language, target_language=target_language)
with open("joeynmt/configs/transformer_{name}.yaml".format(name=name),'w') as f:
    f.write(config)

In [14]:
!cd joeynmt; python3 -m joeynmt train configs/transformer_$src$tgt.yaml

2020-12-23 16:29:32,456 - INFO - root - Hello! This is Joey-NMT (version 1.0).
2020-12-23 16:29:32,459 - INFO - joeynmt.data - loading training data...
2020-12-23 16:29:32,529 - INFO - joeynmt.data - building vocabulary...
2020-12-23 16:29:32,721 - INFO - joeynmt.data - loading dev data...
2020-12-23 16:29:32,732 - INFO - joeynmt.data - loading test data...
2020-12-23 16:29:32,746 - INFO - joeynmt.data - data loaded.
2020-12-23 16:29:33.549566: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2020-12-23 16:29:34,913 - INFO - joeynmt.training - Total params: 46190080
2020-12-23 16:29:34,916 - INFO - joeynmt.helpers - cfg.name                           : endaw_transformer
2020-12-23 16:29:34,917 - INFO - joeynmt.helpers - cfg.data.src                       : en
2020-12-23 16:29:34,917 - INFO - joeynmt.helpers - cfg.data.trg                       : daw
2020-12-23 16:29:34,917 - INFO - joeynmt.helpers - cfg.data.train    

In [None]:
while True:pass

In [None]:
! cat joeynmt/models/enom_transformer/validations.txt

cat: joeynmt/models/enom_transformer/validations.txt: No such file or directory


In [None]:
# Copy the created models from the notebook storage to google drive for persistant storage 
!mkdir "$gdrive_path/models/"
!cp -r joeynmt/models/* "$gdrive_path/models/${src}${tgt}_transformer/"

In [None]:
! cat "$gdrive_path/models/${src}${tgt}_transformer/validations.txt"

Steps: 400	Loss: 145913.60938	PPL: 157.19530	bleu: 0.06422	LR: 0.00027951	*


In [None]:
! cd joeynmt; python3 -m joeynmt test models/enom_transformer/config.yaml

Traceback (most recent call last):
  File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/content/joeynmt/joeynmt/__main__.py", line 41, in <module>
    main()
  File "/content/joeynmt/joeynmt/__main__.py", line 32, in main
    output_path=args.output_path, save_attention=args.save_attention)
  File "/content/joeynmt/joeynmt/prediction.py", line 267, in test
    cfg = load_config(cfg_file)
  File "/content/joeynmt/joeynmt/helpers.py", line 176, in load_config
    with open(path, 'r') as ymlfile:
FileNotFoundError: [Errno 2] No such file or directory: 'models/enom_transformer/config.yaml'
