Notebook was run on NVIDIA Tesla P100 16GB and it works in the standard Colab environment. Installing of additional packages is done in the notebook.

The excecution of this notebook will take 10-12h.

The submission generated by this notebook will be stored in submission.csv

External data/models:
* JW300 data, which when be automatically downloaded when the notebook is run,
* fasttext model is used to detect language, and will be also downloaded,
* camembert-base from transformers package is used to augment the data

The external data is also included with the submission, so if you don't wish to download it from the source, comment out the lines where it is downloaded.

In [None]:
!pip install sentencepiece
!pip install opustools-pkg
!pip install transformers
!pip install fasttext
!pip install pyarrow
# used this specific commit to avoid incompatibilities due to the package being in development
#!pip install git+git://github.com/pytorch/fairseq@d6855baec88f99ac776962027b91d404fe917eea
# the latest pip version should also work as of now
!pip install fairseq
!pip install --upgrade numpy

import random
import numpy as np
import sentencepiece as spm
import os
from tqdm import tqdm
import random
from transformers import pipeline
from typing import Tuple, Callable
import pandas as pd
import fasttext

## Data Preprocessing

In [5]:
# paths to the original data

TRAIN_CSV_PATH="Train.csv"
TEST_CSV_PATH="Test.csv"

In [6]:
!mkdir -p preprocessing/out

In [7]:
train = pd.read_csv(TRAIN_CSV_PATH, dtype={"ID":"str", "French":"str", "Target_Language":"str", "Target":"str"})
len(train)

75487

In [8]:
# removing sentences with matching source and target
train = train[train.French != train.Target]
len(train)

75439

In [9]:
# basic preprocessing

import re
def preprocess(text):
    text = text.replace("\t", " ")
    text = re.sub(r"\n\n+", r"\n", text)
    text = re.sub(r" *\n *", r"\n", text)
    text = re.sub(r" +", " ", text)
    text = text.strip()
    
    return text

train["French"] = train["French"].map(lambda x: preprocess(x))
train["Target"] = train["Target"].map(lambda x: preprocess(x))
train = train[-(train.French.str.contains("\n") | train.Target.str.contains("\n"))]
train = train[train.French.str.len() / train.Target.str.len() < 8]
len(train)

75228

In [10]:
# the data contains dictionary style entries (multiple on one line), which will be separated into separate lines

is_dictionary_entry = lambda french, target, target_lang: not (" - " in french or " – " in french) and " - " in target and target_lang == "Fon" and all(len(x) < 30 and x[-1] != "." for x in target.split(" - "))
condition = train.apply(lambda x: is_dictionary_entry(x.French, x.Target, x.Target_Language), axis=1)
train_dictionary_words = train[condition]
train = train[-condition]
len(train)

71857

In [11]:
# saving the extracted dictionary entries

new_sents = [(row.ID, row.French, row.Target_Language, sent) for i, row in train_dictionary_words.iterrows() for sent in row.Target.split(" - ")]
        
dictionary_df = pd.DataFrame(new_sents, columns =['ID', 'French', 'Target_Language', 'Target'])
dictionary_df.to_csv("preprocessing/out/dictionary.csv")
len(dictionary_df)

9618

In [12]:
train = train[train.Target.str.len() / train.French.str.len() < 8]
len(train)

71855

In [None]:
# downloading the fasttext model
!wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin

In [16]:
# detecting incorrect languages
lid_model = fasttext.load_model("lid.176.bin") 

def detect_english(sent, model, threshold=0.85):
    pred = lid_model.predict(sent)
    return pred[0][0] == '__label__en' and pred[1][0] > 0.85

def detect_target(sent, char_threshold=4):
    ewe_chars = ("Ɖ", "ɖ", "Ɛ", "ɛ", "ɛ́" "Ƒ", "ƒ", "Ɣ", "ɣ", "Ŋ", "ŋ", "Ɔ", "ɔ", "ɔ̀",  "Ʋ", "ʋ")
    return sum(int(c in ewe_chars) for c in sent) > 4

train = train[-train["French"].map(lambda x: detect_english(x, lid_model))]
train = train[-train["French"].map(lambda x: detect_target(x))]
len(train)



71683

In [17]:
#removing duplicate lines

def get_unique_filter() -> Callable[[], Callable[[Tuple[str, str, str]], bool]]:
    unique = set()
    def unique_filter(pair: Tuple) -> bool:
        nonlocal unique
        
        if pair in unique:
            return False
        
        unique.add(pair)
        return True
    return unique_filter

f = get_unique_filter()
condition = train.apply(lambda x: f((x.French, x.Target, x.Target_Language)), axis=1)
train = train[condition]
len(train)

52307

In [18]:
train.to_csv("preprocessing/out/train.csv")

In [19]:
# split into train, dev, test
def split_df(df, test_size, dev_size, random_state=1):  
    df_test = df.sample(n=test_size, replace=False, random_state=random_state)
    df_train_dev = df.drop(df_test.index)
    
    df_dev = df_train_dev.sample(n=dev_size, replace=False, random_state=random_state)
    df_train = df_train_dev.drop(df_dev.index)
    return df_train, df_test, df_dev

In [20]:
fon = train[train.Target_Language == "Fon"]
ewe = train[train.Target_Language == "Ewe"]

fon_train, fon_test, fon_dev = split_df(fon, 400, 1000)
ewe_train, ewe_test, ewe_dev = split_df(ewe, 400, 1000)

In [21]:
fon_train.to_csv("preprocessing/out/fon_train.csv")
fon_test.to_csv("preprocessing/out/fon_test.csv")
fon_dev.to_csv("preprocessing/out/fon_dev.csv")
ewe_train.to_csv("preprocessing/out/ewe_train.csv")
ewe_test.to_csv("preprocessing/out/ewe_test.csv")
ewe_dev.to_csv("preprocessing/out/ewe_dev.csv")

In [22]:
# convert lang pair csv into two parallel files
langs = {"ewe": "ee",
        "fon": "fon",
        "fr": "fr",
        "ee":"ee"}

for target_lang in ["ewe", "fon"]:
    for split in ["train", "test", "dev"]:
        df = pd.read_csv(f"preprocessing/out/{target_lang}_{split}.csv", dtype={"ID":"str", "French":"str", "Target_Language":"str", "Target":"str"})
        tgt_lng = langs[target_lang]
        with open(f"preprocessing/out/fr-{tgt_lng}.{split}.fr", "w", encoding="utf-8") as src_out, open(f"preprocessing/out/fr-{tgt_lng}.{split}.{tgt_lng}", "w", encoding="utf-8") as tgt_out:
            for i, row in df.iterrows():
                src_out.write(row.French + "\n")
                tgt_out.write(row.Target + "\n")

In [23]:
with open(f"preprocessing/out/fr-fon.dict.fr", "w", encoding="utf-8") as src_out, open(f"preprocessing/out/fr-fon.dict.fon", "w", encoding="utf-8") as tgt_out:
    for i, row in dictionary_df.iterrows():
        src_out.write(row.French + "\n")
        tgt_out.write(row.Target + "\n")

In [24]:
def load_pairs(prefix, src_lang, tgt_lang):
    with open(f"{prefix}.{src_lang}", "r", encoding="utf8") as src, open(f"{prefix}.{tgt_lang}", "r", encoding="utf8") as tgt:
        return list(zip(src.readlines(), tgt.readlines()))

# checks that the new data does not overlap with the existing data
def add_to_train(train_prefix, dev_prefix, test_prefix, in_prefix, out_prefix, src_lang, tgt_lang, separate=False):
    train = load_pairs(train_prefix, src_lang, tgt_lang)
    present_sents = set(train)
    for split_prefix in [test_prefix, dev_prefix]:
        present_sents.update(load_pairs(split_prefix, src_lang, tgt_lang))
        
    new_data = load_pairs(in_prefix, src_lang, tgt_lang)
    
    new_train = train.copy() if not separate else []
    for entry in new_data:
        if entry not in present_sents:
            present_sents.add(entry)
            new_train.append(entry)

    print(f"kept {(len(new_train) - len(train)) if not separate else len(new_train)} pairs out of {len(new_data)}")

    with open(f"{out_prefix}.{src_lang}", "w", encoding="utf8") as src, open(f"{out_prefix}.{tgt_lang}", "w", encoding="utf8") as tgt:
        for (src_sent, tgt_sent) in new_train:
            src.write(src_sent)
            tgt.write(tgt_sent)
  

!mkdir -p preprocessing/clean_dict/out

# adding the previously extracted dictionary entries to the data

add_to_train(
    "preprocessing/out/fr-fon.train",
    "preprocessing/out/fr-fon.test",
    "preprocessing/out/fr-fon.dev",
    "preprocessing/out/fr-fon.dict",
    "preprocessing/clean_dict/out/fr-fon.train",
    "fr",
    "fon"
)

add_to_train(
    "preprocessing/out/fr-fon.train",
    "preprocessing/out/fr-fon.test",
    "preprocessing/out/fr-fon.dev",
    "preprocessing/out/fr-fon.dict",
    "preprocessing/clean_dict/out/fr-fon.train.dict",
    "fr",
    "fon",
    separate=True
)

kept 9600 pairs out of 9618
kept 9600 pairs out of 9618


## Downloading and cleaning JW300 data

In [None]:
# This downloads the JW300 data
!yes | opus_read -d JW300 -s fr -t fon -wm moses -w fr-fon.jw300.fr fr-fon.jw300.fon
!yes | opus_read -d JW300 -s fr -t ee -wm moses -w fr-ee.jw300.fr fr-ee.jw300.ee

In [26]:
!mkdir -p  jw300/out

# removing duplicates with the existing data
add_to_train(
    "preprocessing/out/fr-fon.train",
    "preprocessing/out/fr-fon.test",
    "preprocessing/out/fr-fon.dev",
    "fr-fon.jw300",
    "jw300/out/fr-fon.train",
    "fr",
    "fon",
    separate=True
)

kept 32174 pairs out of 33362


In [27]:
add_to_train(
    "preprocessing/out/fr-ee.train",
    "preprocessing/out/fr-ee.test",
    "preprocessing/out/fr-ee.dev",
    "fr-ee.jw300",
    "jw300/out/fr-ee.train",
    "fr",
    "ee",
    separate=True
)

kept 602775 pairs out of 634964


## Augmenting ee data

In [28]:
data = pd.read_csv("preprocessing/out/ewe_train.csv")

In [None]:
random.seed(42)
nlp = pipeline("fill-mask", model="camembert-base", device=0)

In [30]:
# replacing words in sentence by new words generated by camembert

def augment_sentence(sent, n):
    french = sent.split(" ")
    replace_idxs = random.sample(range(len(french)), n)
    
    for replace_idx in replace_idxs:
        original = french[replace_idx]
        french[replace_idx] = nlp.tokenizer.mask_token
        out = nlp(" ".join(french))

        draw = random.choices(out, cum_weights=list(map(lambda x: x["score"], out)), k=1)
        french[replace_idx] = draw[0]["token_str"]
    
    return " ".join(french)

In [31]:
augmented_sents = []

for i, row in tqdm(data.iterrows()):
    sent = row.French
    augmented_sent = augment_sentence(sent, n=max(1, len(sent.split(" ")) // 3))
    augmented_sents.append((augmented_sent, row.Target))

20580it [21:56, 15.63it/s]


In [32]:
!mkdir -p augmented_ee/out

with open("augmented_ee/out/fr-ee.train.fr", "w", encoding="utf8") as src, open("augmented_ee/out/fr-ee.train.ee", "w", encoding="utf8") as tgt:
    for line in augmented_sents:
        src.write(line[0] + "\n")
        tgt.write(line[1] + "\n")

In [58]:
!mkdir -p augmented_ee/out/cleaned

# removing duplicates
add_to_train(
    "preprocessing/out/fr-ee.train",
    "preprocessing/out/fr-ee.test",
    "preprocessing/out/fr-ee.dev",
    "augmented_ee/out/fr-ee.train",
    "augmented_ee/out/cleaned/fr-ee.train",
    "fr",
    "ee",
)

kept 18691 pairs out of 20580


## Training and applying SentencePiece BPE model

In [34]:
!mkdir -p sp
!touch sp/combined_data
!cat preprocessing/out/fr-fon.train.fon >> sp/combined_data
!cat preprocessing/out/fr-fon.train.fr >> sp/combined_data
!cat preprocessing/out/fr-ee.train.ee >> sp/combined_data
!cat preprocessing/out/fr-ee.train.fr >> sp/combined_data

In [35]:
def segment_file(in_file, out_file, model):
    with open(in_file, "r", encoding="utf-8") as inf, open(out_file, "w", encoding="utf-8") as outf:
        for line in inf:
            outf.write(' '.join(model.encode(line.rstrip(), out_type=str)) + "\n")

def segment_bitext(in_prefix, out_prefix, src_lang, tgt_lang, src_model, tgt_model):
    segment_file(f"{in_prefix}.{src_lang}", f"{out_prefix}.{src_lang}", src_model)
    segment_file(f"{in_prefix}.{tgt_lang}", f"{out_prefix}.{tgt_lang}", tgt_model)
    
def segment_dataset(in_prefix, out_prefix, src_lang, tgt_lang, src_model, tgt_model):
    segment_bitext(f"{in_prefix}.train", f"{out_prefix}.train", src_lang, tgt_lang, src_model, tgt_model)
    segment_bitext(f"{in_prefix}.dev", f"{out_prefix}.dev", src_lang, tgt_lang, src_model, tgt_model)
    segment_bitext(f"{in_prefix}.test", f"{out_prefix}.test", src_lang, tgt_lang, src_model, tgt_model)

In [36]:
spm.SentencePieceTrainer.train(input="sp/combined_data", model_prefix="sp/combined_model_8000", vocab_size=8000, character_coverage=1.0, model_type="bpe")

In [37]:
!tail -n +4 sp/combined_model_8000.vocab | cut -f1 | sed 's/$/ 100/g' > sp/fs.combined_8000.vocab

In [61]:
!mkdir -p sp/out
target_dir = "sp/out"

sp = spm.SentencePieceProcessor(model_file="sp/combined_model_8000.model")
segment_dataset("preprocessing/out/fr-fon", f"{target_dir}/fr-fon", "fr", "fon", sp, sp)
segment_dataset("preprocessing/out/fr-ee", f"{target_dir}/fr-ee", "fr", "ee", sp, sp)
segment_bitext("augmented_ee/out/cleaned/fr-ee.train", f"{target_dir}/fr-ee", "fr", "ee", sp, sp)

segment_bitext("preprocessing/clean_dict/out/fr-fon.train", f"{target_dir}/fr-fon.train.dict", "fr", "fon", sp, sp)
segment_bitext("preprocessing/clean_dict/out/fr-fon.train.dict", f"{target_dir}/fr-fon.dict", "fr", "fon", sp, sp)

segment_bitext("jw300/out/fr-ee.train", f"{target_dir}/fr-ee.jw300", "fr", "ee", sp, sp)
segment_bitext("jw300/out/fr-fon.train", f"{target_dir}/fr-fon.jw300", "fr", "fon", sp, sp)

## Pretraining with JW300 data

In [39]:
path=f"jw300/out/fr-ee.train"

path_2=f"preprocessing/out/fr-ee.train"

In [40]:
lines = None

# adding jw300 and zindi data together, while oversampling zindi

with open(f"{path}.ee", "r", encoding="utf8") as tgt_file, open(f"{path}.fr", "r", encoding="utf8") as src_file:
    lines = list(zip(src_file.readlines(), tgt_file.readlines()))
    
with open(f"{path_2}.ee", "r", encoding="utf8") as tgt_file, open(f"{path_2}.fr", "r", encoding="utf8") as src_file:
    lines.extend(list(zip(src_file.readlines(), tgt_file.readlines()))*4)
    
    
random.Random(42).shuffle(lines)

In [41]:
train_lines = lines[1000:]
dev_lines = lines[:1000]

In [42]:
for split, data in [("train", train_lines), ("dev", dev_lines)]:
    with open(f"{split}.ee", "w", encoding="utf8") as tgt_file, open(f"{split}.fr", "w", encoding="utf8") as src_file:
        for ds, dt in data:
            src_file.write(ds),
            tgt_file.write(dt)

In [43]:
fs_vocab_path = "sp/fs.combined_8000.vocab"
sp_model_path = "sp/combined_model_8000.model"

In [44]:
out_dir = "pretraining/sp"
!mkdir -p $out_dir
src_lang = "fr"
tgt_lang = "ee"

sp = spm.SentencePieceProcessor(model_file=sp_model_path)

segment_bitext("train", f"{out_dir}/train", src_lang, tgt_lang, sp, sp)
segment_bitext("dev", f"{out_dir}/dev", src_lang, tgt_lang, sp, sp)

In [None]:
# binarizing data
!mkdir -p pretraining/bin
    
!(fairseq-preprocess --source-lang fr --target-lang ee \
    --trainpref $out_dir/train \
    --validpref $out_dir/dev \
    --tgtdict $fs_vocab_path \
    --srcdict $fs_vocab_path \
    --destdir pretraining/bin)

In [None]:
# training
model_name="jw300_pretraining_8k_ee"
!mkdir -p checkpoints/$model_name

!(fairseq-train pretraining/bin \
  --tensorboard-logdir tensorboard_logs/$model_name \
  --save-dir checkpoints/$model_name \
  --max-epoch 49 \
  --task translation \
  --arch transformer \
  --encoder-ffn-embed-dim 1024 \
  --decoder-ffn-embed-dim 1024 \
  --encoder-embed-dim 256 \
  --decoder-embed-dim 256 \
  --encoder-attention-heads 4 \
  --decoder-attention-heads 4 \
  --encoder-layers 6 \
  --decoder-layers 6 \
  --num-workers 4 \
  --data-buffer-size 10 \
  --validate-interval 1 \
  --save-interval 1 \
  --max-tokens 14000 \
  --update-freq 1 \
  --dropout 0.2 --attention-dropout 0.1 --activation-dropout 0.1 \
  --share-all-embeddings \
  --lr 0.0008 --lr-scheduler inverse_sqrt \
  --optimizer adam --adam-betas '(0.9, 0.98)' \
  --warmup-updates 2000 --warmup-init-lr '1e-07' \
  --label-smoothing 0.1 \
  --criterion label_smoothed_cross_entropy \
  --no-epoch-checkpoints \
  --keep-best-checkpoints 5 \
  --patience 5 \
  --log-format=tqdm --log-interval=100)

## Binarizing data for training

In [None]:
!mkdir -p bin
!mkdir -p bin/fr-ee
!mkdir -p bin/fr-fon

!(fairseq-preprocess --source-lang fr --target-lang ee \
        --trainpref sp/out/fr-ee.train \
        --validpref sp/out/fr-ee.dev \
        --testpref sp/out/fr-ee.test \
        --tgtdict sp/fs.combined_8000.vocab \
        --srcdict sp/fs.combined_8000.vocab \
        --destdir bin/fr-ee)


!(fairseq-preprocess --source-lang fr --target-lang fon \
    --trainpref sp/out/fr-fon.train.dict \
    --validpref sp/out/fr-fon.dev \
    --testpref sp/out/fr-fon.test \
    --tgtdict sp/fs.combined_8000.vocab \
    --srcdict sp/fs.combined_8000.vocab \
    --destdir bin/fr-fon)

## Training fon model

In [None]:
!(fairseq-train bin/fr-fon \
  --tensorboard-logdir tensorboard_logs/fr-fon \
  --save-dir checkpoints/fr-fon \
  --max-epoch 200 \
  --finetune-from-model checkpoints/jw300_pretraining_8k_ee \
  --encoder-ffn-embed-dim 1024 \
  --decoder-ffn-embed-dim 1024 \
  --encoder-embed-dim 256 \
  --decoder-embed-dim 256 \
  --encoder-attention-heads 4 \
  --decoder-attention-heads 4 \
  --task translation \
  --arch transformer \
  --num-workers 4 \
  --data-buffer-size 10 \
  --validate-interval 5 \
  --save-interval 5 \
  --max-tokens 10000 \
  --update-freq 1 \
  --share-all-embeddings \
  --dropout 0.15 --attention-dropout 0.1 --activation-dropout 0.1 \
  --share-decoder-input-output-embed \
  --lr 0.0001 --lr-scheduler inverse_sqrt \
  --optimizer adam --adam-betas '(0.9, 0.98)' \
  --warmup-updates 2000 --warmup-init-lr '1e-07' \
  --label-smoothing 0.1 \
  --criterion label_smoothed_cross_entropy \
  --no-epoch-checkpoints \
  --keep-best-checkpoints 5 \
  --patience 5 \
  --log-format=tqdm --log-interval=100)

In [None]:
# Calculate BLEU for the test set

!(PYTHONIOENCODING=utf-8 fairseq-generate bin/fr-fon  --task translation \
  --gen-subset test \
  --source-lang fr --target-lang fon \
  --path checkpoints/fr-fon/checkpoint_best.pt \
  --beam 5 \
  --scoring sacrebleu --remove-bpe 'sentencepiece')

## Training ee model

In [None]:
!(fairseq-train bin/fr-ee \
  --tensorboard-logdir tensorboard_logs/fr-ee \
  --save-dir checkpoints/fr-ee \
  --finetune-from-model checkpoints/jw300_pretraining_8k_ee \
  --max-epoch 200 \
  --task translation \
  --arch transformer \
  --encoder-ffn-embed-dim 1024 \
  --decoder-ffn-embed-dim 1024 \
  --encoder-embed-dim 256 \
  --decoder-embed-dim 256 \
  --encoder-attention-heads 4 \
  --decoder-attention-heads 4 \
  --share-all-embeddings \
  --encoder-layers 6 \
  --num-workers 4 \
  --data-buffer-size 10 \
  --validate-interval 5 \
  --save-interval 5 \
  --max-tokens 12000 \
  --update-freq 1 \
  --dropout 0.2 --attention-dropout 0.15 --activation-dropout 0.15 \
  --weight-decay 0.0001 \
  --share-decoder-input-output-embed \
  --lr 0.0001 --lr-scheduler inverse_sqrt --warmup-updates 1 --warmup-init-lr 0.000099 \
  --optimizer adam --adam-betas '(0.9, 0.98)' \
  --label-smoothing 0.1 \
  --criterion label_smoothed_cross_entropy \
  --no-epoch-checkpoints \
  --keep-best-checkpoints 5 \
  --patience 5 \
  --log-format=tqdm --log-interval=100)


In [None]:
# calculate BLEU as a sanity check

!(PYTHONIOENCODING=utf-8 fairseq-generate bin/fr-ee  --task translation \
  --gen-subset test \
  --source-lang fr --target-lang ee \
  --path checkpoints/fr-ee/checkpoint_best.pt \
  --beam 5 \
  --scoring sacrebleu --remove-bpe 'sentencepiece')

## Generating submission

In [53]:
data = pd.read_csv(TEST_CSV_PATH)
data["French"] = data["French"].map(lambda x: x.rstrip().replace("\n", " "))
ewe_data = data[data.Target_Language == "Ewe"].copy()
fon_data = data[data.Target_Language == "Fon"].copy()

In [54]:
def french_to_file(df):
    with open("test", "w", encoding="utf8") as f:
        for idx, row in df.iterrows():
            f.write(row.French + "\n")
            
def read_target(output):
    with open(output, "r", encoding="utf8") as f:
        return [line.rstrip() for line in f]

In [55]:
french_to_file(ewe_data)

!(cat test | fairseq-interactive bin/fr-ee \
      --source-lang fr \
      --target-lang ee \
      --bpe sentencepiece \
      --scoring sacrebleu \
      --remove-bpe \
      --beam 7 \
      --sentencepiece-model sp/combined_model_8000.model \
      --path checkpoints/fr-ee/checkpoint_best.pt | grep -P "D-[0-9]+" | cut -f3 > ewe_translation)

ewe_data["Target"] = read_target("ewe_translation")

In [56]:
french_to_file(fon_data)

!(cat test | fairseq-interactive bin/fr-fon \
      --source-lang fr \
      --target-lang fon \
      --bpe sentencepiece \
      --scoring sacrebleu \
      --remove-bpe \
      --beam 7 \
      --sentencepiece-model sp/combined_model_8000.model \
      --path checkpoints/fr-fon/checkpoint_best.pt | grep -P "D-[0-9]+" | cut -f3 > fon_translation)

fon_data["Target"] = read_target("fon_translation")

In [57]:
# final submission
pd.concat([fon_data[["ID", "Target"]], ewe_data[["ID", "Target"]]]).to_csv("submission.csv", index=False)