# Mount Drive

In [None]:
# Mount Google Drive at /content/drive; the following cells change into
# /content/drive/MyDrive and keep the repository checkout there.
from google.colab import drive
drive.mount('/content/drive')

# Clone the WMT22-MasaKhane GitHub repo

In [None]:
%cd /content/drive/MyDrive

!git clone https://github.com/abumafrim/WMT22-MasaKhane.git

# Set-up

*   Change directory to WMT22-MasaKhane
*   Install required libraries

In [None]:
%cd /content/drive/MyDrive/WMT22-MasaKhane

!pip install -r requirements.txt

# Download data

* MAFAND-MT
* Hugging Face LASER
* Create Sentence-pair classification dataset

In [None]:
%cd /content/drive/MyDrive/WMT22-MasaKhane

#Clone mafand and preprocess data files
!git clone https://github.com/masakhane-io/lafand-mt.git
!python3 download-and-process-mafand.py

#Download Hugginface LASER and preprocess data files
!python3 download-and-process-hug-laser.py

%cd sentence-pair-classification
!python3 create-spc-data.py

# SPC Model


* Train SPC Model
* Use the model to predict the quality of the Hugging Face LASER sentences
* Select best sentences based on 3 thresholds: **0.4, 0.5, 0.7**



In [None]:
#Set-up PLM and target languages
# The cells below use paths relative to this folder (data/..., models/...),
# so the working directory is switched here first.

%cd /content/drive/MyDrive/WMT22-MasaKhane/sentence-pair-classification

import os

## Name of the model to fine-tune.
## Any of "albert-base-v2", "albert-large-v2", "albert-xlarge-v2", "albert-xxlarge-v2", "bert-base-uncased", etc

model = "albert-base-v2"

## Target language pairs. The two lists are iterated together with zip() in
## the cells below, so entry i of one must describe the same pair as entry i
## of the other.
hug_langs = [
    'eng-hau',
    'eng-ibo',
    'eng-lug',
    'eng-swh',
    'eng-tsn',
    'eng-yor',
    'eng-zul',
    'fra-wol',
]
maf_langs = [
    'en_hau',
    'en_ibo',
    'en_lug',
    'en_swa',
    'en_tsn',
    'en_yor',
    'en_zul',
    'fr_wol',
]

In [None]:
#Train SPC Model on MAFAND-MT and Huggingface LASER Datasets

for hug_lang, maf_lang in zip(hug_langs, maf_langs):
  train_path = 'data/' + hug_lang + '/spc-' + maf_lang + '_train.tsv'
  dev_path = 'data/' + hug_lang + '/spc-' + maf_lang + '_dev.tsv'
  test_path = 'data/' + hug_lang + '/spc-' + maf_lang + '_test.tsv'

  model_path = 'models/' + hug_lang
  if not os.path.exists(model_path):
    print("Creation of the " + hug_lang + " model folder...")
    os.makedirs(model_path)

  !python3 run-sp-class.py \
      --train \
      --eval=True \
      --model={model} \
      --model_path={model_path} \
      --train_data={train_path} \
      --val_data={dev_path} \
      --test_data={test_path} \
      --epochs=4

In [None]:
#Predict quality of Hugginface LASER Sentences

for hug_lang, maf_lang in zip(hug_langs, maf_langs):
  data_to_classify = 'data/' + hug_lang + '/spc-' + maf_lang + '_to_classify.tsv'
  model_path = 'models/' + hug_lang
  val_loss = 100

  #Select the best model
  for x in os.listdir(model_path):
    if x.beginswith(model) and x.endswith(".pt"):
      if float(x[x.index('loss') + 5:x.index('loss') + 9]) < val_loss:
        val_loss = float(x[x.index('loss') + 5:x.index('loss') + 9])
        model_name = model_path + '/' + x

  output_path = 'data/' + hug_lang

  #Predict sentence quality
  !python3 predict.py \
      --predict \
      --model={model} \
      --model_path={model_name} \
      --data_path={data_to_classify} \
      --output_path={output_path}

In [None]:
import pandas as pd

thresholds = [0.4, 0.5, 0.7]

# For each language pair, keep the sentence pairs whose predicted score
# reaches the threshold and write one TSV file per threshold.
for hug_lang, maf_lang in zip(hug_langs, maf_langs):
  lang_dir = 'data/' + hug_lang + '/'

  # predictions.txt is read line by line; each line is parsed with float()
  # and paired positionally with a row of the *_to_classify.tsv file.
  with open(lang_dir + 'predictions.txt', 'r') as f:
      preds = f.readlines()
  df_pred = pd.read_csv(lang_dir + 'spc-' + maf_lang + '_to_classify.tsv', sep='\t')

  for threshold in thresholds:
    kept_pairs = [
        (sent1, sent2)
        for sent1, sent2, pred in zip(df_pred['sentence1'], df_pred['sentence2'], preds)
        if float(pred) >= threshold
    ]

    df = pd.DataFrame(kept_pairs, columns=['input', 'target'])
    out_name = 'correct-translations_t_' + str(threshold) + '.tsv'
    df.to_csv(os.path.join(lang_dir, out_name), sep='\t', index=False)

    # Report how many pairs survived this threshold.
    print(hug_lang, threshold, len(df))
    print()