# Mount Drive

In [None]:
# Mount Google Drive into the Colab filesystem; the later cells work under
# this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Clone the WMT22-MasaKhane GitHub Repo

In [None]:
# Switch to the Drive folder that will hold the checkout.
%cd /content/drive/MyDrive

# Clone the WMT22-MasaKhane repository into the current directory.
!git clone https://github.com/abumafrim/WMT22-MasaKhane.git

# Set-up

*   Change directory to WMT22-MasaKhane
*   Install required libraries

In [None]:
%cd /content/drive/MyDrive/WMT22-MasaKhane

!pip install -r requirements.txt

# Download data

* MAFAND-MT
* Huggingface LASER
* Create Sentence-pair classification dataset

In [None]:
# Work from the repository root; the script names below are relative to it.
%cd /content/drive/MyDrive/WMT22-MasaKhane

# Clone the MAFAND-MT repository (lafand-mt) and preprocess its data files.
!git clone https://github.com/masakhane-io/lafand-mt.git
!python3 download-and-process-mafand.py

# Download the Hugging Face LASER data and preprocess its data files.
!python3 download-and-process-hug-laser.py

# Build the sentence-pair classification (SPC) dataset from inside its sub-folder.
# NOTE(review): presumably this produces the data/<lang>/spc-*.tsv files used
# for training -- confirm against create-spc-data.py.
%cd sentence-pair-classification
!python3 create-spc-data.py

# Train SPC Model

In [None]:
%cd /content/drive/MyDrive/WMT22-MasaKhane/sentence-pair-classification

import os

##Provide the model to finetune
## Any of "albert-base-v2", "albert-large-v2", "albert-xlarge-v2", "albert-xxlarge-v2", "bert-base-uncased", etc

model = 'albert-base-v2'

#hug_langs = ['eng-hau', 'eng-ibo', 'eng-lug', 'eng-swh', 'eng-tsn', 'eng-yor', 'eng-zul', 'fra-wol']
#maf_langs = ['en_hau', 'en_ibo', 'en_lug', 'en_swa', 'en_tsn', 'en_yor', 'en_zul', 'fr_wol']

hug_langs = ['eng-lug', 'eng-tsn', 'eng-zul', 'fra-wol']
maf_langs = ['en_lug', 'en_tsn', 'en_zul', 'fr_wol']

for hug_lang, maf_lang in zip(hug_langs, maf_langs):
  train_path = 'data/' + hug_lang + '/spc-' + maf_lang + '_train.tsv'
  dev_path = 'data/' + hug_lang + '/spc-' + maf_lang + '_dev.tsv'
  test_path = 'data/' + hug_lang + '/spc-' + maf_lang + '_test.tsv'

  data_to_classify = 'data/' + hug_lang + '/spc-' + maf_lang + '_to_classify.tsv'

  model_path = 'models/' + hug_lang
  if not os.path.exists(model_path):
    print("Creation of the " + hug_lang + " model folder...")
    os.makedirs(model_path)

  !python3 run-sp-class.py \
      --train \
      --eval=True \
      --model={model} \
      --model_path={model_path} \
      --train_data={train_path} \
      --val_data={dev_path} \
      --test_data={test_path} \
      --epochs=4
 
  val_loss = 100

  for x in os.listdir(model_path):
    if x.endswith(".pt"):
      if float(x[x.index('loss') + 5:x.index('loss') + 9]) < val_loss:
        val_loss = float(x[x.index('loss') + 5:x.index('loss') + 9])
        model_name = model_path + '/' + x

  output_path = 'data/' + hug_lang

  !python3 predict.py \
      --predict \
      --model={model} \
      --model_path={model_name} \
      --data_path={data_to_classify} \
      --output_path={output_path}

/content/drive/MyDrive/WMT22-MasaKhane/sentence-pair-classification
PyTorch version 1.11.0+cu113 available.
TensorFlow version 2.8.2 available.
Gen RAM Free: 87.4 GB  | Proc size: 798.8 MB
GPU RAM Free: 40536MB | Used: 0MB | Util   0% | Total 40536MB
Reading training data...
Downloading: 100% 684/684 [00:00<00:00, 612kB/s]
Downloading: 100% 760k/760k [00:00<00:00, 7.46MB/s]
Reading validation data...
Downloading: 100% 47.4M/47.4M [00:00<00:00, 68.3MB/s]
 20% 101/510 [00:06<00:23, 17.69it/s]
Iteration 102/510 of epoch 1 complete. Loss : 0.17076843892972843 
 40% 203/510 [00:11<00:16, 18.64it/s]
Iteration 204/510 of epoch 1 complete. Loss : 0.09903182465510041 
 60% 305/510 [00:17<00:11, 18.46it/s]
Iteration 306/510 of epoch 1 complete. Loss : 0.0787077126796266 
 80% 407/510 [00:22<00:05, 17.97it/s]
Iteration 408/510 of epoch 1 complete. Loss : 0.07042094746915002 
100% 509/510 [00:28<00:00, 18.48it/s]
Iteration 510/510 of epoch 1 complete. Loss : 0.07026049139790748 
100% 510/510 [00:2

In [None]:
import pandas as pd
import os

%cd /content/drive/MyDrive/WMT22 MasaKhane/MasaKhaneNLP-WMT22/sentence-pair-classification

#hug_langs = ['eng-hau', 'eng-ibo', 'eng-lug', 'eng-swh', 'eng-tsn', 'eng-yor', 'eng-zul', 'fra-wol']
#maf_langs = ['en_hau', 'en_ibo', 'en_lug', 'en_swa', 'en_tsn', 'en_yor', 'en_zul', 'fr_wol']

hug_langs = ['eng-lug', 'eng-tsn', 'eng-zul', 'fra-wol']
maf_langs = ['en_lug', 'en_tsn', 'en_zul', 'fr_wol']

thresholds = [0.4, 0.5, 0.7]

for hug_lang, maf_lang in zip(hug_langs, maf_langs):

  with open('data/' + hug_lang + '/predictions.txt', 'r') as f:
      preds = f.readlines()

  df_pred = pd.read_csv('data/' + hug_lang + '/spc-' + maf_lang + '_to_classify.tsv', sep='\t')

  for threshold in thresholds:

    src_correct = []
    tgt_correct = []

    for sent1, sent2, pred in zip(df_pred['sentence1'], df_pred['sentence2'], preds):
      if float(pred) >= threshold:
        src_correct.append(sent1)
        tgt_correct.append(sent2)

    df = pd.DataFrame(list(zip(src_correct, tgt_correct)), columns=['input', 'target'])
    df.to_csv(os.path.join('data/' + hug_lang + '/', 'correct-translations_t_' + str(threshold) + '.tsv'), sep='\t', index=False)

    print(lang, threshold, len(df))