In [None]:
!pip install OpenNMT-py
!pip install bert-score
!pip install jiwer
!pip install torchmetrics

!mkdir data
!wget https://ahmadian.me/nmt/train.en -O data/train.en
!wget https://ahmadian.me/nmt/train.fa -O data/train.fa
!wget https://ahmadian.me/nmt/train-min.en -O data/train-min.en
!wget https://ahmadian.me/nmt/train-min.fa -O data/train-min.fa
!wget https://ahmadian.me/nmt/valid.en -O data/valid.en
!wget https://ahmadian.me/nmt/valid.fa -O data/valid.fa
!wget https://ahmadian.me/nmt/test.en -O data/test.en
!wget https://ahmadian.me/nmt/test.fa -O data/test.fa

!mkdir opennmt
!wget https://ahmadian.me/nmt/opennmt-config.yaml -O opennmt/config.yaml
!wget https://ahmadian.me/nmt/bpe-config.yaml -O opennmt/bpe-config.yaml

In [3]:
# Locations of the raw parallel corpus, one (English, Persian) pair per split.
trainEnFile, trainFaFile = 'data/train.en', 'data/train.fa'
validEnFile, validFaFile = 'data/valid.en', 'data/valid.fa'
testEnFile, testFaFile = 'data/test.en', 'data/test.fa'

In [4]:
def writeProcessed(srcPath, transform):
    """Normalize one corpus file and store the result next to it.

    Reads the whole of ``srcPath`` as UTF-8, applies ``transform`` (a
    ``str -> str`` callable) to the text, and writes the result to
    ``srcPath + '.proc'``, overwriting any previous output.

    Raises:
        OSError: if ``srcPath`` cannot be read or the output cannot be written.
    """
    # Explicit UTF-8: the corpus contains Persian text and must not depend on
    # the platform's default encoding.
    with open(srcPath, 'r', encoding='utf-8') as srcFile:
        processed = transform(srcFile.read())

    with open(srcPath + '.proc', 'w', encoding='utf-8') as procFile:
        procFile.write(processed)


def replaceZwnj(text):
    """Replace every zero-width non-joiner (U+200C) with a plain space."""
    return text.replace('\u200c', ' ')


# Reduced Persian training set fetched into data/ by the setup cell; it is
# needed by the loop below.
trainMinFaFile = 'data/train-min.fa'

# English side: lowercase everything.
for enPath in (trainEnFile, validEnFile, testEnFile):
    writeProcessed(enPath, str.lower)

# Persian side: turn zero-width non-joiners into spaces.
for faPath in (trainFaFile, trainMinFaFile, validFaFile, testFaFile):
    writeProcessed(faPath, replaceZwnj)

In [5]:
# Paths of the normalized ('.proc') copy of each corpus file, derived from the
# raw paths in the same order.
(trainEnFileProc, trainFaFileProc,
 validEnFileProc, validFaFileProc,
 testEnFileProc, testFaFileProc) = [
    rawPath + '.proc'
    for rawPath in (trainEnFile, trainFaFile,
                    validEnFile, validFaFile,
                    testEnFile, testFaFile)
]

In [6]:
import pyonmttok

# Keyword arguments for the base pyonmttok.Tokenizer that each BPE learner is
# built on (same options for English and Persian).
args = {
    "mode": "aggressive",
    "joiner_annotate": True,
    "preserve_placeholders": True,
    "case_markup": True,
    "soft_case_regions": True,
    "preserve_segmented_tokens": True,
}
# Passed as `symbols` to pyonmttok.BPELearner.
n_symbols = 40000


def _bpePath(path):
    """Return ``path`` carrying exactly one trailing '.bpe' suffix.

    The end of this cell rebinds ``trainEnFile`` & co. to the BPE outputs, so on
    a second run those names already end in '.bpe'. Appending blindly would
    then produce 'x.bpe.bpe'; leaving an already-suffixed path untouched keeps
    the cell idempotent.
    """
    return path if path.endswith('.bpe') else path + '.bpe'


# Output locations of the BPE-tokenized files.
trainEnFileBPE = _bpePath(trainEnFile)
trainFaFileBPE = _bpePath(trainFaFile)
validEnFileBPE = _bpePath(validEnFile)
validFaFileBPE = _bpePath(validFaFile)
testEnFileBPE = _bpePath(testEnFile)
testFaFileBPE = _bpePath(testFaFile)

# One entry per language: (BPE model path, ((input .proc, output .bpe), ...)).
# The first pair of each entry is the training split.
for bpeModelPath, procToBpe in (
    ("opennmt/en.bpe", ((trainEnFileProc, trainEnFileBPE),
                        (validEnFileProc, validEnFileBPE),
                        (testEnFileProc, testEnFileBPE))),
    ("opennmt/fa.bpe", ((trainFaFileProc, trainFaFileBPE),
                        (validFaFileProc, validFaFileBPE),
                        (testFaFileProc, testFaFileBPE))),
):
    tokenizer_default = pyonmttok.Tokenizer(**args)
    learner = pyonmttok.BPELearner(tokenizer=tokenizer_default, symbols=n_symbols)
    # Learn the merges from the training split only.
    learner.ingest_file(procToBpe[0][0])

    # learn() writes the model to bpeModelPath and returns a tokenizer using it.
    tokenizer = learner.learn(bpeModelPath)

    for procPath, bpePath in procToBpe:
        tokenizer.tokenize_file(procPath, bpePath)

# From here on the plain names refer to the BPE-tokenized files.
trainEnFile = trainEnFileBPE
trainFaFile = trainFaFileBPE
validEnFile = validEnFileBPE
validFaFile = validFaFileBPE
testEnFile = testEnFileBPE
testFaFile = testFaFileBPE

In [None]:
# Build the vocabulary with OpenNMT-py, driven by opennmt/bpe-config.yaml.
# NOTE(review): `-n_sample -1` presumably means "use every example of the
# corpus" — confirm against the OpenNMT-py version installed above.
!onmt_build_vocab -config opennmt/bpe-config.yaml -n_sample -1

In [None]:
# Train the model with OpenNMT-py as configured in opennmt/config.yaml.
# NOTE(review): the vocabulary cell reads opennmt/bpe-config.yaml while this
# one reads opennmt/config.yaml — confirm both point at the same data/vocab.
!onmt_train -config opennmt/config.yaml

In [5]:
import numpy as np
from nltk.translate.bleu_score import corpus_bleu
from torchmetrics.functional.text.bert import bert_score
from jiwer import wer

def calculateMetrics(referencesFile, candidatesFile, device='cuda', bertModel=None):
    """Score a file of candidate translations against a file of references.

    Both files are read as UTF-8 with one sentence per line; line *i* of the
    candidates is compared with line *i* of the references.

    Args:
        referencesFile: path of the reference translations.
        candidatesFile: path of the system output.
        device: device string forwarded to ``bert_score`` (default ``'cuda'``).
        bertModel: optional model name/path forwarded to ``bert_score`` as
            ``model_name_or_path``; ``None`` keeps the library default.

    Returns:
        A tuple ``(bleuScore, bertScore, werError)`` where
        ``bleuScore`` is a list of five corpus-level BLEU values (default
        weights, then 1-gram, 2-gram, 3-gram and 4-gram only),
        ``bertScore`` is a one-element list with the mean BERTScore F1, and
        ``werError`` is the value returned by ``jiwer.wer``.

    Raises:
        ValueError: if the two files do not have the same number of lines.
    """
    # Explicit UTF-8 (the references are Persian); strip the line terminator so
    # it is not fed to the sentence-level metrics.
    with open(referencesFile, encoding='utf-8') as f:
        referencesSentences = [line.strip() for line in f]

    with open(candidatesFile, encoding='utf-8') as f:
        candidatesSentences = [line.strip() for line in f]

    if len(referencesSentences) != len(candidatesSentences):
        raise ValueError(
            'line count mismatch: %d references in %s vs %d candidates in %s'
            % (len(referencesSentences), referencesFile,
               len(candidatesSentences), candidatesFile)
        )

    # corpus_bleu expects, per sentence, a list of tokenized references (here
    # exactly one) and a single tokenized hypothesis.
    references = [[sentence.split()] for sentence in referencesSentences]
    candidates = [sentence.split() for sentence in candidatesSentences]

    bleuScore = [corpus_bleu(references, candidates)]
    for ngramWeights in ((1, 0, 0, 0), (0, 1, 0, 0), (0, 0, 1, 0), (0, 0, 0, 1)):
        bleuScore.append(corpus_bleu(references, candidates, weights=ngramWeights))

    # NOTE(review): with bertModel=None the library default model is used —
    # confirm it is appropriate for Persian text.
    bertKwargs = {'device': device}
    if bertModel is not None:
        bertKwargs['model_name_or_path'] = bertModel
    bertCalc = bert_score(candidatesSentences, referencesSentences, **bertKwargs)

    f1Values = bertCalc['f1']
    # Depending on the torchmetrics version this is a list of floats or a
    # tensor (possibly on the GPU); bring tensors to host memory before numpy.
    if hasattr(f1Values, 'detach'):
        f1Values = f1Values.detach().cpu().numpy()
    bertScore = [float(np.mean(f1Values))]

    werError = wer(referencesSentences, candidatesSentences)

    return bleuScore, bertScore, werError

In [None]:
!onmt_translate -model opennmt/opennmt_step_1000.pt -src data/test.en.proc -output data/pred_1000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_2000.pt -src data/test.en.proc -output data/pred_2000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_3000.pt -src data/test.en.proc -output data/pred_3000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_4000.pt -src data/test.en.proc -output data/pred_4000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_5000.pt -src data/test.en.proc -output data/pred_5000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_6000.pt -src data/test.en.proc -output data/pred_6000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_7000.pt -src data/test.en.proc -output data/pred_7000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_8000.pt -src data/test.en.proc -output data/pred_8000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_9000.pt -src data/test.en.proc -output data/pred_9000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_10000.pt -src data/test.en.proc -output data/pred_10000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_11000.pt -src data/test.en.proc -output data/pred_11000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_12000.pt -src data/test.en.proc -output data/pred_12000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_13000.pt -src data/test.en.proc -output data/pred_13000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_14000.pt -src data/test.en.proc -output data/pred_14000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_15000.pt -src data/test.en.proc -output data/pred_15000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_16000.pt -src data/test.en.proc -output data/pred_16000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_17000.pt -src data/test.en.proc -output data/pred_17000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_18000.pt -src data/test.en.proc -output data/pred_18000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_19000.pt -src data/test.en.proc -output data/pred_19000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_20000.pt -src data/test.en.proc -output data/pred_20000.fa -gpu 0

In [None]:
!onmt_translate -model opennmt/opennmt_step_1000.pt -src data/valid.en.proc -output data/pred_valid_1000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_2000.pt -src data/valid.en.proc -output data/pred_valid_2000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_3000.pt -src data/valid.en.proc -output data/pred_valid_3000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_4000.pt -src data/valid.en.proc -output data/pred_valid_4000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_5000.pt -src data/valid.en.proc -output data/pred_valid_5000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_6000.pt -src data/valid.en.proc -output data/pred_valid_6000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_7000.pt -src data/valid.en.proc -output data/pred_valid_7000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_8000.pt -src data/valid.en.proc -output data/pred_valid_8000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_9000.pt -src data/valid.en.proc -output data/pred_valid_9000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_10000.pt -src data/valid.en.proc -output data/pred_valid_10000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_11000.pt -src data/valid.en.proc -output data/pred_valid_11000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_12000.pt -src data/valid.en.proc -output data/pred_valid_12000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_13000.pt -src data/valid.en.proc -output data/pred_valid_13000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_14000.pt -src data/valid.en.proc -output data/pred_valid_14000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_15000.pt -src data/valid.en.proc -output data/pred_valid_15000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_16000.pt -src data/valid.en.proc -output data/pred_valid_16000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_17000.pt -src data/valid.en.proc -output data/pred_valid_17000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_18000.pt -src data/valid.en.proc -output data/pred_valid_18000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_19000.pt -src data/valid.en.proc -output data/pred_valid_19000.fa -gpu 0
!onmt_translate -model opennmt/opennmt_step_20000.pt -src data/valid.en.proc -output data/pred_valid_20000.fa -gpu 0

In [None]:
# Checkpoint steps to evaluate, as strings: '1000', '2000', ..., '20000'.
testSteps = [str(stepCount) for stepCount in range(1000, 20001, 1000)]

# Score each checkpoint's test-set predictions against the processed Persian
# references and print the three metrics.
for testStep in testSteps:
    print('Testing Step: ' + testStep)
    predictionsFile = 'data/pred_' + testStep + '.fa'
    bleuScore, bertScore, werError = calculateMetrics(testFaFileProc, predictionsFile)
    for metricLabel, metricValue in (('bleu: ', bleuScore),
                                     ('bertScore: ', bertScore),
                                     ('werError: ', werError)):
        print(metricLabel + str(metricValue))
    print()

In [None]:
# Checkpoint steps to evaluate, as strings: '1000', '2000', ..., '20000'.
testSteps = [str(stepCount) for stepCount in range(1000, 20001, 1000)]

# Score each checkpoint's validation-set predictions against the processed
# Persian references and print the three metrics.
for testStep in testSteps:
    print('Testing Step: ' + testStep)
    predictionsFile = 'data/pred_valid_' + testStep + '.fa'
    bleuScore, bertScore, werError = calculateMetrics(validFaFileProc, predictionsFile)
    for metricLabel, metricValue in (('bleu: ', bleuScore),
                                     ('bertScore: ', bertScore),
                                     ('werError: ', werError)):
        print(metricLabel + str(metricValue))
    print()