# Set Up Libraries and Dataset


In [None]:
!pip install bert-score

!mkdir data
!wget https://ahmadian.me/nmt/train.en -O data/train.en
!wget https://ahmadian.me/nmt/train.fa -O data/train.fa
!wget https://ahmadian.me/nmt/train-min.en -O data/train-min.en
!wget https://ahmadian.me/nmt/train-min.fa -O data/train-min.fa
!wget https://ahmadian.me/nmt/valid.en -O data/valid.en
!wget https://ahmadian.me/nmt/valid.fa -O data/valid.fa
!wget https://ahmadian.me/nmt/test.en -O data/test.en
!wget https://ahmadian.me/nmt/test.fa -O data/test.fa

In [2]:
# Paths of the downloaded corpus files, one (en, fa) pair per split.
trainEnFile, trainFaFile = 'data/train.en', 'data/train.fa'
trainMinEnFile, trainMinFaFile = 'data/train-min.en', 'data/train-min.fa'
validEnFile, validFaFile = 'data/valid.en', 'data/valid.fa'
testEnFile, testFaFile = 'data/test.en', 'data/test.fa'

In [3]:
# Paths the normalized copies of the corpus are written to, one (en, fa)
# pair per split; each mirrors a raw file name with `.proc` inserted.
trainEnFileProc, trainFaFileProc = 'data/train.proc.en', 'data/train.proc.fa'
trainMinEnFileProc, trainMinFaFileProc = 'data/train-min.proc.en', 'data/train-min.proc.fa'
validEnFileProc, validFaFileProc = 'data/valid.proc.en', 'data/valid.proc.fa'
testEnFileProc, testFaFileProc = 'data/test.proc.en', 'data/test.proc.fa'




def _rewrite_corpus(src_path, dst_path, transform):
    """Copy a text file to ``dst_path`` with ``transform`` applied to its content.

    Args:
        src_path: Path of the file to read.
        dst_path: Path of the file to create or overwrite.
        transform: Callable taking the whole file content as one ``str`` and
            returning the ``str`` to write.

    Raises:
        OSError: If either file cannot be opened.
        UnicodeDecodeError: If ``src_path`` is not valid UTF-8.
    """
    # The encoding is spelled out because the corpus holds non-ASCII text and
    # the default used by open() depends on the platform locale.
    with open(src_path, 'r', encoding='utf-8') as src:
        text = transform(src.read())
    # A context manager closes the output even when write() raises.
    with open(dst_path, 'w', encoding='utf-8') as dst:
        dst.write(text)


# English side: lowercase every split.
for src_path, dst_path in (
    (trainEnFile, trainEnFileProc),
    (trainMinEnFile, trainMinEnFileProc),
    (validEnFile, validEnFileProc),
    (testEnFile, testEnFileProc),
):
    _rewrite_corpus(src_path, dst_path, str.lower)

# fa side: replace every zero-width non-joiner (U+200C) with a plain space.
for src_path, dst_path in (
    (trainFaFile, trainFaFileProc),
    (trainMinFaFile, trainMinFaFileProc),
    (validFaFile, validFaFileProc),
    (testFaFile, testFaFileProc),
):
    _rewrite_corpus(src_path, dst_path, lambda text: text.replace('\u200c', ' '))

In [4]:
%%capture
# W and B -- For Logging
! pip install wandb

# Sacremoses -- For Tokenizing
! pip install sacremoses

! git clone https://github.com/pytorch/fairseq
%cd fairseq
! pip install --editable ./
%cd ..

! echo $PYTHONPATH

import os
os.environ['PYTHONPATH'] += ":/content/fairseq/"

! echo $PYTHONPATH

In [None]:
# Log in to Weights & Biases. The fairseq-train / fairseq-generate commands in
# this notebook pass `--wandb-project`, presumably relying on this login
# (NOTE(review): confirm whether login may prompt for an API key here).
import wandb
wandb.login()

## Pre-process and Binarize to build Vocabularies

In [None]:
! fairseq-preprocess --source-lang en --target-lang fa \
  --trainpref data/train.proc \
  --validpref data/valid.proc \
  --testpref  data/test.proc \
  --destdir data/tokenized \
  --thresholdsrc 2 \
  --thresholdtgt 2 \
  --bpe byte_bpe

# Train the model

In [None]:
! fairseq-train data/tokenized \
  --arch transformer \
  --dropout 0.1 \
  --attention-dropout 0.1 \
  --activation-dropout 0.1 \
  --encoder-embed-dim 256 \
  --encoder-ffn-embed-dim 512 \
  --encoder-layers 3 \
  --encoder-attention-heads 8 \
  --encoder-learned-pos \
  --decoder-embed-dim 256 \
  --decoder-ffn-embed-dim 512 \
  --decoder-layers 3 \
  --decoder-attention-heads 8 \
  --decoder-learned-pos \
  --max-epoch 20 \
  --optimizer adam \
  --lr 5e-4 \
  --batch-size 128 \
  --scoring bleu \
  --seed 1 \
  --wandb-project "fairseq" \
  --bpe byte_bpe

# Evaluate the model

In [None]:
! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint1.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bleu \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint2.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bleu \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint3.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bleu \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint4.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bleu \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint5.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bleu \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint6.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bleu \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint7.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bleu \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint8.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bleu \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint9.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bleu \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint10.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bleu \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint11.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bleu \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint12.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bleu \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint13.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bleu \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint14.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bleu \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint15.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bleu \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint16.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bleu \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint17.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bleu \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint18.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bleu \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint19.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bleu \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint20.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bleu \
    --quiet \
    --wandb-project "fairseq"

In [None]:
! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint1.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bert_score \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint2.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bert_score \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint3.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bert_score \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint4.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bert_score \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint5.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bert_score \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint6.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bert_score \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint7.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bert_score \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint8.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bert_score \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint9.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bert_score \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint10.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bert_score \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint11.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bert_score \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint12.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bert_score \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint13.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bert_score \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint14.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bert_score \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint15.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bert_score \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint16.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bert_score \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint17.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bert_score \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint18.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bert_score \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint19.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bert_score \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint20.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring bert_score \
    --quiet \
    --wandb-project "fairseq"

In [None]:
! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint1.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring wer \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint2.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring wer \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint3.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring wer \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint4.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring wer \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint5.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring wer \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint6.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring wer \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint7.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring wer \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint8.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring wer \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint9.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring wer \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint10.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring wer \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint11.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring wer \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint12.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring wer \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint13.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring wer \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint14.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring wer \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint15.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring wer \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint16.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring wer \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint17.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring wer \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint18.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring wer \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint19.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring wer \
    --quiet \
    --wandb-project "fairseq"

! fairseq-generate data/tokenized \
    --path checkpoints/checkpoint20.pt \
    --batch-size 128 \
    --beam 10 \
    --seed 1 \
    --scoring wer \
    --quiet \
    --wandb-project "fairseq"