In [None]:
# Mount Google Drive; checkpoints are later saved under /content/drive/MyDrive.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


# Models Finetuning

In this notebook several models will be fine-tuned to perform sentence simplification in Russian. All the models will be tuned with the parameters provided by the RuSimpleSentEval competition. The main objective is not to achieve the best performance but rather to compare different models trained with and without translated data. In every case training will last 5 epochs. Overall, there are five models:

* Model trained on the original WikiLarge data to perform the task in English.
* Model trained on pairs of an original English sentence and a simplified Russian sentence, so it learns to translate and simplify at the same time.
* Model trained only on the data translated into Russian.
* Model trained first on the original data and then on the translated corpus.

All the models will be evaluated and compared.



### Necessary libraries

In [None]:
import pandas as pd
import re
import nltk
nltk.download('punkt')

In [None]:
! wget https://dl.fbaipublicfiles.com/fairseq/models/mbart/mbart.cc25.v2.tar.gz
! tar -xzvf /content/mbart.cc25.v2.tar.gz
! apt-get install cmake build-essential pkg-config libgoogle-perftools-dev

In [None]:
!git clone https://github.com/google/sentencepiece.git 
%cd sentencepiece
!mkdir build

Cloning into 'sentencepiece'...
remote: Enumerating objects: 3677, done.[K
remote: Total 3677 (delta 0), reused 0 (delta 0), pack-reused 3677[K
Receiving objects: 100% (3677/3677), 28.56 MiB | 17.55 MiB/s, done.
Resolving deltas: 100% (2581/2581), done.
/content/sentencepiece


In [None]:
%cd build
!cmake ..
!make
!make install
!ldconfig -v

In [None]:
# from sentencepiece git
# !git clone https://github.com/google/sentencepiece.git 
# %cd sentencepiece
# %mkdir build
# %cd build
# !cmake ..
# !make -j $(nproc)
# !sudo make install
# !sudo ldconfig -v

In [None]:
%cd /content

/content


In [None]:
# !git clone https://github.com/pytorch/fairseq
# !cd fairseq
# %pip install --editable ./

In [None]:
!git clone https://github.com/pytorch/fairseq
%cd /content/fairseq/
!python -m pip install --editable .
%cd /content

! echo $PYTHONPATH

import os
os.environ['PYTHONPATH'] += ":/content/fairseq/"

! echo $PYTHONPATH

In [None]:
#  cd /content/fairseq

/content/fairseq


In [None]:
# %pip install --editable ./

In [None]:
# !python setup.py build develop

### Loading data...

In [None]:
! mkdir data
! gdown https://drive.google.com/uc?id=1bJo8TagTGKa0uyppQRqsHrKHyYO5tcZc
! gdown https://drive.google.com/uc?id=11lqipq6ggrgCk8bVxQ4-uuPVMCKN5ebU
! gdown https://drive.google.com/uc?id=1dB3X-Wx8qU_5nDG_pxAmLvo5H_sgnHrE

In [None]:
% cd /content/fairseq

/content/fairseq


In [None]:
# Load the train / dev / test splits from the CSV files in /content.
data_train, data_dev, data_test = (
    pd.read_csv(f'/content/wiki_{split}_cleaned_translated_sd.csv')
    for split in ('train', 'dev', 'test')
)

First training with en-ru data

In [None]:
# Dump the parallel corpus as plain-text files, one sentence per line:
#   <split>.en  <- column 'src'       (source side)
#   <split>.ru  <- column 'target_y'  (target side)
# Files are opened in "w" mode so that re-running the cell overwrites them
# instead of appending a second copy of every sentence, and with an explicit
# UTF-8 encoding because the text contains Cyrillic.
for split_name, frame in (('test', data_test), ('train', data_train), ('dev', data_dev)):
    for column, suffix in (('src', 'en'), ('target_y', 'ru')):
        with open(f'/content/fairseq/data/{split_name}.{suffix}', "w", encoding="utf-8") as f:
            f.writelines(text + '\n' for text in frame[column])

In [None]:
# %%bash
# export SPM="/content/sentencepiece/build/src/spm_encode"
# export BPE_MODEL="/content/mbart.cc25.v2/sentence.bpe.model"
# export DATA_DIR="/content/fairseq/data"
# export SRC="en"
# export TGT="en" #ru

In [None]:
! echo $DATA_DIR




In [None]:
SPM="/content/sentencepiece/build/src/spm_encode"
BPE_MODEL="/content/mbart.cc25.v2/sentence.bpe.model"
DATA_DIR="/content/fairseq/data"
SRC="en"
TGT="ru" #en

!$SPM --model=$BPE_MODEL < $DATA_DIR/train.$SRC > $DATA_DIR/train.spm.$SRC &
!$SPM --model=$BPE_MODEL < $DATA_DIR/train.$TGT > $DATA_DIR/train.spm.$TGT &
!$SPM --model=$BPE_MODEL < $DATA_DIR/dev.$SRC > $DATA_DIR/dev.spm.$SRC &
!$SPM --model=$BPE_MODEL < $DATA_DIR/dev.$TGT > $DATA_DIR/dev.spm.$TGT &
!$SPM --model=$BPE_MODEL < $DATA_DIR/test.$SRC > $DATA_DIR/test.spm.$SRC &
!$SPM --model=$BPE_MODEL < $DATA_DIR/test.$TGT > $DATA_DIR/test.spm.$TGT &

In [None]:
#%cd /content

/content


In [None]:

PREPROCESSED_DATA_DIR="/content/fairseq/data"
DICT="/content/mbart.cc25.v2/dict.txt"
!fairseq-preprocess \
  --source-lang en \
  --target-lang ru \
  --trainpref /content/fairseq/data/train.spm \
  --validpref /content/fairseq/data/dev.spm \
  --testpref /content/fairseq/data/test.spm \
  --destdir /content/fairseq/data \
  --thresholdtgt 0 \
  --thresholdsrc 0 \
  --srcdict /content/mbart.cc25.v2/dict.txt \
  --tgtdict /content/mbart.cc25.v2/dict.txt \
  --workers 70

Second training with ru-ru

In [None]:
# Dump the ru -> ru corpus as plain-text files, one sentence per line:
#   <split>.src  <- column 'target_x'  (source side)
#   <split>.dst  <- column 'target_y'  (target side)
# "w" mode keeps the cell idempotent (append mode duplicated the corpus on every
# re-run); the encoding is explicit because the text is Cyrillic.
for split_name, frame in (('test', data_test), ('train', data_train), ('dev', data_dev)):
    for column, suffix in (('target_x', 'src'), ('target_y', 'dst')):
        with open(f'/content/fairseq/data/{split_name}.{suffix}', "w", encoding="utf-8") as f:
            f.writelines(text + '\n' for text in frame[column])

In [None]:
SPM="/content/sentencepiece/build/src/spm_encode"
BPE_MODEL="/content/mbart.cc25.v2/sentence.bpe.model"
DATA_DIR="/content/fairseq/data"
SRC="src"
TGT="dst"

!$SPM --model=$BPE_MODEL < $DATA_DIR/train.$SRC > $DATA_DIR/train.spm.$SRC &
!$SPM --model=$BPE_MODEL < $DATA_DIR/train.$TGT > $DATA_DIR/train.spm.$TGT &
!$SPM --model=$BPE_MODEL < $DATA_DIR/dev.$SRC > $DATA_DIR/dev.spm.$SRC &
!$SPM --model=$BPE_MODEL < $DATA_DIR/dev.$TGT > $DATA_DIR/dev.spm.$TGT &
!$SPM --model=$BPE_MODEL < $DATA_DIR/test.$SRC > $DATA_DIR/test.spm.$SRC &
!$SPM --model=$BPE_MODEL < $DATA_DIR/test.$TGT > $DATA_DIR/test.spm.$TGT &

In [None]:

PREPROCESSED_DATA_DIR="/content/fairseq/data"
DICT="/content/mbart.cc25.v2/dict.txt"
!fairseq-preprocess \
  --source-lang src \
  --target-lang dst \
  --trainpref /content/fairseq/data/train.spm \
  --validpref /content/fairseq/data/dev.spm \
  --testpref /content/fairseq/data/test.spm \
  --destdir /content/fairseq/data \
  --thresholdtgt 0 \
  --thresholdsrc 0 \
  --srcdict /content/mbart.cc25.v2/dict.txt \
  --tgtdict /content/mbart.cc25.v2/dict.txt \
  --workers 70

2021-04-19 15:37:34 | INFO | fairseq_cli.preprocess | Namespace(align_suffix=None, alignfile=None, all_gather_list_size=16384, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='/content/fairseq/data', empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, only_source=False, optimizer=None, padding_factor=8, plasma_path='/tmp/plasma', profile=False, quantization_config_path=None, reset_logging=False, scoring='bleu', seed=1, simul_type=None, source_lang='src', srcdict='/content/mbart.cc25.v2/dict.txt', suppress_crashes=False, target_lang='dst', task='translation', tensorboard_logdir=None, testpref='/content/fairseq/da

The training code was the same every time; only the "src" and "dst" parts were changed. So, instead of repeating it six times, I altered this single cell, putting the necessary data in it.


In [None]:
# mkdir to put the checkpoints
! mkdir /content/drive/MyDrive/checkpoints_ru_ru 

Also, it is necessary to make the following change in /content/fairseq/fairseq/tasks/translation_from_pretrained_bart.py:

```
def __init__(self, args, src_dict, tgt_dict):
        super().__init__(args, src_dict, tgt_dict)
        self.args = args                  # add this line !!!!!
        self.langs = args.langs.split(",")
        for d in [src_dict, tgt_dict]:
            for l in self.langs:
```


The next two cells were meant to install apex for faster training, but an error occurred, so they are left commented out.

In [None]:
# %%writefile setup.sh

# export CUDA_HOME=/usr/local/cuda-10.1
# git clone https://github.com/NVIDIA/apex
# pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex

Writing setup.sh


In [None]:
# !sh setup.sh

### Training-------------------------------

In [None]:
# those are just some variations in parameters that I tried
# > train_log.txt &
#  --update-freq 1
#  --ddp-backend no_c10d
# --max-tokens 1024
# --batch-size 4 2
# --max-epoch 25
# --fp16 \?????
# --update-freq? increase????
# --update-freq 2??? 5??
# 3
# --max-tokens 300
#  --ddp-backend no_c10d \
# --fp16 \
# --memory-efficient-fp16 \
# --save-interval-updates 5000 \
# /content/mbart.cc25.v2/model.pt
# --max-epoch 10

In [None]:
# Fine-tune mBART on the binarized data in /content/fairseq/data.
# - Starts from the pretrained checkpoint given by --restore-file; optimizer,
#   meters, dataloader and LR-scheduler state are reset.
# - Trains for at most 5 epochs (--max-epoch), validating every epoch and
#   stopping early after 3 validations without improvement (--patience).
# - Checkpoints are written to Google Drive (--save-dir).
# NOTE(review): --source-lang/--target-lang are 'src'/'dst', which are not among
# the codes listed in --langs; presumably the language-ID tokens then fall back
# to <unk> -- confirm against the translation_from_pretrained_bart task.
!fairseq-train /content/fairseq/data \
  --encoder-normalize-before --decoder-normalize-before \
  --arch mbart_large --layernorm-embedding \
  --task translation_from_pretrained_bart \
  --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
  --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
  --lr-scheduler polynomial_decay --lr 3e-05 --warmup-updates 2500 --total-num-update 54725  \
  --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
  --max-tokens 1024 --update-freq 5 \
  --source-lang src --target-lang dst \
  --batch-size 16 \
  --memory-efficient-fp16 \
  --validate-interval 1 \
  --patience 3 \
  --max-epoch 5 \
  --save-interval 1 --keep-last-epochs 10 --keep-best-checkpoints 2 \
  --ddp-backend no_c10d \
  --seed 42 --log-format simple --log-interval 500 \
  --restore-file /content/mbart.cc25.v2/model.pt \
  --reset-optimizer --reset-meters --reset-dataloader --reset-lr-scheduler \
  --langs ar_AR,cs_CZ,de_DE,en_XX,es_XX,et_EE,fi_FI,fr_XX,gu_IN,hi_IN,it_IT,ja_XX,kk_KZ,ko_KR,lt_LT,lv_LV,my_MM,ne_NP,nl_XX,ro_RO,ru_RU,si_LK,tr_TR,vi_VN,zh_CN \
  --scoring bleu \
  --save-dir /content/drive/MyDrive/checkpoints_ru_ru

2021-04-19 19:16:23 | INFO | fairseq_cli.train | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 500, 'log_format': 'simple', 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 42, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': None, 'path': None, 'post_process': None, 'quiet': False, 'model_overrides': '{}', 'results_path': None}, 'distributed_training': {'_name': None, 'distributed_world_size': 1, 'di

### Test to check that everything is ok

In [None]:
! pip install sentencepiece

In [None]:
!fairseq-generate /content/fairseq/data \
  --path  /content/drive/MyDrive/checkpoints_ru_ru/checkpoint_best.pt \
  --task translation_from_pretrained_bart \
  --gen-subset test \
  --source-lang src --target-lang dst \
  --bpe 'sentencepiece' --sentencepiece-model /content/mbart.cc25.v2/sentence.bpe.model \
  --sacrebleu --remove-bpe 'sentencepiece' \
  --batch-size 32 --langs ar_AR,cs_CZ,de_DE,en_XX,es_XX,et_EE,fi_FI,fr_XX,gu_IN,hi_IN,it_IT,ja_XX,kk_KZ,ko_KR,lt_LT,lv_LV,my_MM,ne_NP,nl_XX,ro_RO,ru_RU,si_LK,tr_TR,vi_VN,zh_CN > model_prediction.txt & 
!cat model_prediction.txt | grep -P "^H" |sort -V |cut -f 3- > model_prediction.hyp




In [None]:
# !fairseq-generate /content/fairseq/data \
#   --path /content/drive/MyDrive/checkpoint5.pt \
#   --task translation_from_pretrained_bart \
#   --gen-subset test \
#   --source-lang en --target-lang ru \
#   --bpe 'sentencepiece' --sentencepiece-model /content/mbart.cc25.v2/sentence.bpe.model \
#   --sacrebleu --remove-bpe 'sentencepiece' \
#   --batch-size 4 --langs ar_AR,cs_CZ,de_DE,en_XX,es_XX,et_EE,fi_FI,fr_XX,gu_IN,hi_IN,it_IT,ja_XX,kk_KZ,ko_KR,lt_LT,lv_LV,my_MM,ne_NP,nl_XX,ro_RO,ru_RU,si_LK,tr_TR,vi_VN,zh_CN > model_prediction.txt & 


### try SARI evaluation

In [None]:
%cd /content

/content


In [None]:
! git clone https://github.com/feralvam/easse
! git clone https://github.com/Andoree/sent_simplification.git
%cp /content/sent_simplification/sari.py /content/easse/easse

In [None]:
%cd easse
! pip install .

In [None]:
!pwd

/content/easse


In [None]:
%cd /content
! mkdir prepared_data

Prepare data for SARI calculation

In [None]:
! wget https://raw.githubusercontent.com/dialogue-evaluation/RuSimpleSentEval/main/dev_sents.csv

In [None]:
# Convert dev_sents.csv into the reference/source files used by EASSE.
# Source sentences come from column "INPUT:source", references from
# "OUTPUT:output"; the files are written to /content/prepared_data with the
# name prefix "test_ref_data" (the evaluation cell reads
# test_ref_data.src and test_ref_data.ref.0 .. ref.4 from there).
! python /content/sent_simplification/refs_to_easse_format.py \
--input_path /content/dev_sents.csv \
--output_dataset_name test_ref_data \
--src_column "INPUT:source" \
--trg_column "OUTPUT:output" \
--output_dir /content/prepared_data

1000
3406
3406
Overall number of references: 3406


In [None]:
data_test = pd.read_csv('/content/dev_sents.csv')

In [None]:
data_test['OUTPUT:output'].shape

(3406,)

In [None]:
data_test.tail()

Unnamed: 0.1,Unnamed: 0,INPUT:source,OUTPUT:output
3401,9960,Язгуля́мский язы́к (самоназвание — Yuzdami zev...,"Язгулямский язык - один из языков, на котором ..."
3402,9961,Язгуля́мский язы́к (самоназвание — Yuzdami zev...,Язгуля́мский язы́к — один из памирских языков ...
3403,9975,Японский космический аппарат Хаябуса успешно д...,Японский космический аппарат Хаябуса успешно д...
3404,9976,Изображение соцветия подсолнечника на щите озн...,Соцветие подсолнечника - олицетворение сегодня...
3405,9977,Изображение соцветия подсолнечника на щите озн...,"Подсолнечник, который изображён на щите, симво..."


In [None]:
# Sanity check: the source and target test files must have the same number of lines.
with open('/content/fairseq/data/test.src', "r") as f:
    list1 = list(f)

with open('/content/fairseq/data/test.dst', "r") as f:
    list2 = list(f)

len(list1), len(list2)

(3406, 3406)

In [None]:
data_test  = pd.read_csv('/content/dev_sents.csv')
! rm -r /content/fairseq/data
! mkdir /content/fairseq/data

# file1 = open('/content/fairseq/data/test.src', "w")
# file2 = open('/content/fairseq/data/test.dst', "w")

# for i, row in data_test.iterrows():
#     file1.write(row['INPUT:source'] + '\n')
#     file2.write(row['OUTPUT:output'] + '\n')

with open('/content/fairseq/data/test.src', "w") as f:
  for i, row in data_test.iterrows():
    f.write(row['INPUT:source']+'\n')

with open('/content/fairseq/data/test.dst', "w") as f:
  for i, row in data_test.iterrows():
    f.write(re.sub(r'(\s){1,5}', "",row['OUTPUT:output'])+'\n')

In [None]:
data_test  = pd.read_csv('/content/dev_sents.csv')

# Re-create the train/dev text files (the data directory was wiped when the
# test set was rebuilt):
#   <split>.src  <- column 'target_x'
#   <split>.dst  <- column 'target_y'
# The encoding is explicit because the text is Cyrillic.
for split_name, frame in (('train', data_train), ('dev', data_dev)):
    for column, suffix in (('target_x', 'src'), ('target_y', 'dst')):
        with open(f'/content/fairseq/data/{split_name}.{suffix}', "w", encoding="utf-8") as f:
            f.writelines(text + '\n' for text in frame[column])

In [None]:
SPM="/content/sentencepiece/build/src/spm_encode"
BPE_MODEL="/content/mbart.cc25.v2/sentence.bpe.model"
DATA_DIR="/content/fairseq/data"
SRC="src"
TGT="dst"

!$SPM --model=$BPE_MODEL < $DATA_DIR/train.$SRC > $DATA_DIR/train.spm.$SRC &
!$SPM --model=$BPE_MODEL < $DATA_DIR/train.$TGT > $DATA_DIR/train.spm.$TGT &
!$SPM --model=$BPE_MODEL < $DATA_DIR/dev.$SRC > $DATA_DIR/dev.spm.$SRC &
!$SPM --model=$BPE_MODEL < $DATA_DIR/dev.$TGT > $DATA_DIR/dev.spm.$TGT &
!$SPM --model=$BPE_MODEL < $DATA_DIR/test.$SRC > $DATA_DIR/test.spm.$SRC &
!$SPM --model=$BPE_MODEL < $DATA_DIR/test.$TGT > $DATA_DIR/test.spm.$TGT &


In [None]:
PREPROCESSED_DATA_DIR="/content/fairseq/data"
DICT="/content/mbart.cc25.v2/dict.txt"
!fairseq-preprocess \
  --source-lang src \
  --target-lang dst \
  --trainpref /content/fairseq/data/train.spm \
  --validpref /content/fairseq/data/dev.spm \
  --testpref /content/fairseq/data/test.spm \
  --destdir /content/fairseq/data \
  --thresholdtgt 0 \
  --thresholdsrc 0 \
  --srcdict /content/mbart.cc25.v2/dict.txt \
  --tgtdict /content/mbart.cc25.v2/dict.txt \
  --workers 70


2021-04-20 02:08:50 | INFO | fairseq_cli.preprocess | Namespace(align_suffix=None, alignfile=None, all_gather_list_size=16384, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='/content/fairseq/data', empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=False, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, only_source=False, optimizer=None, padding_factor=8, plasma_path='/tmp/plasma', profile=False, quantization_config_path=None, reset_logging=False, scoring='bleu', seed=1, simul_type=None, source_lang='src', srcdict='/content/mbart.cc25.v2/dict.txt', suppress_crashes=False, target_lang='dst', task='translation', tensorboard_logdir=None, testpref='/content/fairseq/da

In [None]:
# Compute SARI for the system output against five reference files, using the
# original sentences in test_ref_data.src.
# NOTE(review): /content/fairseq/model_prediction.hyp is written by the
# fairseq-generate cell, which appears earlier in the notebook than the cells
# that rebuild the test set from dev_sents.csv. Presumably generation has to be
# re-run after that rebuild for the score to refer to the right sentences --
# confirm before relying on this number.
! easse evaluate \
--test_set custom \
--metrics sari \
--refs_sents_paths /content/prepared_data/test_ref_data.ref.0,/content/prepared_data/test_ref_data.ref.1,/content/prepared_data/test_ref_data.ref.2,/content/prepared_data/test_ref_data.ref.3,/content/prepared_data/test_ref_data.ref.4 \
--orig_sents_path /content/prepared_data/test_ref_data.src \
--sys_sents_path /content/fairseq/model_prediction.hyp -q

[nltk_data] Downloading package perluniprops to /root/nltk_data...
[nltk_data]   Unzipping misc/perluniprops.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
{'sari': 32.236, 'quality_estimation': {'Compression ratio': 0.903, 'Sentence splits': 1.051, 'Levenshtein similarity': 0.35, 'Exact copies': 0.0, 'Additions proportion': 0.695, 'Deletions proportion': 0.792, 'Lexical complexity score': 9.639}}
