# Simplification experiments
This notebook was used to fine-tune all decoder LMs on the simplification task and to evaluate them on that task.

## Part 1. Finetuning decoder LMs


In [None]:
# %%bash
# git clone https://github.com/huggingface/transformers
# cd transformers
# pip install .

#!pip install transformers==4.22.2
#!pip install pytest



In [None]:
# Fine-tune the sberbank-ai/rugpt3small_based_on_gpt2 checkpoint on the simplification
# training file. Only --do_train is passed, so no evaluation pass is requested here even
# though a validation file is supplied. The model is written to
# models/simplification/RuGPT3-small, the directory that Part 2 loads from.
# NOTE(review): run_clm.py is presumably the causal-LM example script from the
# transformers repository referenced in the install cell above - confirm.
!python run_clm.py \
    --model_name_or_path sberbank-ai/rugpt3small_based_on_gpt2 \
    --train_file ./data/simplification/train.txt \
    --validation_file ./data/simplification/dev.txt \
    --per_device_train_batch_size 2 \
    --per_device_eval_batch_size 2 \
    --block_size 512 \
    --dataset_config_name plain_text \
    --do_train \
    --output_dir models/simplification/RuGPT3-small

## Part 2. Decoder LMs evaluation


In [None]:
from transformers import GPT2LMHeadModel, AutoTokenizer

# Name of the fine-tuned checkpoint directory under ./models/simplification/.
model_name = 'RuGPT3-small'
checkpoint_dir = './models/simplification/' + model_name

# Tokenizer and model are both read from the same fine-tuned checkpoint;
# the model is moved to the GPU right after loading.
mytokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
mymodel = GPT2LMHeadModel.from_pretrained(checkpoint_dir).cuda()

In [None]:
#!pip install evaluate
#!pip install bert_score
import tqdm
import torch
import pandas as pd
from evaluate import load
import numpy as np

# Metric objects from the `evaluate` library, used by the scoring cells of this notebook.
sari, bertscore = load("sari"), load("bertscore")
def add_eos_to_examples(example):
    """Build the generation prompt for one source sentence.

    Leading and trailing whitespace is stripped from ``example``, the text is
    enclosed in ``</s>`` markers and the `` ==> `` separator is appended.
    """
    stripped = example.strip()
    return f'</s>{stripped}</s> ==> '

## Public

In [None]:
# Decoding settings for the public test run: sampling is off and 5 beams are used.
# Only `do_sample` and `num_beams` are passed to `generate()` in the loop below;
# `top_p`, `repetition_penalty`, `top_k` and `temperature` are not referenced by
# any code visible in this notebook.
do_sample = False
top_p=None
repetition_penalty=None
top_k = None
num_beams = 5
temperature = None
# Load the public test set. Each row carries a source sentence ("INPUT:source")
# and one reference output for it ("OUTPUT:output").
data_answers = pd.read_csv("./data/simplification/public_test_sents.csv", sep=",")

# Group every reference under its source sentence. A dict keeps first-seen order,
# gives O(1) membership checks, and stays correct when rows of the same source are
# not adjacent in the file. An empty file yields empty lists rather than a stray
# empty reference group.
refs_by_source = {}
for source, ans in zip(data_answers["INPUT:source"], data_answers["OUTPUT:output"]):
    refs_by_source.setdefault(source, []).append(ans)

# sources[i] is a unique source sentence; answers[i] is the list of its references.
sources = list(refs_by_source)
answers = list(refs_by_source.values())


# Generate one output per source sentence and write them to disk, one per line,
# in the same order as `sources`.
path_to_file = "./results/simplification/simplification_"+model_name.replace('/','_')+'_public.txt'
with open(path_to_file, "w") as out_file:
    for text in tqdm.tqdm(sources):
        formatted_text = add_eos_to_examples(text)
        input_ids = mytokenizer.encode(formatted_text, return_tensors="pt").cuda()
        # Prompt length in tokens, read straight from the tensor shape
        # (no device-to-host copy is needed for this).
        length = input_ids.shape[1]
        with torch.no_grad():
            # The length limit scales with the prompt: twice its length plus 10 tokens.
            out = mymodel.generate(input_ids,
                                   do_sample=do_sample,
                                   max_length=2 * length + 10,
                                   num_beams=num_beams)
        # Keep only the text between the '==>' separator and the next '</s>'.
        dec = mytokenizer.decode(out[0]).split('==>')[1].split('</s>')[0].strip()
        # Remove carriage returns as well as newlines: a lone '\r' is read back as a
        # line break in text mode and would shift predictions against sources.
        out_file.write(dec.replace('\r', '').replace('\n', '') + '\n')
        

In [None]:
# Read the saved predictions back, one per line.
with open(path_to_file) as inf:
    predictions = [line.strip() for line in inf]

# Sanity check: the three lists should be the same length; show one aligned example.
print(len(answers), len(predictions), len(sources))
sample_idx = 20
print(answers[sample_idx], predictions[sample_idx], sources[sample_idx])

print(model_name)
print('public results')

# BERTScore here compares each prediction with its source sentence (not the references).
results = bertscore.compute(predictions=predictions, references=sources, lang="ru")
print('BertScore', np.mean(results["f1"]))

# SARI uses the sources together with the grouped references.
results = sari.compute(predictions=predictions, sources=sources, references=answers)
print('Sari', np.mean(results["sari"]))

## Private

In [None]:
# Decoding settings for the hidden test run: sampling is off and 5 beams are used.
# Only `do_sample` and `num_beams` are passed to `generate()` in the loop below;
# `top_p`, `repetition_penalty`, `top_k` and `temperature` are not referenced by
# any code visible in this notebook.
do_sample = False
top_p=None
repetition_penalty=None
top_k = None
num_beams = 5
temperature = None
# Load the hidden test set. Each row carries a source sentence ("INPUT:source")
# and one reference output for it ("OUTPUT:output").
data_answers = pd.read_csv("./data/simplification/hidden_test_sents.csv", sep=",")

# Group every reference under its source sentence. A dict keeps first-seen order,
# gives O(1) membership checks, and stays correct when rows of the same source are
# not adjacent in the file. An empty file yields empty lists rather than a stray
# empty reference group.
refs_by_source = {}
for source, ans in zip(data_answers["INPUT:source"], data_answers["OUTPUT:output"]):
    refs_by_source.setdefault(source, []).append(ans)

# sources[i] is a unique source sentence; answers[i] is the list of its references.
sources = list(refs_by_source)
answers = list(refs_by_source.values())


# Generate one output per source sentence of the hidden test set and write them to
# disk, one per line, in the same order as `sources`.
path_to_file = "./results/simplification/simplification_"+model_name.replace('/','_')+'_hidden.txt'
with open(path_to_file, "w") as out_file:
    for text in tqdm.tqdm(sources):
        formatted_text = add_eos_to_examples(text)
        input_ids = mytokenizer.encode(formatted_text, return_tensors="pt").cuda()
        # Prompt length in tokens for THIS example. It must be recomputed on every
        # iteration; otherwise the length limit below would reuse a stale value left
        # over from an earlier cell (or fail with NameError in a fresh kernel).
        length = input_ids.shape[1]
        with torch.no_grad():
            # The length limit scales with the prompt: twice its length plus 10 tokens.
            out = mymodel.generate(input_ids,
                                   do_sample=do_sample,
                                   max_length=2 * length + 10,
                                   num_beams=num_beams)
        # Keep only the text between the '==>' separator and the next '</s>'.
        dec = mytokenizer.decode(out[0]).split('==>')[1].split('</s>')[0].strip()
        # Remove carriage returns as well as newlines: a lone '\r' is read back as a
        # line break in text mode and would shift predictions against sources.
        out_file.write(dec.replace('\r', '').replace('\n', '') + '\n')
        

In [None]:
# Read the saved hidden-set predictions back, one per line.
with open(path_to_file) as inf:
    predictions = [i.strip().replace('\n','') for i in inf.readlines()]

# Sanity check: the three lists should be the same length; show one aligned example.
print(len(answers), len(predictions), len(sources))
print(answers[20], predictions[20], sources[20])

print(model_name)
# This cell scores the hidden (private) test set, so label the output accordingly.
print('private results')
# BERTScore here compares each prediction with its source sentence (not the references).
results = bertscore.compute(predictions=predictions, references=sources, lang="ru")
print('BertScore', np.mean(results["f1"]))
# SARI uses the sources together with the grouped references.
results = sari.compute(predictions=predictions, sources=sources, references=answers)
print('Sari', np.mean(results["sari"]))