In [None]:
!pip install jieba evaluate sacrebleu sacremoses datasets

In [78]:
import pandas as pd
import numpy as np
import jieba
from transformers import TrainingArguments, Trainer
from datasets import load_dataset
from datasets import Dataset
# from easse.bleu import sentence_bleu
from evaluate import load
sari = load("sari")
HSK = pd.read_pickle("../data/Chinese/HSK_levels")
HSK_dict = HSK.to_dict()['level']

In [79]:
from transformers import BertTokenizer, BartForConditionalGeneration, Text2TextGenerationPipeline, AutoModelForSeq2SeqLM
tokenizer = BertTokenizer.from_pretrained("fnlp/bart-base-chinese")
model = BartForConditionalGeneration.from_pretrained("fnlp/bart-base-chinese")
# model = AutoModelForSeq2SeqLM.from_pretrained("fnlp/bart-base-chinese")

In [103]:
def tokenize_with_HSK(sentence, HSK_dict):
    split_sentence = jieba.lcut(sentence)
    HSK_sentence = "".join([f"{word}[{HSK_dict.get(word, 0)}]" for word in split_sentence])
    return HSK_sentence

tokenizer.add_tokens(["[1]", "[2]", "[3]", "[4]", "[5]", "[6]", "[7-9]"])
model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


BartScaledWordEmbedding(51278, 768, padding_idx=0)

In [81]:
def preprocess_data(filename: str, start: int, stop: int):
    lines = []
    lines_HSK = []
    with open(filename, encoding="utf8") as f:
        lines_orig = f.read().splitlines()
        for line in lines_orig[start:stop]:
            lines_HSK.append(tokenize_with_HSK(line, HSK_dict))
    return lines_HSK

In [83]:
start = 0
stop = 20
split = 10

lines_complex = preprocess_data('../mcts-main/pseudo_data/zh_selected.ori', start, stop)
lines_simple = preprocess_data('../mcts-main/pseudo_data/zh_selected.sim', start, stop)

data_dict = {'complex': lines_complex[start:split], 'simple': lines_simple[start:split]}
ds_train = Dataset.from_dict(data_dict)
data_dict = {'complex': lines_complex[split:stop], 'simple': lines_simple[split:stop]}
ds_eval = Dataset.from_dict(data_dict)

In [90]:
# tokenize data
max_length = 256
def batch_tokenize_data(data):
    inputs = [example for example in data["complex"]]
    targets = [example for example in data["simple"]]

    model_inputs = tokenizer(inputs, max_length=max_length, padding="max_length", truncation=True)
    labels = tokenizer(targets, max_length=max_length, padding="max_length", truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_data_train = ds_train.map(batch_tokenize_data, batched=True)
tokenized_data_eval = ds_eval.map(batch_tokenize_data, batched=True)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [85]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    
    sari_score = sari.compute(sources=[labels], predictions=preds, references=[labels])
    return {'sari': sari_score}

In [104]:
training_args = TrainingArguments(
    output_dir="./bart_simplification",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_data_train,
    eval_dataset = tokenized_data_eval,
    tokenizer = tokenizer,
    compute_metrics = compute_metrics
)

  trainer = Trainer(


In [105]:
trainer.train()

  0%|          | 0/30 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [85]:
idx = 35
#tokenize the conversation
sentence = '譬如，穆利诺对记者表示，他未看到美国军事力量夺取运河的严重威胁，并提议与美方进行技术层面的对话，以解决特朗普对中国影响的担忧。'
text2text_generator = Text2TextGenerationPipeline(model, tokenizer)
print(sentence)
print(text2text_generator(sentence, max_length=50, do_sample=False)[0]['generated_text'].replace(" ",""))

Device set to use cpu


譬如，穆利诺对记者表示，他未看到美国军事力量夺取运河的严重威胁，并提议与美方进行技术层面的对话，以解决特朗普对中国影响的担忧。
比如，穆利诺对记者表示，他从未看到美国军事力量夺取运河的严重威胁，并提议与美方进行技术层面的对
