### Evaluate performance
Compare MCTS sentences for a few methods:

Metrics: BERTscore, SARI, HSK1-3, mean freq.

Base data #1: Pseudo data

Base data #2: MCTS

Methods: (1) LS only

(2) BART only

(4) LS after BART

(5) LS before BART

#### Load packages, data, and models:

In [4]:
#imports: 
%load_ext autoreload
%autoreload 2
import torch
import pandas as pd
import numpy as np
import jieba
import utils.LS_pipeline as LS
import utils.TS_pipeline as TS
import pickle
from evaluate import load

# scoring metrics:
sari = load("sari")
bertscore = load("bertscore")

# vocab data:
blcu = pd.read_csv('../data/BLCU/literature_wordfreq.release_UTF-8.txt', header = None, sep="\t",)
blcu.rename(columns={0:"character", 1:"frequency"}, inplace=True)
blcu.set_index("character", inplace=True)
blcu["frequency"] = blcu["frequency"].rank(pct=True)
blcu = blcu.to_dict()['frequency']
with open("../data/HSK/HSK_levels.pickle", 'rb') as handle:
    hsk_dict = pickle.load(handle)

# parallel sentence data:
with open('../data/mcts-pseudo/zh_selected.ori', encoding="utf8") as f:
    pseudo_orig = f.readlines()
with open('../data/mcts-pseudo/zh_selected.sim', encoding="utf8") as f:
    pseudo_ref = f.readlines()
with open('../data/mcts/mcts.dev.orig', encoding="utf8") as f:
    mcts_orig = f.readlines()
mcts_ref = []
for dataset in range(0,5):
    filename = str('../data/mcts/mcts.dev.simp.'+str(dataset))
    with open(filename, encoding="utf8") as f:
        mcts_ref.append(f.readlines())



Downloading Model to directory: C:\Users\tempu\.cache\modelscope\hub\damo\nlp_raner_named-entity-recognition_chinese-base-news


2025-03-20 09:33:35,469 - modelscope - INFO - initiate model from C:\Users\tempu\.cache\modelscope\hub\damo\nlp_raner_named-entity-recognition_chinese-base-news
2025-03-20 09:33:35,469 - modelscope - INFO - initiate model from location C:\Users\tempu\.cache\modelscope\hub\damo\nlp_raner_named-entity-recognition_chinese-base-news.
2025-03-20 09:33:35,469 - modelscope - INFO - initialize model from C:\Users\tempu\.cache\modelscope\hub\damo\nlp_raner_named-entity-recognition_chinese-base-news
2025-03-20 09:33:38,118 - modelscope - INFO - head has no _keys_to_ignore_on_load_missing
  state_dict = torch.load(ckpt_file, map_location='cpu')
2025-03-20 09:33:39,017 - modelscope - INFO - All model checkpoint weights were used when initializing ModelForTokenClassificationWithCRF.

2025-03-20 09:33:39,017 - modelscope - INFO - All the weights of ModelForTokenClassificationWithCRF were initialized from the model checkpoint If your task is similar to the task the model of the checkpoint was trained

tokenizer_config.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/259k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/561M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

Device set to use cpu


#### Build custom metric functions:

In [5]:
def chinese_tokenizer(data: list):
    return [" ".join(jieba.cut(sentence)) for sentence in data] # tokenizes Chinese words with spaces

def sentence_metrics(sentence):
    tokens = [word for word in jieba.cut(sentence)] # get tokens
    ## find portion of words in HSK level 1-3:
    levels = [hsk_dict[word] for word in tokens if word in hsk_dict]
    if levels:
        l13 = (levels.count(1) + levels.count(2) + levels.count(3))/len(tokens)
    else:
        l13 = 0
    ## find frequency of words:
    freqs = [np.power(blcu[word], 2) for word in tokens if word in blcu] # get squared frequency
    freq = np.mean(freqs) # mean of squared freqs

    return l13, freq

def corpus_metrics(complex_sentences: list, simple_sentences: list):
    simple_metrics = [sentence_metrics(sentence) for sentence in simple_sentences]
    complex_metrics = [sentence_metrics(sentence) for sentence in complex_sentences]
    l13_simple = np.mean([simple_metrics[idx][0] for idx in range(len(simple_metrics))])
    l13_complex = np.mean([complex_metrics[idx][0] for idx in range(len(complex_metrics))])
    freq_simple = np.mean([simple_metrics[idx][1] for idx in range(len(simple_metrics))])
    freq_complex = np.mean([complex_metrics[idx][1] for idx in range(len(complex_metrics))])
    l13_score = 100*(l13_simple - l13_complex)/l13_complex # percent change in L1-3 proportion
    freq_score = 100*(freq_simple - freq_complex)/freq_complex # percent change in squared frequency
    return l13_score, freq_score

#### Run pipelines to generate simple sentences

In [2]:
print("Running LS...")
simple_LS = [LS.LS_pipeline(sentence) for sentence in mcts_orig]
print("Running BART...")
simple_BART = [TS.TS_with_BART(sentence) for sentence in mcts_orig]
print("Running BARTLS...")
simple_BARTLS = [TS.TS_with_BART_LS(sentence) for sentence in mcts_orig]
print("Running LSBART...")
simple_LSBART = [TS.TS_with_LS_BART(sentence) for sentence in mcts_orig]


with open("../data/results/simple_LS.pickle", "wb") as handle:
    pickle.dump(simple_LS, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open("../data/results/simple_BART.pickle", "wb") as handle:
    pickle.dump(simple_BART, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open("../data/results/simple_BARTLS.pickle", "wb") as handle:
    pickle.dump(simple_BARTLS, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open("../data/results/simple_LSBART.pickle", "wb") as handle:
    pickle.dump(simple_LSBART, handle, protocol=pickle.HIGHEST_PROTOCOL)

Running LS...


NameError: name 'mcts_orig' is not defined

#### Evaluate

In [6]:
### Loading data if already generated from above:
with open("../data/results/simple_BART.pickle", 'rb') as handle:
    simple_BART = pickle.load(handle)
with open("../data/results/simple_BARTLS.pickle", 'rb') as handle:
    simple_BARTLS = pickle.load(handle)
with open("../data/results/simple_LS.pickle", 'rb') as handle:
    simple_LS = pickle.load(handle)
with open("../data/results/simple_LSBART.pickle", 'rb') as handle:
    simple_LSBART = pickle.load(handle)

In [7]:
def get_metrics_MCTS(simple_sentences: list):
    return get_metrics(simple_sentences, mcts_orig, mcts_ref)

def get_metrics_pseudo(simple_sentences: list):
    return get_metrics(simple_sentences, pseudo_orig[500000:500367], [pseudo_ref[500000:500367]])

def get_metrics(simple_sentences: list, complex_sentences: list, reference_sentences: list,):
    # tokenize sentences:
    tokenized_simplified = chinese_tokenizer(simple_sentences)
    tokenized_complex = chinese_tokenizer(complex_sentences)
    tokenized_reference = [chinese_tokenizer([reference_sentences[idx][ref] 
                                              for idx in range(len(reference_sentences))]) 
                                              for ref in range(len(complex_sentences))]

    # Compute SARI score:
    sari_score = sari.compute(
        predictions=tokenized_simplified, # model output
        references=tokenized_reference, # reference simple sentences
        sources=tokenized_complex # complex sentence
    )["sari"]

    # Compute BERT precision score:
    bert_score = bertscore.compute(
        predictions=tokenized_simplified, 
        references=tokenized_reference, 
        lang="zh"
        )["precision"][0]

    # Compute L1-3 and frequency scores:
    l13_score, freq_score = corpus_metrics(tokenized_complex, tokenized_simplified)

    # Print the result
    print("SARI Score:", sari_score)
    print("BERTScore (precision):", bert_score)
    print("L1-3 increase (%):", l13_score)
    print("Freq^2 increase (%):", freq_score)
    return {'sari_score': sari_score, 
            'bert_score': bert_score, 
            'l13_score': l13_score, 
            'freq_score': freq_score}

In [8]:
metric_baseline = get_metrics_MCTS(mcts_ref[0])

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\tempu\AppData\Local\Temp\jieba.cache
Loading model cost 1.357 seconds.
Prefix dict has been built successfully.


SARI Score: 55.75995861201505
BERTScore (precision): 1.0
L1-3 increase (%): 17.798284099434706
Freq^2 increase (%): 1.1999394822726044


In [9]:
metric_LS = get_metrics_MCTS(simple_LS)
metric_BART = get_metrics_MCTS(simple_BART)
metric_BARTLS = get_metrics_MCTS(simple_BARTLS)
metric_LSBART = get_metrics_MCTS(simple_LSBART)

KeyboardInterrupt: 