### Evaluate performance
First, evaluate MCTS data on itself. Could this information be used to generate a decision threshold between BART and LS? Then, compare a few methods results:

Metrics: BERTscore, SARI, HSK1-3, mean freq.

Base data #1: Pseudo data

Base data #2: MCTS

Methods: (1) LS only

(2) BART only

(4) LS after BART

(5) LS before BART

#### Load packages, data, and models:

In [3]:
#imports: 
%load_ext autoreload
%autoreload 2
import torch
import pandas as pd
import numpy as np
import jieba
import utils.LS_pipeline as LS
import utils.TS_pipeline as TS
import pickle
from evaluate import load

# scoring metrics:
sari = load("sari")
bertscore = load("bertscore")

# vocab data:
blcu = pd.read_csv('../data/BLCU/literature_wordfreq.release_UTF-8.txt', header = None, sep="\t",)
blcu.rename(columns={0:"character", 1:"frequency"}, inplace=True)
blcu.set_index("character", inplace=True)
blcu["frequency"] = blcu["frequency"].rank(pct=True)
blcu = blcu.to_dict()['frequency']
with open("../data/HSK/HSK_levels.pickle", 'rb') as handle:
    hsk_dict = pickle.load(handle)

# parallel sentence data:
with open('../data/mcts-pseudo/zh_selected.ori', encoding="utf8") as f:
    pseudo_orig = f.readlines()
with open('../data/mcts-pseudo/zh_selected.sim', encoding="utf8") as f:
    pseudo_ref = f.readlines()
with open('../data/mcts/mcts.dev.orig', encoding="utf8") as f:
    mcts_orig = f.readlines()
mcts_ref = []
for dataset in range(0,5):
    filename = str('../data/mcts/mcts.dev.simp.'+str(dataset))
    with open(filename, encoding="utf8") as f:
        mcts_ref.append(f.readlines())

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Build custom metric functions:

In [4]:
def chinese_tokenizer(data: list):
    return [" ".join(jieba.cut(sentence)) for sentence in data] # tokenizes Chinese words with spaces

def sentence_metrics(sentence):
    tokens = [word for word in jieba.cut(sentence)] # get tokens
    ## find portion of words in HSK level 1-3:
    levels = [hsk_dict[word] for word in tokens if word in hsk_dict]
    if levels:
        l13 = (levels.count(1) + levels.count(2) + levels.count(3))/len(tokens)
    else:
        l13 = 0
    ## find frequency of words:
    freqs = [np.power(blcu[word], 2) for word in tokens if word in blcu] # get squared frequency
    freq = np.mean(freqs) # mean of squared freqs

    return l13, freq

def corpus_metrics(complex_sentences: list, simple_sentences: list):
    simple_metrics = [sentence_metrics(sentence) for sentence in simple_sentences]
    complex_metrics = [sentence_metrics(sentence) for sentence in complex_sentences]
    l13_simple = np.mean([simple_metrics[idx][0] for idx in range(len(simple_metrics))])
    l13_complex = np.mean([complex_metrics[idx][0] for idx in range(len(complex_metrics))])
    freq_simple = np.mean([simple_metrics[idx][1] for idx in range(len(simple_metrics))])
    freq_complex = np.mean([complex_metrics[idx][1] for idx in range(len(complex_metrics))])
    l13_score = 100*(l13_simple - l13_complex)/l13_complex # percent change in L1-3 proportion
    freq_score = 100*(freq_simple - freq_complex)/freq_complex # percent change in squared frequency
    return l13_score, freq_score

#### Run pipelines to generate simple sentences

In [8]:
# print("Running LS...")
# simple_LS = [LS.LS_pipeline(sentence) for sentence in mcts_orig]
print("Running BART...")
simple_BART = [TS.TS_with_BART(sentence) for sentence in mcts_orig]
# print("Running BARTLS...")
# simple_BARTLS = [TS.TS_with_BART_LS(sentence) for sentence in mcts_orig]
# print("Running LSBART...")
# simple_LSBART = [TS.TS_with_LS_BART(sentence) for sentence in mcts_orig]

[autoreload of utils.TS_pipeline failed: Traceback (most recent call last):
  File "c:\Users\tempu\AppData\Local\Programs\Python\Python312\Lib\site-packages\transformers\utils\hub.py", line 403, in cached_file
    resolved_file = hf_hub_download(
                    ^^^^^^^^^^^^^^^^
  File "c:\Users\tempu\AppData\Local\Programs\Python\Python312\Lib\site-packages\huggingface_hub\utils\_validators.py", line 106, in _inner_fn
    validate_repo_id(arg_value)
  File "c:\Users\tempu\AppData\Local\Programs\Python\Python312\Lib\site-packages\huggingface_hub\utils\_validators.py", line 154, in validate_repo_id
    raise HFValidationError(
huggingface_hub.errors.HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '../data/mosdels/checkpoint_14064'. Use `repo_type` argument if needed.

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "c:\Users\tempu\AppData\Local\Programs\Python\Python312\Lib\site-pack

Running BART...


KeyboardInterrupt: 

In [None]:
print("Running LS...")
simple_LS_ps = [LS.LS_pipeline(sentence) for sentence in pseudo_orig[500000:500005]]
print("Running BART...")
simple_BART_ps = [TS.TS_with_BART(sentence) for sentence in pseudo_orig[500000:500005]]
print("Running BARTLS...")
simple_BARTLS_ps = [TS.TS_with_BART_LS(sentence) for sentence in pseudo_orig[500000:500005]]
print("Running LSBART...")
simple_LSBART_ps = [TS.TS_with_LS_BART(sentence) for sentence in pseudo_orig[500000:500005]]

Running LS...




Running BART...
Running BARTLS...
Running LSBART...


In [266]:
with open("../data/results/simple_LS.pickle", "wb") as handle:
    pickle.dump(simple_LS, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open("../data/results/simple_BART.pickle", "wb") as handle:
    pickle.dump(simple_BART, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open("../data/results/simple_BARTLS.pickle", "wb") as handle:
    pickle.dump(simple_BARTLS, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open("../data/results/simple_LSBART.pickle", "wb") as handle:
    pickle.dump(simple_LSBART, handle, protocol=pickle.HIGHEST_PROTOCOL)

#### Evaluate

In [5]:
def get_metrics_MCTS(simple_sentences: list):
    return get_metrics(simple_sentences, mcts_orig, mcts_ref)

def get_metrics_pseudo(simple_sentences: list):
    return get_metrics(simple_sentences, pseudo_orig[500000:500367], [pseudo_ref[500000:500367]])

def get_metrics(simple_sentences: list, complex_sentences: list, reference_sentences: list,):
    # tokenize sentences:
    tokenized_simplified = chinese_tokenizer(simple_sentences)
    tokenized_complex = chinese_tokenizer(complex_sentences)
    tokenized_reference = [chinese_tokenizer([reference_sentences[idx][ref] 
                                              for idx in range(len(reference_sentences))]) 
                                              for ref in range(len(complex_sentences))]

    # Compute SARI score:
    sari_score = sari.compute(
        predictions=tokenized_simplified, # model output
        references=tokenized_reference, # reference simple sentences
        sources=tokenized_complex # complex sentence
    )["sari"]

    # Compute BERT precision score:
    bert_score = bertscore.compute(
        predictions=tokenized_simplified, 
        references=tokenized_reference, 
        lang="zh"
        )["precision"][0]

    # Compute L1-3 and frequency scores:
    l13_score, freq_score = corpus_metrics(tokenized_complex, tokenized_simplified)

    # Print the result
    print("SARI Score:", sari_score)
    print("BERTScore (precision):", bert_score)
    print("L1-3 increase (%):", l13_score)
    print("Freq^2 increase (%):", freq_score)
    return {'sari_score': sari_score, 
            'bert_score': bert_score, 
            'l13_score': l13_score, 
            'freq_score': freq_score}

In [None]:
metric_baseline = get_metrics_MCTS(mcts_ref[0])

SARI Score: 55.75995861201505
BERTScore (precision): 1.0
L1-3 increase (%): 17.798284099434706
Freq^2 increase (%): 1.1999394822726044


In [7]:
# metric_LS = get_metrics_MCTS(simple_LS)
metric_BART = get_metrics_MCTS(simple_BART)
# metric_BARTLS = get_metrics_MCTS(simple_BARTLS)
# metric_LSBART = get_metrics_MCTS(simple_LSBART)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\tempu\AppData\Local\Temp\jieba.cache
Loading model cost 1.377 seconds.
Prefix dict has been built successfully.


SARI Score: 34.9127452771879
BERTScore (precision): 0.9389698505401611
L1-3 increase (%): -11.76362961373075
Freq^2 increase (%): -0.3636613371313572


In [280]:
metric_baseline_ps = get_metrics_pseudo(pseudo_ref[500000:500005])

SARI Score: 99.38011695906432
BERTScore (precision): 1.0000001192092896
L1-3 increase (%): 4.614116383851967
Freq^2 increase (%): 0.9541224335453964


In [279]:
metric_LS_ps = get_metrics_pseudo(simple_LS_ps)
metric_BART_ps = get_metrics_pseudo(simple_BART_ps)
metric_BARTLS_ps = get_metrics_pseudo(simple_BARTLS_ps)
metric_LSBART_ps = get_metrics_pseudo(simple_LSBART_ps)

SARI Score: 43.067211146786306
BERTScore (precision): 0.8094401955604553
L1-3 increase (%): 7.853118829895267
Freq^2 increase (%): -0.06386966014961797
SARI Score: 44.928491625673836
BERTScore (precision): 0.785178542137146
L1-3 increase (%): -24.088159449222445
Freq^2 increase (%): -1.5699054315998624
SARI Score: 41.2368886873214
BERTScore (precision): 0.7782107591629028
L1-3 increase (%): -14.433226009213602
Freq^2 increase (%): -2.168998479340594
SARI Score: 44.24405675106732
BERTScore (precision): 0.8079299926757812
L1-3 increase (%): -13.998877648201296
Freq^2 increase (%): -5.085667718351244


In [None]:
eval_df = pd.DataFrame(columns=["Method", "SARI", "BERTscore", "L1-3 (%)", "Mean freq rank"])