# BPE 算法评估

## RJieba 分词

In [1]:
import os 
import rjieba

# Locations of the training and test corpora.
data_path = "./data/"
train_file = os.path.join(data_path, "train_BPE.txt")
test_file = os.path.join(data_path, "test_BPE.txt")

# Settings shared by the BPE trainers in this notebook.
vocab_size = int(2e4)
min_freq = 1
# Maps a segmenter name to the token list it produced for the test text.
cut_res = {}

# Decode explicitly as UTF-8 instead of the platform default, so the Chinese
# text reads the same on every OS.
# NOTE(review): assumes the corpora are UTF-8 encoded - confirm.
with open(train_file, encoding="utf-8") as f:
    train_data = f.read()

# Only the read needs the open file handle; spaces are stripped from the test
# text before any segmenter sees it.
with open(test_file, encoding="utf-8") as f:
    test_data = f.read().replace(" ", "")

cut_res["rjieba"] = rjieba.cut(test_data)
# The first 38 rjieba tokens, re-joined, form the demo sentence.
example = "".join(cut_res["rjieba"][:38])
print(f"example: {example}")
print(f"rjieba:  {'|'.join(rjieba.cut(example))}")

example: 记者1月15日从上海铁路局淮南西站获悉,淮南铁路预计春运期间发送旅客33.3万人。预计客流最高峰日为2月6日,将发送旅客1.8万人,淮南东开往
rjieba:  记者|1|月|15|日|从|上海铁路局|淮南|西站|获悉|,|淮南|铁路|预计|春运期间|发送|旅客|33.3|万人|。|预计|客流|最高峰|日为|2|月|6|日|,|将|发送|旅客|1.8|万人|,|淮南|东|开往


## 腾讯 TexSmart 分词

In [2]:
import json
import requests
from tqdm import tqdm
def texsmart_cut(text: str, timeout: float = 30.0):
    """Segment ``text`` with the TexSmart HTTP API.

    Args:
        text: The string to segment. An empty string short-circuits and
            triggers no network request.
        timeout: Seconds to wait for the server before ``requests`` gives up,
            so a stalled connection cannot hang the notebook indefinitely.

    Returns:
        A dict with two keys, ``"word_list"`` and ``"phrase_list"``, each a
        list of the ``'str'`` fields taken from the corresponding list in the
        JSON response.

    Raises:
        requests.RequestException: On connection problems, a timeout, or a
            non-2xx HTTP status.
    """
    if text == "":
        return {
            "word_list": [],
            "phrase_list": []
        }

    url = "https://texsmart.qq.com/api"
    req_str = json.dumps({"str": text}).encode()

    r = requests.post(url, data=req_str, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    r.raise_for_status()
    r.encoding = "utf-8"
    res = r.json()
    return {
        "word_list": [item['str'] for item in res['word_list']],
        "phrase_list": [item['str'] for item in res['phrase_list']]
    }
# Show both TexSmart granularities on the demo sentence.
t = texsmart_cut(example)
print(f"texsmart word_list: {'|'.join(t['word_list'])}")
print(f"texsmart phrase_list: {'|'.join(t['phrase_list'])}")

# Segmenting the whole test text issues one HTTP request per line, so the
# results are cached on disk and reused when both cache files exist.
if os.path.exists("output/testsmart_word.json") and os.path.exists("output/testsmart_phrase.json"):
    with open("output/testsmart_word.json", encoding="utf-8") as f:
        cut_res["T word"] = json.load(f)
    with open("output/testsmart_phrase.json", encoding="utf-8") as f:
        cut_res["T phrase"] = json.load(f)
else:
    word_list = []
    phrase_list = []
    for data in tqdm(test_data.split("\n")):
        res = texsmart_cut(data)
        # Re-insert the newline that split("\n") removed after every line.
        word_list.extend(res["word_list"] + ['\n'])
        phrase_list.extend(res["phrase_list"] + ['\n'])
    # Drop the extra newline appended after the last line.
    cut_res["T word"] = word_list[:-1]
    cut_res["T phrase"] = phrase_list[:-1]
    # The cache directory may not exist on a fresh checkout.
    os.makedirs("output", exist_ok=True)
    # Dump under the same keys that were just assigned; the JSON keeps raw
    # (non-escaped) characters, so the files are written explicitly as UTF-8.
    with open("output/testsmart_word.json", "w", encoding="utf-8") as f:
        json.dump(cut_res["T word"], f, ensure_ascii=False)
    with open("output/testsmart_phrase.json", "w", encoding="utf-8") as f:
        json.dump(cut_res["T phrase"], f, ensure_ascii=False)


texsmart word_list: 记者|1|月|15|日|从|上海|铁路|局|淮南|西站|获悉|,|淮南|铁路|预计|春运|期间|发送|旅客|33|.|3|万|人|。|预计|客流|最|高|峰|日|为|2|月|6|日|,|将|发送|旅客|1|.|8|万|人|,|淮南|东|开往
texsmart phrase_list: 记者|1月15日|从|上海铁路局|淮南|西站|获悉|,|淮南铁路|预计|春运期间|发送|旅客|33.3|万|人|。|预计|客流|最高峰|日|为|2月6日|,|将|发送|旅客|1.8|万|人|,|淮南东|开往


## Hugging Face Tokenizer

In [3]:

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
# TOKENIZERS_PARALLELISM accepts (true | false); it is set to "false" here
# before the tokenizer is trained.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

# Trainer settings reuse the notebook-wide vocabulary size and minimum frequency.
trainer = BpeTrainer(
    special_tokens=["<UNK>", "\n"],
    vocab_size=vocab_size,
    min_frequency=min_freq,
    show_progress=False,
)

# BPE model with an explicit unknown token and a Whitespace pre-tokenizer.
tokenizer = Tokenizer(BPE(unk_token="<UNK>"))
tokenizer.pre_tokenizer = Whitespace()

# The whole training text is supplied as a single-item iterator.
tokenizer.train_from_iterator([train_data], trainer=trainer)

def hg_cut(tokenizer, text):
    """Tokenize ``text`` and replace every ``<UNK>`` token by its source text.

    The source span of each token is taken from the encoding's ``offsets``
    rather than from a running character counter. A counter drifts as soon as
    the tokenizer drops characters (e.g. whitespace removed by the
    pre-tokenizer) or emits a token whose text length differs from the span
    it covers.

    Args:
        tokenizer: Object whose ``encode(text)`` returns an encoding exposing
            parallel ``tokens`` and ``offsets`` sequences.
        text: The string to segment.

    Returns:
        A new list of token strings in which each ``"<UNK>"`` entry has been
        replaced by the slice of ``text`` it covers.
    """
    encoding = tokenizer.encode(text)
    tokens = []
    for token, (start, end) in zip(encoding.tokens, encoding.offsets):
        if token == "<UNK>":
            # Keep the placeholder if the reported span is empty.
            token = text[start:end] or token
        tokens.append(token)
    return tokens
# Segment the demo sentence and print its tokens separated by "|".
output = hg_cut(tokenizer, example)
print(*output, sep="|")
# Record the segmentation of the full test text under the "HG" key.
cut_res.update({"HG": hg_cut(tokenizer, test_data)})

记者|1月|15日|从|上海|铁路|局|淮|南|西|站|获悉|,|淮|南|铁路|预计|春运期间|发送|旅客|33|.|3|万人|。|预计|客流|最高|峰|日|为|2月|6日|,|将|发送|旅客|1|.|8|万人|,|淮|南|东|开往


## Subword NMT

In [4]:
import subprocess
# Files produced by `subword-nmt learn-joint-bpe-and-vocab`: the file passed
# to -o and the vocabulary written via --write-vocabulary.
nmt_log_path = f"./output/bpe{int(vocab_size // 1000)}k.txt"
nmt_vocab_path = "./output/nmt_vocab.txt"
# Learning is skipped when both files are already present.
if not (os.path.exists(nmt_log_path) and os.path.exists(nmt_vocab_path)):
    # The output directory may not exist on a fresh checkout.
    os.makedirs(os.path.dirname(nmt_log_path), exist_ok=True)
    # check=True raises CalledProcessError when the command fails, instead of
    # continuing with missing or partial files.
    subprocess.run(
        [
            'subword-nmt', 'learn-joint-bpe-and-vocab',
            '-i', train_file,
            '--symbols', str(vocab_size),
            '-o', nmt_log_path,
            '--write-vocabulary', nmt_vocab_path,
        ],
        check=True,
    )

In [5]:
import re                                                                                                                                                                                              
# Full-width / CJK punctuation, plus the ASCII space.
puncs_zh = [
    ' ', '。', '，', '？', '！', '；', '：', '、', '（', '）', '「', '」',
    '“', '”', '‘', '’', '《', '》', '【', '】', '…', '—', '～',
]
# ASCII punctuation (repeated entries collapse once merged into the set).
puncs_en = [
    '.', ',', '?', '!', ';', ':', '(', ')', '"', '"',
    '\'', '\'', '<', '>', '[', ']', '.', '~',
]
# Every delimiter character: both lists plus newline and tab.
puncs = set(puncs_zh) | set(puncs_en) | {"\n", "\t"}
# One capturing alternation over the escaped delimiters; the capture group
# makes re.split return the delimiters as well.
pattern = re.compile("(" + "|".join(re.escape(mark) for mark in puncs) + ")")

def split_with_puncs(text: str) -> list:
    """Split ``text`` with the module-level ``pattern`` and drop empty pieces."""
    return [piece for piece in pattern.split(text) if piece]

def nmt_cut(txt):
    """Segment ``txt`` with the ``subword-nmt apply-bpe`` command-line tool.

    The text is written to a scratch file, the tool is run with the learned
    codes (``nmt_log_path``) and vocabulary (``nmt_vocab_path``), and the
    output file is read back. Each output line is split on the ``"@@ "``
    marker and every piece is further split by ``split_with_puncs``.

    Args:
        txt: The text to segment.

    Returns:
        A flat list of token strings.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status. Without this check, a stale output file left by an
            earlier call would be read back silently.
    """
    tmp_in_file = "tmp/nmt_in.txt"
    tmp_out_file = "tmp/nmt_out.txt"
    # The scratch directory may not exist on a fresh checkout.
    os.makedirs(os.path.dirname(tmp_in_file), exist_ok=True)
    # Explicit UTF-8 rather than the platform default.
    # NOTE(review): assumes subword-nmt reads and writes UTF-8 - confirm.
    with open(tmp_in_file, "w", encoding="utf-8") as f:
        f.write(txt)
    subprocess.run(
        [
            'subword-nmt', 'apply-bpe',
            '-c', nmt_log_path,
            '--vocabulary', nmt_vocab_path,
            '-i', tmp_in_file,
            '-o', tmp_out_file,
            '--dropout', '0',
        ],
        check=True,
    )
    res = []
    with open(tmp_out_file, encoding="utf-8") as f:
        for line in f:
            for word in line.split("@@ "):
                res.extend(split_with_puncs(word))
    return res
# Record the subword-nmt segmentation of the full test text.
cut_res.update({"subword nmt": nmt_cut(test_data)})
# Trailing expression: the demo sentence's tokens joined with "|".
"|".join(nmt_cut(example))

'记者|1月|15日|从|上海|铁路|局|淮南|西|站|获悉|,|淮南|铁路|预计|春运|期间|发送|旅客|3|3|.|3|万人|。|预计|客流|最高|峰|日|为|2月|6日|,|将|发送|旅客|1|.|8万|人|,|淮南|东|开|往'

## Efficient BPE (My Implementation)

In [6]:
from ebpe import BPETrainer, BPE
# Train the ebpe tokenizer on the whole training text, supplied as a
# single-item iterator.
# NOTE(review): this rebinds the name `tokenizer`, which the Hugging Face cell
# earlier in the notebook also assigns.
tokenizer: BPE = BPETrainer(
    vocab_size=vocab_size,
    min_freq=min_freq,
    single_char=False,
    compress_threshold=0.3,
).train_from_iter([train_data])

# Show each segmentation method on the demo sentence.
print("forward \t" + "|".join(tokenizer.decode_forward(example)))
print("backward\t" + "|".join(tokenizer.decode_backward(example)))
print("bidirect\t" + "|".join(tokenizer.decode_bidirectional(example)))
print("bpe raw \t" + "|".join(tokenizer.tokenize(example)))

# Record each method's segmentation of the full test text.
cut_res["forward*"] = tokenizer.decode_forward(test_data)
cut_res["backward*"] = tokenizer.decode_backward(test_data)
cut_res["bidirect*"] = tokenizer.decode_bidirectional(test_data)
cut_res["bpe_raw*"] = tokenizer.tokenize(test_data)

Using custom pairwise
forward 	记者|1月|15日|从|上海|铁路|局|淮南|西|站|获悉|,|淮南|铁路|预计|春运期间|发送|旅客|33|.|3万人|。|预计|客流|最高|峰|日|为2|月|6日|,|将|发送|旅客|1|.|8万|人|,|淮南|东|开往
backward	记者|1月|15日|从|上海|铁路|局|淮南|西|站|获悉|,|淮南|铁路|预计|春运期间|发送|旅客|33|.|3万人|。|预计|客流|最|高峰|日|为|2月|6日|,|将|发送|旅客|1|.|8|万人|,|淮南|东|开往
bidirect	记者|1月|15日|从|上海|铁路|局|淮南|西|站|获悉|,|淮南|铁路|预计|春运期间|发送|旅客|33|.|3万人|。|预计|客流|最|高峰|日|为|2月|6日|,|将|发送|旅客|1|.|8|万人|,|淮南|东|开往
bpe raw 	记者|1月|15日|从|上海|铁路|局|淮南|西|站|获悉|,|淮南|铁路|预计|春运期间|发送|旅客|33|.|3万人|。|预计|客流|最高|峰|日|为|2月|6日|,|将|发送|旅客|1|.|8|万人|,|淮南|东|开往


## Evaluate (F1 Score)

In [7]:
from typing import Set, Tuple, List, Dict
import pandas as  pd
def words2pos(words: List[str]):
    """Convert a token sequence into the set of ``(start, end)`` character spans.

    Offsets are cumulative over the token lengths: each token starts where the
    previous one ended, and ``end`` is exclusive.
    """
    spans = set()
    start = 0
    for token in words:
        end = start + len(token)
        spans.add((start, end))
        start = end
    return spans

def F1_score(pred: Set[Tuple[int, int]], true: Set[Tuple[int, int]]):
    """
    Compute the F1 score between two segmentations given as span sets.

    pred: the predicted segmentation result
    true: the reference (true) segmentation result

    Returns the harmonic mean of precision and recall as a float. When the two
    sets share no span (including when either set is empty), the result is
    0.0 rather than a ZeroDivisionError.
    """
    TP = len(pred & true)
    # No overlap means precision and/or recall is zero or undefined; every
    # denominator below is non-zero once TP > 0.
    if TP == 0:
        return 0.0
    precision = TP / len(pred)
    recall = TP / len(true)
    F1 = 2 * precision * recall / (precision + recall)
    return F1

def evaluate(data: Dict[str, List[str]]):
    """Build the pairwise F1 table for a collection of segmentations.

    Each token list in ``data`` is converted to a span set with ``words2pos``;
    every ordered pair is then scored with ``F1_score``. In the returned
    DataFrame the columns are the segmenter used as the prediction and the
    rows are the segmenter used as the reference.
    """
    spans = {name: words2pos(tokens) for name, tokens in data.items()}
    table = {}
    for pred_name, pred in spans.items():
        column = {}
        for true_name, true in spans.items():
            column[true_name] = F1_score(pred, true)
        table[pred_name] = column
    return pd.DataFrame(table)

# Score every pair of segmentations collected in cut_res; as the cell's last
# expression, the resulting table is displayed as its output.
evaluate(cut_res)

Unnamed: 0,rjieba,T word,T phrase,HG,subword nmt,forward*,backward*,bidirect*,bpe_raw*
rjieba,1.0,0.809689,0.802598,0.646774,0.66796,0.665888,0.660791,0.663305,0.668211
T word,0.809689,1.0,0.755691,0.701192,0.721894,0.703979,0.698688,0.700842,0.707145
T phrase,0.802598,0.755691,1.0,0.624379,0.643529,0.6463,0.639902,0.643001,0.648838
HG,0.646774,0.701192,0.624379,1.0,0.923624,0.907419,0.905487,0.909452,0.944053
subword nmt,0.66796,0.721894,0.643529,0.923624,1.0,0.897476,0.896734,0.899905,0.92879
forward*,0.665888,0.703979,0.6463,0.907419,0.897476,1.0,0.93091,0.952883,0.958321
backward*,0.660791,0.698688,0.639902,0.905487,0.896734,0.93091,1.0,0.977997,0.958039
bidirect*,0.663305,0.700842,0.643001,0.909452,0.899905,0.952883,0.977997,1.0,0.962347
bpe_raw*,0.668211,0.707145,0.648838,0.944053,0.92879,0.958321,0.958039,0.962347,1.0
