In [46]:
# https://www.geeksforgeeks.org/byte-pair-encoding-bpe-in-nlp/
import re
from tqdm import tqdm
import pandas as pd

ids = [1, 1, 1, 2, 2, 3,4]
vales = ["十二郭", "十二郭 (11/11期)", "十二郭 (11/11期)", "優食-1 十二郭", "優食-十二郭", "優食-十二郭","好爽"]
data = {
        'id': ids,
        'value': vales
        }
df = pd.DataFrame(data)

In [48]:

class BytepairEncoding:
    def __init__(self, series):
        self.vocab = {}
        for val in tqdm(series.value_counts().reset_index().values):
            k = " ".join(val[0]) + " "
            self.vocab[k] = val[1]

    def _get_stats(self, vocab):
        from collections import defaultdict
        pairs = defaultdict(int)
        for word, freq in vocab.items():
            symbols = word.split()
            for i in range(len(symbols) - 1):
                pairs[symbols[i], symbols[i + 1]] += freq
        return pairs
    def merge_vocab(self, pair, v_in):
        """
        給定一對字符和一個詞彙表，返回一個新的詞彙表，
        其中將這對字符在詞彙表中出現的地方合併在一起。
        同時更新這對字符在新詞彙表中的頻率計數。
        """
        v_out = {}
        bigram = re.escape(' '.join(pair))
        p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
        for word in v_in:
            w_out = p.sub(''.join(pair), word)
            v_out[w_out] = v_in[word]
        return v_out
    def run(self, num_merges):
        vocab_res = []
        best_res = []
        for i in tqdm(range(num_merges)):
            pairs = self._get_stats(self.vocab)
            # 返回values最大的值

            best = max(pairs, key=pairs.get)

            best_res.append(best)
            print(f"best: {best}")
            self.vocab = self.merge_vocab(best, self.vocab)
            if len(max(best, key=len)) == 2:
                vocab_res.extend([x for x in best if len(x) >= 2])
            if len(max(best, key=len)) == 3:
                vocab_res.append(best[0] + best[1])
                print(f"add: {best}")
            
            print("-"*30)
        self.vocab_res = set(vocab_res)
        return best_res

In [49]:
bpe = BytepairEncoding(df["value"])
bpe.run(30)

100%|██████████| 5/5 [00:00<?, ?it/s]
100%|██████████| 30/30 [00:00<00:00, 1570.21it/s]

best: ('十', '二')
------------------------------
best: ('十二', '郭')
------------------------------
best: ('1', '1')
------------------------------
best: ('優', '食')
------------------------------
best: ('優食', '-')
------------------------------
best: ('優食-', '十二郭')
add: ('優食-', '十二郭')
------------------------------
best: ('十二郭', '(')
add: ('十二郭', '(')
------------------------------
best: ('十二郭', '(')
add: ('十二郭', '(')
------------------------------
best: ('十二郭', '(')
add: ('十二郭', '(')
------------------------------
best: ('十二郭', '(')
add: ('十二郭', '(')
------------------------------
best: ('十二郭', '(')
add: ('十二郭', '(')
------------------------------
best: ('十二郭', '(')
add: ('十二郭', '(')
------------------------------
best: ('十二郭', '(')
add: ('十二郭', '(')
------------------------------
best: ('十二郭', '(')
add: ('十二郭', '(')
------------------------------
best: ('十二郭', '(')
add: ('十二郭', '(')
------------------------------
best: ('十二郭', '(')
add: ('十二郭', '(')
------------------------------
best: 




[('十', '二'),
 ('十二', '郭'),
 ('1', '1'),
 ('優', '食'),
 ('優食', '-'),
 ('優食-', '十二郭'),
 ('十二郭', '('),
 ('十二郭', '('),
 ('十二郭', '('),
 ('十二郭', '('),
 ('十二郭', '('),
 ('十二郭', '('),
 ('十二郭', '('),
 ('十二郭', '('),
 ('十二郭', '('),
 ('十二郭', '('),
 ('十二郭', '('),
 ('十二郭', '('),
 ('十二郭', '('),
 ('十二郭', '('),
 ('十二郭', '('),
 ('十二郭', '('),
 ('十二郭', '('),
 ('十二郭', '('),
 ('十二郭', '('),
 ('十二郭', '('),
 ('十二郭', '('),
 ('十二郭', '('),
 ('十二郭', '('),
 ('十二郭', '(')]