当文本中实际使用的词的数量远远小于 BERT tokenizer 的词表大小时，\
使用本模块可以有效减少 embedding 的参数量\

实现了一个类 Mytokenizer\
会在实例化时生成中文和英文的 token-id 字典，提供 token 与 id 的相互转换功能，并且可以添加起始/结束控制符和填充符\


In [1]:
import numpy as np
import re

In [2]:
def MakeDictWithControlCharacter(weather_control):
    """Create the seed vocabulary that holds the special control tokens.

    ``<PAD>`` is always present with id 0. When ``weather_control`` is
    truthy, ``<BOS>`` and ``<EOS>`` are added with ids 1 and 2.

    Args:
        weather_control: Whether to add the control tokens other than
            ``<PAD>``.

    Returns:
        A ``(vocab, size)`` tuple: the token-to-id dict and its length.
    """
    special_tokens = ['<PAD>']
    if weather_control:
        special_tokens += ['<BOS>', '<EOS>']
    # Ids follow the position of each token in the list above.
    vocab = {token: index for index, token in enumerate(special_tokens)}
    return vocab, len(vocab)
    

In [3]:
def make_ch_dic(path,weather_control):
    """Build a character-to-id vocabulary from the Chinese column of a corpus.

    Every line of the file is split on tabs and the second field is taken as
    the Chinese sentence. The characters ``, | . 。 ? ？ ！ !`` are removed
    (``|`` is literal inside the character class) and every remaining
    character receives the next free id in order of first appearance.

    Lines without a second tab-separated field (for example blank lines) are
    skipped, and the line terminator is stripped so that it never becomes a
    vocabulary entry.

    Args:
        path: Path of the UTF-8 encoded, tab-separated corpus text file.
        weather_control: Forwarded to ``MakeDictWithControlCharacter``; when
            true the dict starts with pad=0, bos=1, eos=2.

    Returns:
        The token-to-id dict. Its size is also printed to stdout.
    """
    dic, _ = MakeDictWithControlCharacter(weather_control)
    with open(path, 'r', encoding='utf8') as f:
        # Stream the file instead of materialising every line at once.
        for line in f:
            fields = line.rstrip('\n').split('\t')
            if len(fields) < 2:
                # No Chinese column on this line; nothing to index.
                continue
            cleaned_ch = re.sub("[,|,|.|。|?|？|！|!]", '', fields[1])
            for word in cleaned_ch:
                # Ids are contiguous from 0, so the current size is the
                # next free id.
                dic.setdefault(word, len(dic))
    print('中文字典字数',len(dic))
    return dic

In [4]:
def make_en_dic(path,weather_control):
    """Build a word-to-id vocabulary from the English column of a corpus.

    Every line of the file is split on tabs and the first field is taken as
    the English sentence. The characters ``, | . ! ?`` are removed (``|`` is
    literal inside the character class), the sentence is split on single
    spaces, and every distinct word receives the next free id in order of
    first appearance. Words are case-sensitive.

    The line terminator is stripped and blank lines are skipped so that a
    newline never ends up inside a vocabulary entry.

    Args:
        path: Path of the UTF-8 encoded, tab-separated corpus text file.
        weather_control: Forwarded to ``MakeDictWithControlCharacter``;
            controls whether the control tokens are added.

    Returns:
        The token-to-id dict. Its size is also printed to stdout.
    """
    dic, _ = MakeDictWithControlCharacter(weather_control)
    with open(path, 'r', encoding='utf8') as f:
        # Stream the file instead of materialising every line at once.
        for line in f:
            line = line.rstrip('\n')
            if not line:
                # Blank line: there is no sentence to index.
                continue
            en = line.split('\t')[0]
            cleaned_en = re.sub("[,|.|!|?]",'',en)
            for word in cleaned_en.split(' '):
                # Ids are contiguous from 0, so the current size is the
                # next free id.
                dic.setdefault(word, len(dic))
    print('英文字典字数',len(dic))
    return dic

In [5]:
class Mytokenizer():
    """Small-vocabulary tokenizer for a Chinese/English parallel corpus.

    On construction it builds a Chinese and an English token-to-id dict from
    the corpus file, plus the reverse id-to-token dicts. It converts
    space-separated sentences to padded id lists and back.

    Args:
        path: Path of the corpus text file (cmn.txt).
        tgr: Lower-case code of the target language, ``'ch'`` for Chinese or
            ``'en'`` for English. (The name is a historical misspelling of
            "trg".) Only sentences of the target language are wrapped in
            ``<BOS>`` / ``<EOS>``.
    """

    # Ids of the control tokens. Both dicts are built with the control
    # tokens enabled, which places pad=0, bos=1, eos=2 at the front.
    PAD_ID = 0
    BOS_ID = 1
    EOS_ID = 2

    def __init__(self,path,tgr='en'):
        self.ch_dic = make_ch_dic(path,True)
        self.en_dic = make_en_dic(path,True)
        self.id_ch_dic = {v:k for k,v in self.ch_dic.items()}
        self.id_en_dic = {v:k for k,v in self.en_dic.items()}
        self.tgr=tgr

    def ch_token_id(self,tokens:list,max_len):
        """Convert Chinese sentences to padded id lists.

        Args:
            tokens: List of sentences, each with its tokens separated by a
                single space, e.g. ``['我 会', '你 好']``.
            max_len: Length every returned id list is padded to.

        Returns:
            One id list of length ``max_len`` per sentence. When the target
            language is ``'ch'`` each sentence is wrapped in BOS/EOS before
            padding.

        Raises:
            KeyError: A token is not in the Chinese vocabulary.
            AssertionError: A sentence (including BOS/EOS) is longer than
                ``max_len``.
        """
        return self._encode(tokens, self.ch_dic, max_len, self.tgr=='ch')

    def ch_id_token(self,id_list:list):
        """Convert lists of Chinese ids back to lists of tokens.

        Raises:
            KeyError: An id is not in the Chinese vocabulary.
        """
        return self._decode(id_list, self.id_ch_dic)

    def en_id_token(self,id_list:list):
        """Convert lists of English ids back to lists of tokens.

        Raises:
            KeyError: An id is not in the English vocabulary.
        """
        return self._decode(id_list, self.id_en_dic)

    def en_token_id(self,tokens:list,max_len):
        """Convert English sentences to padded id lists.

        Args:
            tokens: List of sentences, each with its words separated by a
                single space, e.g. ``['can you do this', 'Hi']``.
            max_len: Length every returned id list is padded to.

        Returns:
            One id list of length ``max_len`` per sentence. When the target
            language is ``'en'`` each sentence is wrapped in BOS/EOS before
            padding.

        Raises:
            KeyError: A word is not in the English vocabulary.
            AssertionError: A sentence (including BOS/EOS) is longer than
                ``max_len``.
        """
        return self._encode(tokens, self.en_dic, max_len, self.tgr=='en')

    def get_vocab(self):
        """Return the two vocabulary sizes in ``(src, trg)`` order.

        Returns ``None`` when ``tgr`` is neither ``'en'`` nor ``'ch'``.
        """
        if self.tgr=='en':
            return len(self.ch_dic),len(self.en_dic)
        if self.tgr=='ch':
            return len(self.en_dic),len(self.ch_dic)
        return None

    def _encode(self, sentences, vocab, max_len, is_target):
        """Shared implementation of ``ch_token_id`` and ``en_token_id``."""
        encoded = []
        for sentence in sentences:
            sentence_ids = [vocab[word] for word in sentence.split(" ")]
            if is_target:
                # Only target-language sentences carry BOS/EOS.
                sentence_ids = self.__add_control_element(sentence_ids)
            encoded.append(self.__add_pad(sentence_ids, max_len))
        return encoded

    @staticmethod
    def _decode(id_list, id_to_token):
        """Shared implementation of ``ch_id_token`` and ``en_id_token``."""
        return [[id_to_token[token_id] for token_id in ids] for ids in id_list]

    def __add_control_element(self,ids:list):
        """Return a copy of ``ids`` wrapped in BOS and EOS.

        Example: ``[5, 6, 7, 4, 9]`` becomes ``[1, 5, 6, 7, 4, 9, 2]``.
        """
        return [self.BOS_ID] + list(ids) + [self.EOS_ID]

    def __add_pad(self,ids:list,max_len):
        """Return a copy of ``ids`` right-padded with PAD up to ``max_len``.

        Raises:
            AssertionError: ``ids`` is already longer than ``max_len``.
        """
        length = len(ids)
        if length > max_len:
            # Raised explicitly (not via ``assert``) so the check also runs
            # under ``python -O``; the exception type is unchanged.
            raise AssertionError(
                "sequence length {} exceeds max_len {}".format(length, max_len)
            )
        return list(ids) + [self.PAD_ID] * (max_len - length)
    
    
        

使用方式展示：请将上面的代码保存为 python 文件，并在主函数中调用。以下仅为使用演示，请勿直接在此处运行。

In [6]:
if __name__ == "__main__":
    # Demo inputs: tokens inside a sentence are separated by single spaces.
    demo_max_len = 10
    ch_sentences = ["这 句 话 你 会 说 么", "我 会 个 锤 子"]
    en_sentences = ["can you do this one", "Hi"]

    # Build both vocabularies from the corpus; English is the target side.
    tokenizer = Mytokenizer(path='./cmn.txt', tgr='en')
    print(tokenizer.en_dic)

    # Sentences -> padded id lists.
    id_ch = tokenizer.ch_token_id(ch_sentences, demo_max_len)
    id_en = tokenizer.en_token_id(en_sentences, demo_max_len)
    for encoded in (id_ch, id_en):
        print(encoded)

    # Id lists -> tokens.
    tokens_ch = tokenizer.ch_id_token(id_ch)
    tokens_en = tokenizer.en_id_token(id_en)
    for decoded in (tokens_ch, tokens_en):
        print(decoded)

中文字典字数 3643
英文字典字数 8349
[[195, 2204, 203, 4, 21, 183, 54, 0, 0, 0], [16, 21, 169, 1666, 170, 0, 0, 0, 0, 0]]
[[1, 270, 72, 290, 137, 441, 2, 0, 0, 0], [1, 3, 2, 0, 0, 0, 0, 0, 0, 0]]
[['这', '句', '话', '你', '会', '说', '么', '<PAD>', '<PAD>', '<PAD>'], ['我', '会', '个', '锤', '子', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']]
[['<BOS>', 'can', 'you', 'do', 'this', 'one', '<EOS>', '<PAD>', '<PAD>', '<PAD>'], ['<BOS>', 'Hi', '<EOS>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']]
