In [53]:
from pypinyin import pinyin, Style
from pack.four_corner_method import FourCornerMethod

In [54]:
class ChineseCharacterCoder:
    """Encode a single Chinese character as a glyph code plus a pronunciation code.

    The glyph code concatenates the character's structure code, its
    four-corner code and a one-character stroke-count code.  The
    pronunciation code is four characters: initial, final, coda and tone.

    Constructing an instance reads two lookup tables from disk
    (``高阶数据集/hanzijiegou_2w.txt`` and
    ``高阶数据集/chinese_unicode_table.txt``), resolved relative to the
    current working directory.
    """

    # Initials (shengmu).  The two-letter initials come before their
    # one-letter prefixes so "zh"/"ch"/"sh" win over "z"/"c"/"s".
    _INITIALS = (
        "b", "p", "m", "f", "d", "t", "n", "l", "g", "k", "h", "j", "q", "x",
        "zh", "ch", "sh", "r", "z", "c", "s", "y", "w",
    )

    # Initial -> one-character code.
    _INITIALS_MAPPING = {
        'b': '1', 'p': '2', 'm': '3', 'f': '4', 'd': '5', 't': '6', 'n': '7', 'l': '8',
        'g': '9', 'k': 'a', 'h': 'b', 'j': 'c', 'q': 'd', 'x': 'e', 'zh': 'f', 'ch': 'g',
        'sh': 'h', 'r': 'i', 'z': 'j', 'c': 'k', 's': 'l', 'y': 'm', 'w': 'n',
    }

    # Final (yunmu) -> one-character code.
    _FINALS_MAPPING = {
        'a': '1', 'o': '2', 'e': '3', 'i': '4', 'u': '5', 'ü': '6', 'ai': '7', 'ei': '8',
        'ui': '9', 'ao': 'a', 'ou': 'b', 'iu': 'c', 'ie': 'd', 'üe': 'e', 'er': 'f',
        'an': 'g', 'en': 'h', 'in': 'i', 'un': 'j', 'ün': 'k', 'ang': 'l', 'eng': 'm',
        'ing': 'n', 'ong': 'o',
    }

    # Finals ordered longest-first.  Matching must try "ao" before "o" and
    # "ang" before any shorter suffix, otherwise compound finals never match.
    _FINALS_LONGEST_FIRST = tuple(sorted(_FINALS_MAPPING, key=len, reverse=True))

    # Coda (the medial left between initial and final) -> one-character code.
    _CODA_MAPPING = {'': '0', 'u': '1', 'i': '1'}

    def __init__(self):
        """Load the structure table and the stroke-count table from disk.

        Raises:
            OSError: if either data file cannot be opened.
            KeyError: if the stroke table contains a stroke count that has
                no entry in ``strokes_dict`` (i.e. outside 1..51).
        """
        # Character -> structure code.
        self.structure_dict = {}
        # Stroke count (as a decimal string) -> single code character:
        # '1'..'9', then 'A'..'Z' for 10..35, then 'a'..'p' for 36..51.
        self.strokes_dict = {
            '1':'1', '2':'2', '3':'3', '4':'4', '5':'5', '6':'6', '7':'7', '8':'8', '9':'9', '10':'A',
            '11':'B', '12':'C', '13':'D', '14':'E', '15':'F', '16':'G', '17':'H', '18':'I', '19':'J', '20':'K',
            '21':'L', '22':'M', '23':'N', '24':'O', '25':'P', '26':'Q', '27':'R', '28':'S', '29':'T', '30':'U',
            '31':'V', '32':'W', '33':'X', '34':'Y', '35':'Z', '36':'a', '37':'b', '38':'c', '39':'d', '40':'e',
            '41':'f', '42':'g', '43':'h', '44':'i', '45':'j', '46':'k', '47':'l', '48':'m', '49':'n', '50':'o',
            '51':'p'}

        # Structure table: one "<structure>\t<character>" pair per line;
        # lines that do not split into exactly two fields are ignored.
        with open('高阶数据集/hanzijiegou_2w.txt', 'r', encoding='utf-8') as file:
            for line in file:
                parts = line.strip().split('\t')
                if len(parts) == 2:
                    structure, chinese_character = parts
                    self.structure_dict[chinese_character] = structure

        # Stroke table: whitespace-separated columns; column 1 is the
        # character and column 7 is its stroke count.
        self.chinese_char_map = {}
        with open('高阶数据集/chinese_unicode_table.txt', 'r', encoding='UTF-8') as f:
            for line_number, line in enumerate(f):
                if line_number < 6:
                    # The first 6 lines are the table header.
                    continue
                line_info = line.strip().split()
                if len(line_info) < 7:
                    # Blank or truncated line: nothing to record.
                    continue
                self.chinese_char_map[line_info[0]] = self.strokes_dict[line_info[6]]

    @classmethod
    def _split_syllable(cls, py):
        """Split one numeric-tone pinyin syllable into its components.

        Args:
            py: a syllable such as ``"zhuang1"``; the trailing tone digit is
                optional (neutral tone).

        Returns:
            ``(initial, final, coda, tone)`` as strings; any component that
            is absent is the empty string.
        """
        if not py:
            return "", "", "", ""

        # pypinyin spells "ü" as "v" in its numeric-tone styles unless
        # v_to_u is requested; normalise so the "ü" finals can match.
        py = py.replace("v", "ü")

        tone = ""
        if py[-1].isdigit():
            tone = py[-1]
            py = py[:-1]

        initial = next((cand for cand in cls._INITIALS if py.startswith(cand)), "")
        py = py[len(initial):]

        final = next((cand for cand in cls._FINALS_LONGEST_FIRST if py.endswith(cand)), "")
        if final:
            py = py[:-len(final)]

        # Whatever remains between the initial and the final is the coda.
        return initial, final, py, tone

    def split_pinyin(self, chinese_character):
        """Return ``(initial, final, coda, tone)`` for a character's pinyin.

        For characters with several readings the first reading reported by
        ``pinyin`` is used.  If no reading is returned at all, four empty
        strings are returned.
        """
        pinyin_result = pinyin(chinese_character, style=Style.TONE3, heteronym=True)

        if not pinyin_result or not pinyin_result[0]:
            return "", "", "", ""

        return self._split_syllable(pinyin_result[0][0])

    def generate_pronunciation_code(self, hanzi):
        """Return the four-character pronunciation code for ``hanzi``.

        The code is ``initial + final + coda + tone``; an initial, final or
        coda without a table entry is encoded as ``'0'``.
        """
        initial, final, coda, tone = self.split_pinyin(hanzi)

        # Neutral-tone characters (e.g. '了') carry no tone digit.
        if tone == '':
            tone = '0'

        initial_code = self._INITIALS_MAPPING.get(initial, '0')
        final_code = self._FINALS_MAPPING.get(final, '0')
        coda_code = self._CODA_MAPPING.get(coda, '0')

        return initial_code + final_code + coda_code + tone

    def generate_glyph_code(self, hanzi):
        """Return the glyph code: structure code + four-corner code + stroke code.

        Raises:
            KeyError: if ``hanzi`` is missing from the structure table or
                the stroke table.
        """
        structure_code = self.structure_dict[hanzi]

        # NOTE(review): a new FourCornerMethod is built on every call and
        # query() is assumed to return a string here - confirm, and consider
        # reusing one instance if construction is expensive.
        fcc = FourCornerMethod().query(hanzi)

        stroke = self.chinese_char_map[hanzi]

        return structure_code + fcc + stroke

    def generate_character_code(self, hanzi):
        """Return the full code: glyph code followed by pronunciation code."""
        return self.generate_glyph_code(hanzi) + self.generate_pronunciation_code(hanzi)



# 3.构建字符相似性网络

In [62]:
# 构建字符相似性网络（用矩阵形式表示）
def compute_sim_mat(chinese_characters, chinese_characters_count, chinese_characters_code):
    """Build the symmetric character-similarity matrix and dump it to disk.

    Args:
        chinese_characters: sequence of characters; its order defines the
            row/column order of the matrix.
        chinese_characters_count: not used by this function.
        chinese_characters_code: mapping from character to the code passed
            to ``computeSSCsimilarity``.

    Returns:
        A list of lists where entry ``[i][j]`` is the similarity between
        characters ``i`` and ``j``.  The same matrix is also written to
        ``similarity_matrix.txt`` as tab-separated rows.
    """
    size = len(chinese_characters)
    matrix = [[0] * size for _ in range(size)]

    # Only the upper triangle (diagonal included) is computed; each value
    # is mirrored into the lower triangle.
    for row in tqdm(range(size), desc='Constructing Similarity Matrix', unit='i'):
        for col in range(row, size):
            score = computeSSCsimilarity(
                chinese_characters_code[chinese_characters[row]],
                chinese_characters_code[chinese_characters[col]],
            )
            matrix[row][col] = matrix[col][row] = score

    # Persist the matrix, one tab-separated row per line.
    with open('similarity_matrix.txt', 'w', encoding='utf-8') as out:
        out.writelines(
            '\t'.join(str(value) for value in values) + '\n' for values in matrix
        )

    return matrix


# 4.利用字符相似性网络进行字符嵌入学习

In [63]:
# 根据字符相似性网络生成最终的字嵌入向量
def generate_char_vectors(chinese_characters, w2v_vectors, sim_mat, text, chinese_characters_count, threshold=0.6):
    """Build each character's embedding as a count-weighted mean over its similar group.

    For character ``i`` the similar group is every character ``j`` with
    ``sim_mat[i][j] >= threshold``.  The embedding is the average of the
    group's ``w2v_vectors`` entries weighted by ``chinese_characters_count``.

    Args:
        chinese_characters: sequence of characters, in the same order as the
            rows/columns of ``sim_mat``.
        w2v_vectors: mapping from character to its base vector.  Entries
            missing for a group member are requested via
            ``update(w2v_vectors, text, c)`` before use.
        sim_mat: square similarity matrix indexed like ``chinese_characters``.
        text: passed through unchanged to ``update``.
        chinese_characters_count: mapping from character to its weight.
        threshold: minimum similarity for a character to join a group.

    Returns:
        dict mapping each character to its embedding vector.

    Raises:
        IndexError: if there are characters to process but ``w2v_vectors``
            is empty, so the vector shape cannot be determined.
    """
    char_vectors = {}

    # The zero vector only depends on the shape/dtype of the stored vectors,
    # so derive it once instead of listing every key for every character.
    zero_vector = None
    if len(chinese_characters):
        first_key = next(iter(w2v_vectors.keys()), None)
        if first_key is None:
            raise IndexError("w2v_vectors is empty; cannot infer the embedding shape")
        zero_vector = np.zeros_like(w2v_vectors[first_key])

    for i in tqdm(range(len(chinese_characters)), desc='Generating char vectors'):
        character = chinese_characters[i]
        similar_group = [
            chinese_characters[j]
            for j, score in enumerate(sim_mat[i])
            if score >= threshold
        ]

        sum_count = 0
        emb = zero_vector.copy()
        for c in similar_group:
            if c not in w2v_vectors:
                update(w2v_vectors, text, c)
            emb += chinese_characters_count[c] * w2v_vectors[c]
            sum_count += chinese_characters_count[c]

        # An empty or zero-weight group leaves the embedding at zero.
        if sum_count:
            emb /= sum_count
        char_vectors[character] = emb

    return char_vectors


# 5.生成句子嵌入

In [64]:
# 根据字嵌入向量生成句子嵌入向量
def generate_sentence_vectors(texts, char_vectors, d=100):
    """Turn each text into one vector using self-attention over its characters.

    For a text with character embeddings ``E`` (one row per character):

    1. scores   ``alpha = E @ E.T / sqrt(d)``
    2. weights  ``alpha_hat`` = row-wise softmax of ``alpha``
    3. context  ``m_i = sum_j alpha_hat[i][j] * E[j]``
    4. result   ``sum_i m_i / d``

    Args:
        texts: iterable of texts; each text is a sequence of characters.
        char_vectors: mapping from character to its embedding of length ``d``.
        d: embedding dimension, used for the score scaling, the final
            division and the shape of the zero vector for empty texts.

    Returns:
        list with one vector per text.  An empty text yields a zero vector.

    Raises:
        KeyError: if a character of a text is missing from ``char_vectors``.
    """
    sentence_vectors = []
    scale = np.sqrt(d)

    for text in tqdm(texts, desc='Generating sentence vectors'):
        if len(text) == 0:
            sentence_vectors.append(np.zeros((d,)))
            continue

        embeddings = np.array([char_vectors[ch] for ch in text], dtype=np.float64)

        # Scaled dot-product scores between every pair of characters.
        alpha = embeddings @ embeddings.T / scale

        # Row-wise softmax.  The denominator is the sum of the exponentials
        # (not of the raw scores); subtracting the row maximum first leaves
        # the result unchanged but prevents overflow in exp().
        alpha -= alpha.max(axis=1, keepdims=True)
        alpha_hat = np.exp(alpha)
        alpha_hat /= alpha_hat.sum(axis=1, keepdims=True)

        # Attention-weighted context vector per character, summed over the text.
        m = (alpha_hat @ embeddings).sum(axis=0)

        # NOTE(review): the sum is divided by d rather than by len(text);
        # kept as in the original - confirm this is the intended scaling.
        sentence_vectors.append(m / d)

    return sentence_vectors


# 6.构建模型

In [65]:
# 垃圾文本分类
def spam_classification(train_tags, train_word_vectors, test_tags, test_word_vectors):
    """Fit a logistic-regression classifier and print its evaluation.

    The model is trained on ``train_word_vectors`` / ``train_tags``, used to
    predict labels for ``test_word_vectors``, and the confusion matrix and
    classification report against ``test_tags`` are printed.  Nothing is
    returned.
    """
    classifier = LogisticRegression()
    classifier.fit(np.array(train_word_vectors), np.array(train_tags))

    expected = np.array(test_tags)
    predicted = np.array(classifier.predict(test_word_vectors))

    # Confusion matrix, preceded by its heading.
    matrix = confusion_matrix(expected, predicted)
    print("混淆矩阵:", matrix, sep="\n")

    # Classification report, preceded by its heading.
    summary = classification_report(expected, predicted)
    print("分类报告:", summary, sep="\n")


In [None]:
spam