In [None]:
# 基于词典的中文分词实验

## 任务一：词典分词基础实现


In [None]:
# 基于词典的中文分词算法实现
# 包含正向最大匹配、逆向最大匹配和双向匹配算法

class DictionarySegmentation:
    """Holds the word list used by the dictionary-based segmentation algorithms.

    Attributes are plain data so the matching functions can read them directly:
    ``dictionary`` is the set of known words and ``max_word_length`` is the
    length (in characters) of the longest word added so far.
    """

    def __init__(self, dict_path=None):
        """
        Initialise the segmenter.

        Args:
            dict_path: Path to a dictionary file. When it is None (or any other
                falsy value) the built-in default dictionary is used instead.
        """
        self.dictionary = set()
        self.max_word_length = 0

        if dict_path:
            self.load_dictionary(dict_path)
        else:
            self.load_default_dictionary()

    def load_default_dictionary(self):
        """Add the built-in word list to the dictionary and report its size."""
        default_words = [
            "北京", "大学", "北京大学", "清华", "清华大学", "中国", "人民", "共和国",
            "中华", "中华人民共和国", "自然", "语言", "处理", "自然语言", "自然语言处理",
            "机器", "学习", "机器学习", "人工", "智能", "人工智能", "深度", "深度学习",
            "计算机", "科学", "计算机科学", "燕山", "燕山大学", "河北", "河北省",
            "教育部", "工业", "信息化", "信息化部", "工业和信息化部", "国家", "国防",
            "科技", "工业局", "国家国防科技工业局", "重点", "重点大学", "全国",
            "全国重点大学", "一流", "世界", "一流大学", "世界一流大学", "学科",
            "一流学科", "世界一流学科", "建设", "高校", "联盟", "成员", "张三"
        ]

        for word in default_words:
            self.dictionary.add(word)
            self.max_word_length = max(self.max_word_length, len(word))

        print(f"默认词典加载完成，共{len(self.dictionary)}个词，最大词长{self.max_word_length}")

    def load_dictionary(self, dict_path):
        """Load words from a text file, one entry per line.

        Only the first whitespace-separated column of each line is used as the
        word; any further columns are ignored. Blank or whitespace-only lines
        are skipped. If the file does not exist, a message is printed and the
        default dictionary is loaded instead.

        Args:
            dict_path: Path to the dictionary file (UTF-8, with or without BOM).
        """
        try:
            # 'utf-8-sig' decodes plain UTF-8 too, but drops a leading BOM so it
            # does not end up glued to the first word of the file.
            with open(dict_path, 'r', encoding='utf-8-sig') as f:
                for line in f:
                    columns = line.split()
                    if not columns:
                        # Blank line: indexing [0] here would raise IndexError.
                        continue
                    word = columns[0]  # first column is the word itself
                    self.dictionary.add(word)
                    self.max_word_length = max(self.max_word_length, len(word))
            print(f"词典{dict_path}加载完成，共{len(self.dictionary)}个词")
        except FileNotFoundError:
            print(f"词典文件{dict_path}不存在，使用默认词典")
            self.load_default_dictionary()

# Create the segmenter instance used by the test cell below; no dict_path is
# given, so the built-in default dictionary is loaded.
segmenter = DictionarySegmentation()
print("分词器初始化完成")


In [None]:
# 实现三种分词算法

def forward_max_matching(text, dictionary, max_word_length):
    """
    Forward maximum matching.

    Scans the text from left to right and, at each position, takes the longest
    slice (at most ``max_word_length`` characters) found in the dictionary.
    When no slice matches, the single character at that position is emitted.

    Args:
        text: Text to segment.
        dictionary: Container of known words (membership is tested with ``in``).
        max_word_length: Longest candidate length to try.
    Returns:
        List of tokens in text order.
    """
    tokens = []
    total = len(text)
    pos = 0

    while pos < total:
        window = min(max_word_length, total - pos)

        # Longest candidate first; the else-branch runs only if nothing matched.
        for size in range(window, 0, -1):
            piece = text[pos:pos + size]
            if piece in dictionary:
                break
        else:
            piece = text[pos]

        tokens.append(piece)
        pos += len(piece)

    return tokens

def backward_max_matching(text, dictionary, max_word_length):
    """
    Backward maximum matching.

    Scans the text from right to left and, at each position, takes the longest
    slice ending there (at most ``max_word_length`` characters) found in the
    dictionary. When no slice matches, the single character before the current
    position is emitted.

    Args:
        text: Text to segment.
        dictionary: Container of known words (membership is tested with ``in``).
        max_word_length: Longest candidate length to try.
    Returns:
        List of tokens in text order.
    """
    # Tokens are discovered last-to-first. Appending and reversing once at the
    # end avoids shifting the whole list on every insert at index 0.
    reversed_tokens = []
    end = len(text)

    while end > 0:
        window = min(max_word_length, end)

        # Longest candidate first; the else-branch runs only if nothing matched.
        for size in range(window, 0, -1):
            piece = text[end - size:end]
            if piece in dictionary:
                break
        else:
            piece = text[end - 1]

        reversed_tokens.append(piece)
        end -= len(piece)

    reversed_tokens.reverse()
    return reversed_tokens

def bidirectional_matching(text, dictionary, max_word_length):
    """
    Bidirectional matching.

    Runs both forward and backward maximum matching and keeps one result:
    the one with fewer tokens, or on a tie the one with fewer single-character
    tokens, or on a further tie the forward result.

    Args:
        text: Text to segment.
        dictionary: Container of known words.
        max_word_length: Longest candidate length to try.
    Returns:
        List of tokens in text order.
    """
    fwd = forward_max_matching(text, dictionary, max_word_length)
    bwd = backward_max_matching(text, dictionary, max_word_length)

    # Rank = (token count, single-character token count); lower is better.
    # Tuples compare lexicographically, which matches the rule order above.
    fwd_rank = (len(fwd), sum(len(tok) == 1 for tok in fwd))
    bwd_rank = (len(bwd), sum(len(tok) == 1 for tok in bwd))

    # Backward wins only when strictly better; every tie goes to forward.
    if bwd_rank < fwd_rank:
        return bwd
    return fwd

# Attach the three algorithms to the segmenter class as instance methods.
# Named functions are used instead of lambdas so that tracebacks and help()
# show a meaningful name and docstring for each method.
def _forward_max_matching_method(self, text):
    """Segment ``text`` by forward maximum matching with this instance's dictionary."""
    return forward_max_matching(text, self.dictionary, self.max_word_length)


def _backward_max_matching_method(self, text):
    """Segment ``text`` by backward maximum matching with this instance's dictionary."""
    return backward_max_matching(text, self.dictionary, self.max_word_length)


def _bidirectional_matching_method(self, text):
    """Segment ``text`` by bidirectional matching with this instance's dictionary."""
    return bidirectional_matching(text, self.dictionary, self.max_word_length)


DictionarySegmentation.forward_max_matching = _forward_max_matching_method
DictionarySegmentation.backward_max_matching = _backward_max_matching_method
DictionarySegmentation.bidirectional_matching = _bidirectional_matching_method

print("分词算法实现完成")


In [None]:
# Run the three segmentation algorithms on sample sentences and compare them.

test_texts = [
    "张三即将是自然语言处理方面的高手。",
    "燕山大学是河北省重点大学。",
    "北京大学是中国的一流大学。",
    "人工智能和机器学习是计算机科学的重要分支。"
]


def _print_segmentation(label, tokens):
    """Print one result line plus its token and single-character-token counts."""
    single_count = len([tok for tok in tokens if len(tok) == 1])
    print(f"{label}: {' / '.join(tokens)}")
    print(f"词数: {len(tokens)}, 单字词: {single_count}")


banner = "=" * 60
print(banner)
print("三种分词算法测试结果对比")
print(banner)

for case_no, text in enumerate(test_texts, start=1):
    print(f"\n【测试文本 {case_no}】: {text}")
    print("-" * 50)

    # Forward maximum matching
    forward_result = segmenter.forward_max_matching(text)
    _print_segmentation("正向最大匹配", forward_result)

    # Backward maximum matching
    backward_result = segmenter.backward_max_matching(text)
    _print_segmentation("逆向最大匹配", backward_result)

    # Bidirectional matching
    bi_result = segmenter.bidirectional_matching(text)
    _print_segmentation("双向匹配结果", bi_result)

    # Report whether the two single-direction passes agree
    if forward_result != backward_result:
        print("⚠ 正向和逆向匹配结果存在差异")
    else:
        print("✓ 正向和逆向匹配结果一致")

print("\n" + banner)
print("测试完成")
