### 导入库

In [1]:
import re
from collections import defaultdict

### 读取文本文件

划分测试集和训练集

In [2]:
#读取盖茨比文本(gaicibi.txt)
text = open('gaicibi.txt', 'r', encoding='utf-8').read()

#将完整文本划分为训练集和测试集，80%作为训练集，20%作为测试集
train_size = int(len(text) * 0.8)
train_text = text[:train_size]
test_text = text[train_size:]

### 文本预处理

对输入的文本进行预处理，包括替换换行符、删除多余空白符、基于标点符号分割句子。你不需要为每一个句子添加起始和终止符号。返回一个句子列表。

经过文本预处理的句子列表示例：["了不起的盖茨比", "作者弗司各特菲茨杰拉德", "美国中西部城市卡罗威世家的后裔尼克厌倦了中西部的生活"...]

输出示例：
```
Sentence 1: 了不起的盖茨比
Sentence 2: 作者弗司各特菲茨杰拉德
Sentence 3: 美国中西部城市卡罗威世家的后裔尼克厌倦了中西部的生活
Sentence 4: 到纽约当证券交易人并在市郊长岛西卵区租了一套小屋
Sentence 5: 他的邻居便是豪华的盖茨比公馆
Sentence 6: 小海湾对面的东卵区宫殿式的大厦住着从芝加哥搬来的汤姆和黛茜夫妇
Sentence 7: 黛茜是尼克的远房表妹
Sentence 8: 汤姆是他大学里的同学
Sentence 9: 家里很有钱
Sentence 10: 他性情暴戾盛气凌人
...
```

In [3]:

def preprocess_text(text):
    #替换换行符为空格
    text = text.replace('\n', ' ')
    #删除多余空白符
    text = re.sub(r'\s+', ' ', text)
    #删除标点符号，只保留逗号和句号
    text = re.sub(r'[^\w\s，。？！]', '', text)
    #基于逗号、句号、空格进行分句
    sentences = re.split(r'[，。？！ ]', text)
    #去除空句子
    sentences = [s for s in sentences if s]
    return sentences

corpus = preprocess_text(text)


### unigram

实现unigram算法，返回每个词的概率分布

输出示例：
```
{'了': 0.017678317144610257,
 '不': 0.012850643111013348,
 '起': 0.0032647579795547425,
 '的': 0.043576415017886706,
 '盖': 0.004086736052421362,
 '茨': 0.003889924401171608,
 '比': 0.004434050731097399,
 '作': 0.0009377496324252984,
 '者': 0.0007062065133079408,
 '弗': 0.00015050302742628246,
 '司': 0.00037046899058777223,
 '各': 0.0003820461465436401,
 '特': 0.0012156013753661275,
 '菲': 0.00021996596316148974,
 '杰': 0.00041677761441124374,
 '拉': 0.0007062065133079408,
 '德': 0.0006946293573520728,
 '美': 0.0010419440360281093,
 '国': 0.0010650983479398451,
 '中': 0.003021637704481517,
 '西': 0.003889924401171608,
 '部': 0.0009030181645576947,
 '城': 0.000683052201396205,
 '市': 0.0002431202750732255,
 '卡': 0.00027785174294082916,
...
 '贷': 1.1577155955867882e-05,
 '资': 4.630862382347153e-05,
 '籍': 1.1577155955867882e-05,
 '烫': 4.630862382347153e-05,
 ...}
```

In [4]:
def calculate_unigram_probabilities(corpus):
    #计算单字的概率
    unigram_counts = defaultdict(int)
    total_unigrams = 0
    for sentence in corpus:
        for char in sentence:
            unigram_counts[char] += 1
            total_unigrams += 1
    unigram_probabilities = {char: count / total_unigrams for char, count in unigram_counts.items()}
    print(unigram_probabilities)
    return unigram_probabilities

calculate_unigram_probabilities(corpus)


{'了': 0.01756888914456653, '不': 0.012771098199390209, '起': 0.0032445492722775124, '的': 0.04330667893919347, '盖': 0.004061439337283553, '茨': 0.003865845941437036, '比': 0.004406604153483288, '作': 0.0009319450037392855, '者': 0.0007018351262727952, '弗': 0.00014957142035321866, '司': 0.0003681758039463844, '各': 0.00037968129781970893, '特': 0.0012080768566990739, '菲': 0.00021860438359316573, '杰': 0.00041419777943968244, '拉': 0.0007018351262727952, '德': 0.0006903296323994707, '美': 0.001035494448599206, '国': 0.0010585054363458552, '中': 0.003002933900937698, '西': 0.003865845941437036, '部': 0.000897428522119312, '城': 0.0006788241385261462, '市': 0.00024161537133981476, '卡': 0.0002761318529597883, '罗': 0.0006097911752861991, '威': 0.001392164758672266, '世': 0.0007823735833860668, '家': 0.0029799229131910487, '后': 0.004303054708623368, '裔': 2.3010987746649024e-05, '尼': 0.0006903296323994707, '克': 0.0017258240809986769, '厌': 0.0001840879019731922, '倦': 4.602197549329805e-05, '生': 0.002554219639878042, 

{'了': 0.01756888914456653,
 '不': 0.012771098199390209,
 '起': 0.0032445492722775124,
 '的': 0.04330667893919347,
 '盖': 0.004061439337283553,
 '茨': 0.003865845941437036,
 '比': 0.004406604153483288,
 '作': 0.0009319450037392855,
 '者': 0.0007018351262727952,
 '弗': 0.00014957142035321866,
 '司': 0.0003681758039463844,
 '各': 0.00037968129781970893,
 '特': 0.0012080768566990739,
 '菲': 0.00021860438359316573,
 '杰': 0.00041419777943968244,
 '拉': 0.0007018351262727952,
 '德': 0.0006903296323994707,
 '美': 0.001035494448599206,
 '国': 0.0010585054363458552,
 '中': 0.003002933900937698,
 '西': 0.003865845941437036,
 '部': 0.000897428522119312,
 '城': 0.0006788241385261462,
 '市': 0.00024161537133981476,
 '卡': 0.0002761318529597883,
 '罗': 0.0006097911752861991,
 '威': 0.001392164758672266,
 '世': 0.0007823735833860668,
 '家': 0.0029799229131910487,
 '后': 0.004303054708623368,
 '裔': 2.3010987746649024e-05,
 '尼': 0.0006903296323994707,
 '克': 0.0017258240809986769,
 '厌': 0.0001840879019731922,
 '倦': 4.60219754932980

### bigram

实现bigram算法，返回每个词的概率分

输出示例：
```
{'了不': 0.015194681861348529,
 '不起': 0.026268115942028984,
 '起的': 0.06666666666666667,
 '的盖': 0.004352872896111433,
 '盖茨': 0.9261363636363636,
 '茨比': 0.9702380952380952,
 '作者': 0.11428571428571428,
 '者弗': 0.018518518518518517,
 '弗司': 0.08333333333333333,
 '司各': 0.17857142857142858,
 '各特': 0.12121212121212122,
 '特菲': 0.029411764705882353,
 '菲茨': 0.47368421052631576,
 '茨杰': 0.026785714285714284,
 '杰拉': 0.2571428571428571,
 '拉德': 0.1694915254237288,
 '美国': 0.524390243902439,
 '国中': 0.02197802197802198,
 '中西': 0.04329004329004329,
 '西部': 0.07017543859649122,
 '部城': 0.02631578947368421,
 '城市': 0.23529411764705882,
 '市卡': 0.05263157894736842,
 '卡罗': 0.6521739130434783,
 '罗威': 0.28888888888888886,
...
 '充当': 0.041666666666666664,
 '当了': 0.0136986301369863,
 '了盖': 0.011396011396011397,
 '比和': 0.019943019943019943,
 ...}
```

In [11]:
def calculate_bigram_probabilities(corpus):
    """
    计算bigram概率分布
    P(w_i | w_{i-1}) = count(w_{i-1}, w_i) / count(w_{i-1})
    """
    # 统计bigram出现次数
    bigram_counts = defaultdict(int)
    # 统计每个词作为第一个词的出现次数
    unigram_counts = defaultdict(int)
    
    for sentence in corpus:
        # 将句子转换为字符列表
        chars = list(sentence)
        for i in range(len(chars) - 1):
            # 当前词和下一个词组成bigram
            current_char = chars[i]
            next_char = chars[i + 1]
            bigram = (current_char, next_char)
            
            bigram_counts[bigram] += 1
            unigram_counts[current_char] += 1
    
    # 计算bigram条件概率
    bigram_probabilities = {}
    for (current_char, next_char), count in bigram_counts.items():
        probability = count / unigram_counts[current_char]
        bigram_probabilities[current_char + next_char] = probability
    
    return bigram_probabilities

calculate_bigram_probabilities(corpus)

{'了不': 0.015194681861348529,
 '不起': 0.026268115942028984,
 '起的': 0.06666666666666667,
 '的盖': 0.004351610095735422,
 '盖茨': 0.9261363636363636,
 '茨比': 0.9702380952380952,
 '作者': 0.11428571428571428,
 '者弗': 0.018518518518518517,
 '弗司': 0.08333333333333333,
 '司各': 0.17857142857142858,
 '各特': 0.12121212121212122,
 '特菲': 0.02912621359223301,
 '菲茨': 0.47368421052631576,
 '茨杰': 0.026785714285714284,
 '杰拉': 0.2571428571428571,
 '拉德': 0.1694915254237288,
 '美国': 0.524390243902439,
 '国中': 0.02197802197802198,
 '中西': 0.04329004329004329,
 '西部': 0.07017543859649122,
 '部城': 0.02631578947368421,
 '城市': 0.2222222222222222,
 '市卡': 0.05263157894736842,
 '卡罗': 0.6521739130434783,
 '罗威': 0.28888888888888886,
 '威世': 0.008620689655172414,
 '世家': 0.029850746268656716,
 '家的': 0.08296943231441048,
 '的后': 0.0017406440382941688,
 '后裔': 0.006329113924050633,
 '裔尼': 1.0,
 '尼克': 0.7,
 '克厌': 0.008,
 '厌倦': 0.13333333333333333,
 '倦了': 0.25,
 '了中': 0.002849002849002849,
 '部的': 0.07894736842105263,
 '的生': 0.0046417174354

### trigram

实现trigram算法，返回每个词的概率分布

输出示例：
```
{'了不起': 0.5625,
 '不起的': 0.5294117647058824,
 '起的盖': 0.5333333333333333,
 '的盖茨': 0.8666666666666667,
 '盖茨比': 1.0,
 '作者弗': 0.125,
 '者弗司': 1.0,
 '弗司各': 1.0,
 '司各特': 0.8,
 '各特菲': 0.75,
 '特菲茨': 1.0,
 '菲茨杰': 1.0,
 '茨杰拉': 1.0,
 '杰拉德': 1.0,
 '美国中': 0.046511627906976744,
 '国中西': 1.0,
 '中西部': 1.0,
 '西部城': 0.10526315789473684,
 '部城市': 1.0,
 '城市卡': 0.1,
 '市卡罗': 1.0,
 '卡罗威': 0.8666666666666667,
 '罗威世': 0.1111111111111111,
 '威世家': 1.0,
 '世家的': 1.0,
...
 '性在许': 1.0,
 '在许多': 1.0,
 '许多方': 0.038461538461538464,
 '多方面': 1.0,
 ...}
```

In [12]:
def calculate_trigram_probabilities(corpus):

    trigram_counts = defaultdict(int)
    bigram_counts = defaultdict(int)
    
    for sentence in corpus:
        chars = list(sentence)
        for i in range(len(chars) - 1):
            # 统计bigram
            bigram = chars[i] + chars[i + 1]
            bigram_counts[bigram] += 1
            
            # 统计trigram
            if i < len(chars) - 2:
                trigram = chars[i] + chars[i + 1] + chars[i + 2]
                trigram_counts[trigram] += 1
    
    # 计算概率
    trigram_probabilities = {}
    for trigram, count in trigram_counts.items():
        bigram = trigram[:2]
        if bigram_counts[bigram] > 0:
            probability = count / bigram_counts[bigram]
            trigram_probabilities[trigram] = probability
    
    return trigram_probabilities

calculate_trigram_probabilities(corpus)


{'了不起': 0.5625,
 '不起的': 0.3103448275862069,
 '起的盖': 0.47058823529411764,
 '的盖茨': 0.8666666666666667,
 '盖茨比': 1.0,
 '作者弗': 0.125,
 '者弗司': 1.0,
 '弗司各': 1.0,
 '司各特': 0.8,
 '各特菲': 0.75,
 '特菲茨': 1.0,
 '菲茨杰': 1.0,
 '茨杰拉': 1.0,
 '杰拉德': 1.0,
 '美国中': 0.046511627906976744,
 '国中西': 1.0,
 '中西部': 1.0,
 '西部城': 0.1,
 '部城市': 1.0,
 '城市卡': 0.08333333333333333,
 '市卡罗': 1.0,
 '卡罗威': 0.8666666666666667,
 '罗威世': 0.07692307692307693,
 '威世家': 1.0,
 '世家的': 0.5,
 '家的后': 0.05263157894736842,
 '的后裔': 0.3333333333333333,
 '后裔尼': 0.5,
 '裔尼克': 1.0,
 '尼克厌': 0.023809523809523808,
 '克厌倦': 1.0,
 '厌倦了': 0.5,
 '倦了中': 1.0,
 '了中西': 0.3333333333333333,
 '西部的': 0.2,
 '部的生': 0.3333333333333333,
 '的生活': 0.5625,
 '到纽约': 1.0,
 '纽约当': 0.02,
 '约当证': 1.0,
 '当证券': 1.0,
 '证券交': 0.25,
 '券交易': 1.0,
 '交易人': 0.3333333333333333,
 '易人并': 1.0,
 '人并在': 0.25,
 '并在市': 1.0,
 '在市郊': 1.0,
 '市郊长': 1.0,
 '郊长岛': 1.0,
 '长岛西': 0.13333333333333333,
 '岛西卵': 1.0,
 '西卵区': 0.038461538461538464,
 '卵区租': 0.5,
 '区租了': 1.0,
 '租了一': 0.5,
 '了一套': 0.0078125,
 '一套小

### 计算句子概率

1. 使用上述计算得出的unigram, bigram, trigram结果，计算给定句子在unigram、bigram和trigram模型下的对数似然值。

输出格式:
```
Unigram Negative Log-Likelihood: -16.467815940872697
Bigram Negative Log-Likelihood: -0.10694757282788604
Trigram Negative Log-Likelihood: 0.0
```

2. 处理包含多个句子的txt文件，计算每个句子的unigram, bigram, 和trigram负对数似然值，并将结果写入到result.txt

In [None]:
import math
def calculate_sentence_neg_log_likelihood(sentence, unigram_probabilities, bigram_probabilities, trigram_probabilities):
    """
    计算句子在unigram、bigram和trigram模型下的负对数似然值
    
    Args:
        sentence: 输入句子
        unigram_probabilities: unigram概率字典
        bigram_probabilities: bigram概率字典  
        trigram_probabilities: trigram概率字典
    
    Returns:
        tuple: (unigram_nll, bigram_nll, trigram_nll)
    """
    # 处理空句子
    if not sentence:
        return 0.0, 0.0, 0.0
    
    # 获取词汇表大小用于平滑
    vocab_size = len(unigram_probabilities)
    smoothing_factor = 1e-10  # 小的平滑因子，避免log(0)
    
    # Unigram负对数似然
    unigram_log_prob = 0.0
    for char in sentence:
        # 使用平滑处理未出现的字符
        prob = unigram_probabilities.get(char, smoothing_factor)
        unigram_log_prob += math.log(prob)
    unigram_nll = -unigram_log_prob
    
    # Bigram负对数似然
    bigram_log_prob = 0.0
    if len(sentence) >= 2:
        # 第一个字符的概率
        first_char_prob = unigram_probabilities.get(sentence[0], smoothing_factor)
        bigram_log_prob += math.log(first_char_prob)
        
        # 后续字符的条件概率
        for i in range(1, len(sentence)):
            bigram = sentence[i-1] + sentence[i]
            prob = bigram_probabilities.get(bigram, smoothing_factor)
            bigram_log_prob += math.log(prob)
    else:
        # 单字符句子
        prob = unigram_probabilities.get(sentence[0], smoothing_factor)
        bigram_log_prob = math.log(prob)
    
    bigram_nll = -bigram_log_prob
    
    # Trigram负对数似然
    trigram_log_prob = 0.0
    if len(sentence) >= 3:
        # 第一个字符的概率
        first_char_prob = unigram_probabilities.get(sentence[0], smoothing_factor)
        trigram_log_prob += math.log(first_char_prob)
        
        # 第二个字符的条件概率
        if len(sentence) >= 2:
            bigram = sentence[0] + sentence[1]
            second_char_prob = bigram_probabilities.get(bigram, smoothing_factor)
            trigram_log_prob += math.log(second_char_prob)
        
        # 后续字符的三元条件概率
        for i in range(2, len(sentence)):
            trigram = sentence[i-2] + sentence[i-1] + sentence[i]
            prob = trigram_probabilities.get(trigram, smoothing_factor)
            trigram_log_prob += math.log(prob)
    else:
        # 处理短句子
        if len(sentence) == 1:
            prob = unigram_probabilities.get(sentence[0], smoothing_factor)
            trigram_log_prob = math.log(prob)
        elif len(sentence) == 2:
            first_char_prob = unigram_probabilities.get(sentence[0], smoothing_factor)
            trigram_log_prob += math.log(first_char_prob)
            bigram = sentence[0] + sentence[1]
            second_char_prob = bigram_probabilities.get(bigram, smoothing_factor)
            trigram_log_prob += math.log(second_char_prob)
    
    trigram_nll = -trigram_log_prob
    
    return unigram_nll, bigram_nll, trigram_nll


# 示例句子
sentence = "盖茨比"

# 计算句子的对数似然值
unigram_nll, bigram_nll, trigram_nll = calculate_sentence_neg_log_likelihood(sentence, unigram_probabilities, bigram_probabilities, trigram_probabilities)

print(f"Unigram Negative Log-Likelihood: {unigram_nll}")
print(f"Bigram Negative Log-Likelihood: {bigram_nll}")
print(f"Trigram Negative Log-Likelihood: {trigram_nll}")

def process_sentences_file(filepath, unigram_probabilities, bigram_probabilities, trigram_probabilities, output_filepath):
    """
    处理包含多个句子的txt文件，计算每个句子的负对数似然值
    
    Args:
        filepath: 输入文件路径
        unigram_probabilities: unigram概率字典
        bigram_probabilities: bigram概率字典
        trigram_probabilities: trigram概率字典
        output_filepath: 输出文件路径
    """
    try:
        # 读取输入文件
        with open(filepath, 'r', encoding='utf-8') as f:
            sentences = [line.strip() for line in f if line.strip()]
        
        # 计算每个句子的负对数似然值
        results = []
        for i, sentence in enumerate(sentences, 1):
            unigram_nll, bigram_nll, trigram_nll = calculate_sentence_neg_log_likelihood(
                sentence, unigram_probabilities, bigram_probabilities, trigram_probabilities
            )
            
            result_line = f"句子 {i}: '{sentence}'\n"
            result_line += f"Unigram Negative Log-Likelihood: {unigram_nll}\n"
            result_line += f"Bigram Negative Log-Likelihood: {bigram_nll}\n"
            result_line += f"Trigram Negative Log-Likelihood: {trigram_nll}\n"
            result_line += "-" * 50 + "\n"
            
            results.append(result_line)
            print(result_line)
        
        # 写入输出文件
        with open(output_filepath, 'w', encoding='utf-8') as f:
            f.write("句子概率计算结果\n")
            f.write("=" * 50 + "\n\n")
            f.writelines(results)
        
        print(f"结果已保存到: {output_filepath}")
        
    except FileNotFoundError:
        print(f"错误: 找不到文件 {filepath}")
    except Exception as e:
        print(f"处理文件时出错: {e}")

计算平滑后的概率

In [None]:
#计算平滑后的概率
def _calculate_ngram_probabilities(corpus, n, alpha=1.0):
    """
    通用的N-gram概率计算函数，支持加法平滑。

    Args:
        corpus (list of str): 文本语料库。
        n (int): N-gram的阶数 (1 for unigram, 2 for bigram, 3 for trigram)。
        alpha (float): 平滑参数。默认为1.0 (拉普拉斯平滑)。

    Returns:
        dict: N-gram的概率分布字典。
    """
    if not corpus:
        return {}

    # 1. 构建词汇表
    vocab = set()
    for sentence in corpus:
        for char in sentence:
            vocab.add(char)
    V = len(vocab) # 词汇表大小

    # 2. 统计N-gram和(N-1)-gram的计数
    ngram_counts = defaultdict(int)
    context_counts = defaultdict(int)
    
    total_words = 0 # 用于unigram计算

    for sentence in corpus:
        chars = list(sentence)
        total_words += len(chars)
        
        # 统计(N-1)-gram (上下文)的计数
        for i in range(len(chars) - n + 2):
            context = tuple(chars[i : i + n - 1])
            if context: # 确保上下文不为空
                context_counts[context] += 1
        
        # 统计N-gram的计数
        for i in range(len(chars) - n + 1):
            ngram = tuple(chars[i : i + n])
            ngram_counts[ngram] += 1
            
    # 3. 计算平滑后的概率
    probabilities = {}
    for ngram, count in ngram_counts.items():
        context = ngram[:-1]
        
        # 获取上下文计数
        if n == 1: # Unigram的上下文是总词数
            context_count = total_words
        else:
            context_count = context_counts.get(context, 0)
            
        # 应用加法平滑公式
        probability = (count + alpha) / (context_count + alpha * V)
        
        # 将元组转换为字符串作为键，方便使用
        probabilities[''.join(ngram)] = probability
        
    return probabilities

计算perplexity

In [None]:
#计算perplexity
import numpy as np
def perplexity(model, test_data, vocab_size):
    total_log_prob = 0
    total_words = 0
    for sentence in test_data:
        log_prob = model.log_prob(sentence)
        total_log_prob += log_prob
        total_words += len(sentence)
    perplexity = np.exp(-total_log_prob / total_words)
    return perplexity