# N-gram Language Modeling

In [None]:
from collections import defaultdict
import math
import os
print(os.getcwd())  # Show the current working directory
# Load the data
def load_data(filepath):
    """Read a UTF-8 text file into a list of token lists.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A list with one entry per line of the file. Each entry is the list
        of whitespace-separated tokens found on that line (an empty list
        for a blank line).
    """
    tokenized_lines = []
    with open(filepath, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            tokenized_lines.append(raw_line.strip().split())
    return tokenized_lines

In [4]:
# Build the unigram model
def build_unigram_model(data):
    """Estimate relative-frequency unigram probabilities.

    Args:
        data: Iterable of sentences, each an iterable of word tokens.

    Returns:
        A dict mapping every word seen in ``data`` to its number of
        occurrences divided by the total number of tokens. The dict is
        empty when ``data`` contains no tokens.
    """
    frequencies = {}
    for tokens in data:
        for token in tokens:
            frequencies[token] = frequencies.get(token, 0) + 1

    # Normalise the raw counts by the total number of tokens.
    corpus_size = sum(frequencies.values())
    return {token: freq / corpus_size for token, freq in frequencies.items()}

# Compute perplexity
def calculate_perplexity(model, data):
    """Return the perplexity of ``data`` under a unigram ``model``.

    Args:
        model: Dict mapping word -> probability.
        data: Iterable of sentences, each an iterable of word tokens.

    Returns:
        2 raised to the average negative log2 probability per token.

    Raises:
        ZeroDivisionError: If ``data`` contains no tokens.
    """
    # Words missing from the model use its "<UNK>" entry when there is one,
    # otherwise a fixed floor of 1e-6.
    fallback = model.get("<UNK>", 1e-6)

    negative_log_likelihood = 0
    token_count = 0
    for tokens in data:
        for token in tokens:
            negative_log_likelihood -= math.log2(model.get(token, fallback))
            token_count += 1
    return 2 ** (negative_log_likelihood / token_count)

# Run the unigram experiment: load both splits, fit on train, score train and dev.
train_data = load_data('HW2/A2-Data/1b_benchmark.train.tokens')
dev_data = load_data('HW2/A2-Data/1b_benchmark.dev.tokens')

unigram_model = build_unigram_model(train_data)
train_perplexity, dev_perplexity = (
    calculate_perplexity(unigram_model, split) for split in (train_data, dev_data)
)

print(f"Unigram Model - Train Perplexity: {train_perplexity}")
print(f"Unigram Model - Development Perplexity: {dev_perplexity}")


C:\Users\USER\Downloads\NLP-Courses\NLP201\Assignments
Unigram Model - Train Perplexity: 1696.1368477170352
Unigram Model - Development Perplexity: 1720.9324942060468


In [9]:
def build_bigram_model(data):
    """Estimate relative-frequency bigram probabilities P(w2 | w1).

    Args:
        data: Iterable of sentences, each a list of word tokens. Empty
            sentences (e.g. from blank input lines) are skipped.

    Returns:
        A nested dict ``{w1: {w2: prob}}`` with
        ``prob = count(w1, w2) / count(w1)``. ``count(w1)`` includes every
        occurrence of ``w1``, sentence-final ones too, so the probabilities
        stored for one ``w1`` can sum to less than 1.
    """
    bigram_counts = defaultdict(lambda: defaultdict(int))
    unigram_counts = defaultdict(int)
    for sentence in data:
        if not sentence:
            # An empty token list contributes no counts, and indexing its
            # last word would raise IndexError.
            continue
        for prev_word, word in zip(sentence, sentence[1:]):
            unigram_counts[prev_word] += 1
            bigram_counts[prev_word][word] += 1
        unigram_counts[sentence[-1]] += 1  # the last word is never a prev_word above
    # Turn the counts into conditional probabilities.
    bigram_prob = {w1: {w2: count / unigram_counts[w1] for w2, count in w2_dict.items()}
                   for w1, w2_dict in bigram_counts.items()}
    return bigram_prob
def calculate_bigram_perplexity(model, data):
    """Return the perplexity of ``data`` under a bigram ``model``.

    Only tokens that have a predecessor in the same sentence are scored, so
    the first token of every sentence is excluded from the average.

    Args:
        model: Nested dict ``{w1: {w2: P(w2 | w1)}}``.
        data: Iterable of sentences, each a list of word tokens.

    Returns:
        2 raised to the average negative log2 probability per scored token.

    Raises:
        ZeroDivisionError: If no sentence in ``data`` has at least two tokens.
    """
    # Floor used for bigrams absent from the model. It must be a number:
    # looking up "<UNK>" in the nested model would yield an inner dict (not a
    # probability) whenever "<UNK>" occurs as a context word.
    unseen_prob = 1e-6

    perplexity = 0
    total_words = 0
    for sentence in data:
        for i in range(1, len(sentence)):
            prob = model.get(sentence[i - 1], {}).get(sentence[i], unseen_prob)
            perplexity += -math.log2(prob)
            total_words += 1
    return 2 ** (perplexity / total_words)
    
# Fit the bigram model on the training split, then score both splits.
bigram_model = build_bigram_model(train_data)
bigram_train_perplexity, bigram_dev_perplexity = (
    calculate_bigram_perplexity(bigram_model, split) for split in (train_data, dev_data)
)

print(f"Bigram Model - Train Perplexity: {bigram_train_perplexity}")
print(f"Bigram Model - Development Perplexity: {bigram_dev_perplexity}")

Bigram Model - Train Perplexity: 82.29197494110008
Bigram Model - Development Perplexity: 1531.3391546889097


In [8]:
def build_trigram_model(data):
    """Estimate relative-frequency trigram probabilities P(w3 | w1, w2).

    Args:
        data: Iterable of sentences, each a list of word tokens. Sentences
            with fewer than two tokens are ignored.

    Returns:
        A nested dict ``{w1: {w2: {w3: prob}}}`` with
        ``prob = count(w1, w2, w3) / count(w1, w2)``. ``count(w1, w2)``
        includes the sentence-final bigram, which has no continuation.
    """
    context_totals = defaultdict(int)  # keyed by the (w1, w2) pair
    continuation_counts = {}           # w1 -> w2 -> w3 -> count

    # Tally trigram occurrences and the bigram contexts they condition on.
    for tokens in data:
        if len(tokens) < 2:
            continue  # too short to contain even one bigram
        for first, second, third in zip(tokens, tokens[1:], tokens[2:]):
            context_totals[(first, second)] += 1
            followers = continuation_counts.setdefault(first, {}).setdefault(second, {})
            followers[third] = followers.get(third, 0) + 1
        # The closing bigram of the sentence also counts as a context.
        context_totals[(tokens[-2], tokens[-1])] += 1

    # Convert counts to conditional probabilities.
    trigram_prob = {}
    for first, by_second in continuation_counts.items():
        trigram_prob[first] = {}
        for second, followers in by_second.items():
            denominator = context_totals[(first, second)]
            trigram_prob[first][second] = {
                third: hits / denominator for third, hits in followers.items()
            }
    return trigram_prob

    
def calculate_trigram_perplexity(model, data):
    """Return the perplexity of ``data`` under a trigram ``model``.

    Only tokens with two predecessors in the same sentence are scored, so
    the first two tokens of every sentence are excluded from the average.

    Args:
        model: Nested dict ``{w1: {w2: {w3: P(w3 | w1, w2)}}}``.
        data: Iterable of sentences, each a list of word tokens.

    Returns:
        2 raised to the average negative log2 probability per scored token.

    Raises:
        ZeroDivisionError: If no sentence in ``data`` has at least three tokens.
    """
    negative_log_likelihood = 0
    scored_tokens = 0
    for tokens in data:
        for first, second, third in zip(tokens, tokens[1:], tokens[2:]):
            # Trigrams absent from the model get a fixed floor of 1e-6.
            continuations = model.get(first, {}).get(second, {})
            negative_log_likelihood -= math.log2(continuations.get(third, 1e-6))
            scored_tokens += 1
    return 2 ** (negative_log_likelihood / scored_tokens)
    
# Fit the trigram model on the training split, then score both splits.
trigram_model = build_trigram_model(train_data)
trigram_train_perplexity, trigram_dev_perplexity = (
    calculate_trigram_perplexity(trigram_model, split) for split in (train_data, dev_data)
)

print(f"Trigram Model - Train Perplexity: {trigram_train_perplexity}")
print(f"Trigram Model - Development Perplexity: {trigram_dev_perplexity}")

Trigram Model - Train Perplexity: 5.53204747614129
Trigram Model - Development Perplexity: 42533.384050602035


# Smoothing with Linear Interpolation

In [11]:
# Perplexity under linear-interpolation smoothing
def calculate_interpolated_perplexity(unigram_model, bigram_model, trigram_model, data, lambda1, lambda2, lambda3):
    """Return the perplexity of ``data`` under an interpolated n-gram model.

    Each scored token gets the probability
    ``lambda1 * P_uni + lambda2 * P_bi + lambda3 * P_tri``. Only tokens with
    two predecessors in the same sentence are scored, so the first two
    tokens of every sentence are excluded from the average.

    Args:
        unigram_model: Dict ``{w: P(w)}``.
        bigram_model: Nested dict ``{w1: {w2: P(w2 | w1)}}``.
        trigram_model: Nested dict ``{w1: {w2: {w3: P(w3 | w1, w2)}}}``.
        data: Iterable of sentences, each a list of word tokens.
        lambda1: Weight of the unigram probability.
        lambda2: Weight of the bigram probability.
        lambda3: Weight of the trigram probability.

    Returns:
        2 raised to the average negative log2 interpolated probability.

    Raises:
        ZeroDivisionError: If no sentence in ``data`` has at least three tokens.
    """
    # Unknown words use the unigram model's "<UNK>" entry when present, else
    # 1e-6; unseen bigrams and trigrams always use 1e-6.
    unknown_word_prob = unigram_model.get("<UNK>", 1e-6)

    negative_log_likelihood = 0
    scored_tokens = 0
    for tokens in data:
        for first, second, third in zip(tokens, tokens[1:], tokens[2:]):
            p_uni = unigram_model.get(third, unknown_word_prob)
            p_bi = bigram_model.get(second, {}).get(third, 1e-6)
            p_tri = trigram_model.get(first, {}).get(second, {}).get(third, 1e-6)

            # Weighted mixture of the three estimates.
            mixed = lambda1 * p_uni + lambda2 * p_bi + lambda3 * p_tri
            negative_log_likelihood -= math.log2(mixed)
            scored_tokens += 1
    return 2 ** (negative_log_likelihood / scored_tokens)

# Candidate (λ1, λ2, λ3) weightings to try in the interpolation experiment.
lambda_combinations = [
    (0.3, 0.3, 0.4),  # combination specified by the assignment
    (0.1, 0.3, 0.6),
    (0.2, 0.4, 0.4),
    (0.5, 0.3, 0.2),
    (0.4, 0.4, 0.2)
]

# Track the lowest development perplexity and the weights that produced it.
best_dev_perplexity, best_lambda_combination = float('inf'), None

# Score every weighting on both splits.
for lambda1, lambda2, lambda3 in lambda_combinations:
    train_perplexity, dev_perplexity = (
        calculate_interpolated_perplexity(
            unigram_model, bigram_model, trigram_model, split, lambda1, lambda2, lambda3
        )
        for split in (train_data, dev_data)
    )
    print(f"Lambda values (λ1={lambda1}, λ2={lambda2}, λ3={lambda3}) - Train Perplexity: {train_perplexity}")
    print(f"Lambda values (λ1={lambda1}, λ2={lambda2}, λ3={lambda3}) - Development Perplexity: {dev_perplexity}")

    # Keep this weighting if it improves on the best development perplexity so far.
    if dev_perplexity < best_dev_perplexity:
        best_dev_perplexity, best_lambda_combination = dev_perplexity, (lambda1, lambda2, lambda3)

# Report the winning weighting.
print(f"Best Lambda values: λ1={best_lambda_combination[0]}, λ2={best_lambda_combination[1]}, λ3={best_lambda_combination[2]}")
print(f"Best Development Perplexity: {best_dev_perplexity}")

Lambda values (λ1=0.3, λ2=0.3, λ3=0.4) - Train Perplexity: 11.393113747399937
Lambda values (λ1=0.3, λ2=0.3, λ3=0.4) - Development Perplexity: 617.7934957112209
Lambda values (λ1=0.1, λ2=0.3, λ3=0.6) - Train Perplexity: 8.129643791859909
Lambda values (λ1=0.1, λ2=0.3, λ3=0.6) - Development Perplexity: 783.0750529916
Lambda values (λ1=0.2, λ2=0.4, λ3=0.4) - Train Perplexity: 11.007429969369507
Lambda values (λ1=0.2, λ2=0.4, λ3=0.4) - Development Perplexity: 623.2613123314627
Lambda values (λ1=0.5, λ2=0.3, λ3=0.2) - Train Perplexity: 19.445779031405287
Lambda values (λ1=0.5, λ2=0.3, λ3=0.2) - Development Perplexity: 580.2746066391761
Lambda values (λ1=0.4, λ2=0.4, λ3=0.2) - Train Perplexity: 18.47001454471425
Lambda values (λ1=0.4, λ2=0.4, λ3=0.2) - Development Perplexity: 560.8472677479118
Best Lambda values: λ1=0.4, λ2=0.4, λ3=0.2
Best Development Perplexity: 560.8472677479118


# Experiments with GPT-3