In [5]:
from datasets import load_dataset
from collections import Counter
import re
# 加載 Penn Treebank 資料集，允許執行自定義程式碼
ptb = load_dataset('ptb_text_only', split=['train', 'validation', 'test'], trust_remote_code=True)

# 查看資料集的結構
print(ptb)


[Dataset({
    features: ['sentence'],
    num_rows: 42068
}), Dataset({
    features: ['sentence'],
    num_rows: 3370
}), Dataset({
    features: ['sentence'],
    num_rows: 3761
})]


In [6]:
# 1. 建立詞彙表
def build_vocab(sentences, min_freq=2):
    # 計算詞頻
    counter = Counter()
    for sentence in sentences:
        tokens = re.findall(r'\w+', sentence.lower())  # 將句子轉換為小寫並用正則表達式分詞
        counter.update(tokens)
    
    # 建立詞彙表，僅保留頻率大於等於 min_freq 的詞
    vocab = {word: idx for idx, (word, count) in enumerate(counter.items(), start=2) if count >= min_freq}
    
    # 添加特殊符號 (e.g., PAD for padding, UNK for unknown tokens)
    vocab['<PAD>'] = 0
    vocab['<UNK>'] = 1
    
    return vocab

# 2. 將句子格式化為詞彙索引
def format_sentences(sentences, vocab):
    formatted_sentences = []
    for sentence in sentences:
        tokens = re.findall(r'\w+', sentence.lower())
        # 使用詞彙表將詞轉換為索引
        indexed_sentence = [vocab.get(token, vocab['<UNK>']) for token in tokens]
        formatted_sentences.append(indexed_sentence)
    return formatted_sentences

# 將資料集中的句子提取出來
train_sentences = [example['sentence'] for example in ptb[0]]
validation_sentences = [example['sentence'] for example in ptb[1]]
test_sentences = [example['sentence'] for example in ptb[2]]

# 建立詞彙表 (基於訓練集)
vocab = build_vocab(train_sentences)

# 格式化句子
train_data = format_sentences(train_sentences, vocab)
validation_data = format_sentences(validation_sentences, vocab)
test_data = format_sentences(test_sentences, vocab)

# 確認處理後的數據
print(f"Vocabulary size: {len(vocab)}")
print(f"First 3 training examples (as word indices): {train_data[:3]}")

Vocabulary size: 9616
First 3 training examples (as word indices): [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 24, 1, 1, 1], [28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 30], [42, 29, 43, 44, 45, 29, 30, 46, 35, 47, 48, 49]]


In [7]:
# 打印前五筆資料的句子及其對應的詞彙索引
for i in range(5):
    sentence = train_sentences[i]
    indexed_sentence = train_data[i]
    print(f"原始句子 {i+1}: {sentence}")
    print(f"對應的詞彙索引: {indexed_sentence}\n")


原始句子 1: aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter
對應的詞彙索引: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 24, 1, 1, 1]

原始句子 2: pierre <unk> N years old will join the board as a nonexecutive director nov. N
對應的詞彙索引: [28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 30]

原始句子 3: mr. <unk> is chairman of <unk> n.v. the dutch publishing group
對應的詞彙索引: [42, 29, 43, 44, 45, 29, 30, 46, 35, 47, 48, 49]

原始句子 4: rudolph <unk> N years old and former chairman of consolidated gold fields plc was named a nonexecutive director of this british industrial conglomerate
對應的詞彙索引: [50, 29, 30, 31, 32, 51, 52, 44, 45, 53, 54, 55, 56, 57, 58, 38, 39, 40, 45, 59, 60, 61, 62]

原始句子 5: a form of asbestos once used to make kent cigarette filters has caused a high percentage of cancer deaths among a group of workers exposed to it more than N years ago 

In [10]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence

# 定義新的填充函數
def collate_fn(batch):
    # 將每個數據點的句子解包，因為 DataLoader 傳遞過來的是一個列表，列表中的每個元素是 TensorDataset 中的一個元組
    sentences = [item[0] for item in batch]
    # 使用 pad_sequence 將句子填充為相同長度
    padded_batch = pad_sequence(sentences, batch_first=True, padding_value=vocab['<PAD>'])
    return padded_batch

# 重新定義 DataLoader，這次使用修正的 collate_fn
batch_size = 32

# 使用 TensorDataset 包裝數據
train_dataset = TensorDataset(torch.nn.utils.rnn.pad_sequence(train_tensors, batch_first=True, padding_value=vocab['<PAD>']))
validation_dataset = TensorDataset(torch.nn.utils.rnn.pad_sequence(validation_tensors, batch_first=True, padding_value=vocab['<PAD>']))
test_dataset = TensorDataset(torch.nn.utils.rnn.pad_sequence(test_tensors, batch_first=True, padding_value=vocab['<PAD>']))

# 定義 DataLoader
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# 打印一個批次來確認數據格式
for batch in train_loader:
    print(f"Batch size: {batch.shape}")
    print(f"First batch (as word indices): {batch}")
    break


Batch size: torch.Size([32, 98])
First batch (as word indices): tensor([[8684, 8685, 3001,  ...,    0,    0,    0],
        [ 285,  252,   42,  ...,    0,    0,    0],
        [  38, 3130,  172,  ...,    0,    0,    0],
        ...,
        [  35, 9629, 2818,  ...,    0,    0,    0],
        [1327,   90,  553,  ...,    0,    0,    0],
        [  35,  192,  704,  ...,    0,    0,    0]])


In [17]:
!nvidia-smi


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [18]:
device = torch.device("cpu")
model.to(device)

LanguageModel(
  (embedding): Embedding(9616, 128)
  (lstm): LSTM(128, 256, num_layers=2, batch_first=True)
  (fc): Linear(in_features=256, out_features=9616, bias=True)
)

In [19]:
import torch
import torch.nn as nn
import torch.optim as optim

# 1. 定義語言模型架構
class LanguageModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(LanguageModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden):
        # 嵌入層
        x = self.embedding(x)
        # LSTM 層
        out, hidden = self.lstm(x, hidden)
        # 全連接層
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size):
        # 初始化 LSTM 隱藏狀態和細胞狀態
        weight = next(self.parameters()).data
        return (weight.new_zeros(self.lstm.num_layers, batch_size, self.lstm.hidden_size).to(device),
                weight.new_zeros(self.lstm.num_layers, batch_size, self.lstm.hidden_size).to(device))

# 2. 定義模型超參數
vocab_size = len(vocab)
embed_size = 128
hidden_size = 256
num_layers = 2
learning_rate = 0.001
num_epochs = 10

# 3. 建立模型、損失函數和優化器
model = LanguageModel(vocab_size, embed_size, hidden_size, num_layers)

# 移動模型到設備之前，檢查所有參數是否正常
for name, param in model.named_parameters():
    if param.requires_grad:
        if torch.isnan(param).any():
            raise ValueError(f"Parameter {name} contains NaN values")
        if torch.isinf(param).any():
            raise ValueError(f"Parameter {name} contains infinite values")

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

device = torch.device("cpu")
model.to(device)

# 檢查詞彙索引範圍
def check_vocab_range(data, vocab_size):
    for sentence in data:
        for idx in sentence:
            if idx >= vocab_size or idx < 0:
                print(f"Out of range index found: {idx}")
                return False
    return True

assert check_vocab_range(train_data, vocab_size), "Train data contains out of range indices!"
assert check_vocab_range(validation_data, vocab_size), "Validation data contains out of range indices!"
assert check_vocab_range(test_data, vocab_size), "Test data contains out of range indices!"

for epoch in range(num_epochs):
    model.train()
    
    for batch_idx, batch in enumerate(train_loader):
        # 確保批次數據和隱藏狀態位於正確設備上
        batch = batch.to(device)
        batch_size = batch.size(0)
        hidden = model.init_hidden(batch_size)

        optimizer.zero_grad()

        # 前向傳播
        hidden = tuple([each.to(device) for each in hidden])
        output, hidden = model(batch, hidden)

        # 調整 output 的形狀，使其適合 CrossEntropyLoss 的輸入
        output = output.view(-1, vocab_size)
        batch = batch.view(-1)

        # 檢查 batch 是否在正確的範圍內且為 LongTensor
        assert batch.dtype == torch.long, "Batch tensor must be of type LongTensor"
        assert torch.max(batch) < vocab_size, "Batch contains out-of-range indices"

        # 計算損失
        loss = criterion(output, batch)
        loss.backward()
        optimizer.step()

        if batch_idx % 100 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}")

# 5. 模型驗證
model.eval()
total_loss = 0

with torch.no_grad():
    for batch_idx, batch in enumerate(validation_loader):
        # 確保批次數據和隱藏狀態位於正確設備上
        batch = batch.to(device)
        batch_size = batch.size(0)
        hidden = model.init_hidden(batch_size)

        hidden = tuple([each.to(device) for each in hidden])
        output, hidden = model(batch, hidden)

        # 調整 output 的形狀，使其適合 CrossEntropyLoss 的輸入
        output = output.view(-1, vocab_size)
        batch = batch.view(-1)

        # 檢查 batch 是否在正確的範圍內且為 LongTensor
        assert batch.dtype == torch.long, "Batch tensor must be of type LongTensor"
        assert torch.max(batch) < vocab_size, "Batch contains out-of-range indices"

        # 計算損失
        loss = criterion(output, batch)
        total_loss += loss.item()

avg_loss = total_loss / len(validation_loader)
print(f"Validation Loss: {avg_loss:.4f}")


Out of range index found: 9616


AssertionError: Train data contains out of range indices!

In [23]:
from datasets import load_dataset
from collections import Counter
import re
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence

# 加載 Penn Treebank 資料集
ptb = load_dataset('ptb_text_only', split=['train', 'validation', 'test'], trust_remote_code=True)

# 1. 建立詞彙表
def build_vocab(sentences, min_freq=2):
    # 計算詞頻
    counter = Counter()
    for sentence in sentences:
        tokens = re.findall(r'\w+', sentence.lower())  # 將句子轉換為小寫並用正則表達式分詞
        counter.update(tokens)
    
    # 添加特殊符號 (e.g., PAD for padding, UNK for unknown tokens)
    vocab = {'<PAD>': 0, '<UNK>': 1}
    
    # 建立詞彙表，僅保留頻率大於等於 min_freq 的詞，從索引 2 開始
    for idx, (word, count) in enumerate(counter.items(), start=2):
        if count >= min_freq:
            vocab[word] = idx
            
    return vocab

# 2. 將句子格式化為詞彙索引
def format_sentences(sentences, vocab):
    formatted_sentences = []
    for sentence in sentences:
        tokens = re.findall(r'\w+', sentence.lower())
        # 使用詞彙表將詞轉換為索引，如果詞不在詞彙表中，則使用 <UNK> 的索引
        indexed_sentence = [vocab.get(token, vocab['<UNK>']) for token in tokens]
        formatted_sentences.append(indexed_sentence)
    return formatted_sentences

# 將資料集中的句子提取出來
train_sentences = [example['sentence'] for example in ptb[0]]
validation_sentences = [example['sentence'] for example in ptb[1]]
test_sentences = [example['sentence'] for example in ptb[2]]

# 建立詞彙表 (基於訓練集)
vocab = build_vocab(train_sentences)

# 格式化句子
train_data = format_sentences(train_sentences, vocab)
validation_data = format_sentences(validation_sentences, vocab)
test_data = format_sentences(test_sentences, vocab)

# 檢查詞彙索引範圍，並打印出錯誤的句子及其索引
def check_vocab_range(data, vocab_size):
    for sentence_idx, sentence in enumerate(data):
        for idx in sentence:
            if idx >= vocab_size or idx < 0:
                # 打印出超出範圍的索引以及其對應的原始句子
                print(f"Out of range index found in sentence {sentence_idx}: {idx}")
                print(f"Sentence: {data[sentence_idx]}")
                return False
    return True

# 確保詞彙表和數據的匹配
try:
    assert check_vocab_range(train_data, len(vocab)), "Train data contains out of range indices!"
    assert check_vocab_range(validation_data, len(vocab)), "Validation data contains out of range indices!"
    assert check_vocab_range(test_data, len(vocab)), "Test data contains out of range indices!"
except AssertionError as e:
    print(e)

# 打印詞彙表大小
print(f"Vocabulary size: {len(vocab)}")

# 打印示例句子及其索引
for i in range(5):
    print(f"Sentence {i}: {train_sentences[i]}")
    print(f"Indexed sentence: {train_data[i]}")


Out of range index found in sentence 37973: 9616
Sentence: [9616, 459, 49, 101, 81, 33, 1354, 37, 1105, 37, 30, 411, 112, 30, 45, 113, 814, 2272, 2273, 3488, 7008, 67, 740, 474, 67, 4661, 662, 98, 290, 38, 104, 2493, 3488, 5788]
Train data contains out of range indices!
Vocabulary size: 9616
Sentence 0: aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter
Indexed sentence: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 24, 1, 1, 1]
Sentence 1: pierre <unk> N years old will join the board as a nonexecutive director nov. N
Indexed sentence: [28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 30]
Sentence 2: mr. <unk> is chairman of <unk> n.v. the dutch publishing group
Indexed sentence: [42, 29, 43, 44, 45, 29, 30, 46, 35, 47, 48, 49]
Sentence 3: rudolph <unk> N years old and former chairman of consolidated gold fields plc was named a nonexec