In [2]:
from datasets import load_dataset
from collections import Counter
import re
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence

# Load the train / validation / test splits of the Penn Treebank text dataset.
# The splits are accessed positionally afterwards (ptb[0], ptb[1], ptb[2]).
ptb = load_dataset(
    'ptb_text_only',
    split=['train', 'validation', 'test'],
    trust_remote_code=True,
)

# 1. Build the vocabulary
def build_vocab(sentences, min_freq=2):
    """Build a word -> index vocabulary from an iterable of sentences.

    Each sentence is lower-cased and split into tokens with the regex
    ``\\w+``. Only tokens that occur at least ``min_freq`` times are kept.

    Indices are contiguous: ``<PAD>`` is 0, ``<UNK>`` is 1 and the kept
    words are numbered 2, 3, ... in order of first appearance, so every
    index is strictly less than ``len(vocab)``.

    Args:
        sentences: Iterable of sentence strings.
        min_freq: Minimum number of occurrences a token needs to be kept.

    Returns:
        A dict mapping each token (and the two special tokens) to its index.
    """
    # Count token frequencies over all sentences.
    counter = Counter()
    for sentence in sentences:
        counter.update(re.findall(r'\w+', sentence.lower()))

    # Special tokens take the first two indices (PAD for padding, UNK for
    # out-of-vocabulary tokens).
    vocab = {'<PAD>': 0, '<UNK>': 1}

    # Filter BEFORE numbering. Enumerating the unfiltered counter would leave
    # gaps in the index range, making the largest index exceed len(vocab).
    frequent_words = (word for word, count in counter.items() if count >= min_freq)
    for idx, word in enumerate(frequent_words, start=2):
        vocab[word] = idx

    return vocab

# 2. Convert sentences into lists of vocabulary indices
def format_sentences(sentences, vocab):
    """Map every sentence to the list of vocabulary indices of its tokens.

    Sentences are lower-cased and tokenized with the regex ``\\w+``; any
    token missing from ``vocab`` is replaced by the index of ``<UNK>``.

    Args:
        sentences: Iterable of sentence strings.
        vocab: Dict mapping tokens to integer indices; must contain ``<UNK>``.

    Returns:
        A list with one list of integer indices per input sentence.
    """
    return [
        [vocab.get(word, vocab['<UNK>']) for word in re.findall(r'\w+', text.lower())]
        for text in sentences
    ]

# Pull the raw sentence strings out of each split (train, validation, test).
train_sentences, validation_sentences, test_sentences = (
    [example['sentence'] for example in ptb[split_idx]]
    for split_idx in range(3)
)

# Build the vocabulary from the training split only.
vocab = build_vocab(train_sentences)

# Convert every split into lists of vocabulary indices.
train_data, validation_data, test_data = (
    format_sentences(split_sentences, vocab)
    for split_sentences in (train_sentences, validation_sentences, test_sentences)
)

def collate_fn(batch):
    """Collate a batch of dataset items into one padded tensor.

    The DataLoader hands over a list whose elements are the tuples produced
    by the TensorDataset; the first field of each tuple is the sentence
    tensor. The sentences are padded to a common length with the ``<PAD>``
    index taken from the module-level ``vocab``.

    Args:
        batch: List of tuples whose first element is a sentence tensor.

    Returns:
        The tensor returned by ``pad_sequence`` with ``batch_first=True``.
    """
    # Unpack the sentence tensor from each dataset tuple.
    sentence_tensors = []
    for sample in batch:
        sentence_tensors.append(sample[0])

    # Pad all sentences in the batch to the same length.
    return pad_sequence(sentence_tensors, batch_first=True, padding_value=vocab['<PAD>'])

# Re-define the DataLoaders, this time using the corrected collate_fn.
batch_size = 32

# Turn every indexed sentence into a 1-D LongTensor. These lists were
# referenced below but never defined, which raised a NameError.
train_tensors = [torch.tensor(indices, dtype=torch.long) for indices in train_data]
validation_tensors = [torch.tensor(indices, dtype=torch.long) for indices in validation_data]
test_tensors = [torch.tensor(indices, dtype=torch.long) for indices in test_data]

# Wrap the data in TensorDatasets. Each split is padded with <PAD> to the
# length of its longest sentence so the sentences stack into one 2-D tensor.
# NOTE: because every row of a split already has the same length, the padding
# done later inside collate_fn adds nothing further.
train_dataset = TensorDataset(pad_sequence(train_tensors, batch_first=True, padding_value=vocab['<PAD>']))
validation_dataset = TensorDataset(pad_sequence(validation_tensors, batch_first=True, padding_value=vocab['<PAD>']))
test_dataset = TensorDataset(pad_sequence(test_tensors, batch_first=True, padding_value=vocab['<PAD>']))

# Define the DataLoaders; only the training split is shuffled.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)



KeyboardInterrupt



In [None]:
# 1. Build the vocabulary
def build_vocab(sentences, min_freq=2):
    """Build a word -> index vocabulary from an iterable of sentences.

    Each sentence is lower-cased and split into tokens with the regex
    ``\\w+``. Only tokens that occur at least ``min_freq`` times are kept.

    Indices are contiguous: ``<PAD>`` is 0, ``<UNK>`` is 1 and the kept
    words are numbered 2, 3, ... in order of first appearance, so every
    index is strictly less than ``len(vocab)``.

    Args:
        sentences: Iterable of sentence strings.
        min_freq: Minimum number of occurrences a token needs to be kept.

    Returns:
        A dict mapping each token (and the two special tokens) to its index.
    """
    # Count token frequencies over all sentences.
    counter = Counter()
    for sentence in sentences:
        counter.update(re.findall(r'\w+', sentence.lower()))

    # Special tokens take the first two indices (PAD for padding, UNK for
    # out-of-vocabulary tokens).
    vocab = {'<PAD>': 0, '<UNK>': 1}

    # Filter BEFORE numbering. Enumerating the unfiltered counter would leave
    # gaps in the index range, making the largest index exceed len(vocab).
    frequent_words = (word for word, count in counter.items() if count >= min_freq)
    for idx, word in enumerate(frequent_words, start=2):
        vocab[word] = idx

    return vocab

# 2. Convert sentences into lists of vocabulary indices
def format_sentences(sentences, vocab):
    """Map every sentence to the list of vocabulary indices of its tokens.

    Sentences are lower-cased and tokenized with the regex ``\\w+``; any
    token missing from ``vocab`` is replaced by the index of ``<UNK>``.

    Args:
        sentences: Iterable of sentence strings.
        vocab: Dict mapping tokens to integer indices; must contain ``<UNK>``.

    Returns:
        A list with one list of integer indices per input sentence.
    """
    return [
        [vocab.get(word, vocab['<UNK>']) for word in re.findall(r'\w+', text.lower())]
        for text in sentences
    ]

# Pull the raw sentence strings out of each split (train, validation, test).
train_sentences, validation_sentences, test_sentences = (
    [example['sentence'] for example in ptb[split_idx]]
    for split_idx in range(3)
)

# Build the vocabulary from the training split only.
vocab = build_vocab(train_sentences)

# Convert every split into lists of vocabulary indices.
train_data, validation_data, test_data = (
    format_sentences(split_sentences, vocab)
    for split_sentences in (train_sentences, validation_sentences, test_sentences)
)

# Sanity-check the processed data.
print(f"Vocabulary size: {len(vocab)}")
print(f"First 3 training examples (as word indices): {train_data[:3]}")