In [1]:
# Install the libraries imported in the next cell (`transformers`, `datasets`)
# into the environment of the running kernel.
# NOTE(review): no versions are pinned here, so a fresh run may pick up newer releases.
%pip install transformers datasets

Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from transformers import BertTokenizerFast, BertForTokenClassification

# Load the KLUE NER corpus. Its "validation" split is what this notebook
# treats as the test set.
dataset = load_dataset("klue", "ner")
train_dataset, test_dataset = dataset["train"], dataset["validation"]

# Load the tokenizer from the KoBERT checkpoint.
# NOTE(review): the monologg/kobert vocabulary is presumably SentencePiece-based;
# confirm that BertTokenizerFast tokenizes Korean text as intended (for example
# by checking how many [UNK] tokens a few sample sentences produce).
tokenizer = BertTokenizerFast.from_pretrained("monologg/kobert")

# Dataset definition
class NERDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with per-token labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to an
            indexable collection holding one entry per example.
        labels: Indexable collection holding one label sequence per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict holds one tensor per encoding field plus a ``'labels'`` tensor.
        """
        sample = {}
        for field_name, column in self.encodings.items():
            sample[field_name] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Preprocessing function
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split examples and attach token-level labels.

    Args:
        examples: Batch with a ``"tokens"`` entry (pre-split words per
            example) and a ``"ner_tags"`` entry (one tag per word).

    Returns:
        The tokenizer output for the batch with an extra ``"labels"`` entry.
        Each token receives the tag of the word it came from; positions with
        no source word (word id ``None``) receive ``-100``.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned_rows = []
    for batch_idx, word_tags in enumerate(examples["ner_tags"]):
        row = []
        for word_id in tokenized_inputs.word_ids(batch_index=batch_idx):
            if word_id is None:
                # No originating word for this position.
                row.append(-100)
            else:
                row.append(word_tags[word_id])
        aligned_rows.append(row)

    tokenized_inputs["labels"] = aligned_rows
    return tokenized_inputs

# Dataset preprocessing.
# These mapped datasets are kept so that any later cell referring to
# `tokenized_train` / `tokenized_test` continues to work.
tokenized_train = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_test = test_dataset.map(tokenize_and_align_labels, batched=True)

# Padded encodings: with padding=True every row is padded to the longest
# sequence in the split, so all rows share one length.
train_encodings = tokenizer(train_dataset['tokens'], truncation=True, padding=True, is_split_into_words=True)
test_encodings = tokenizer(test_dataset['tokens'], truncation=True, padding=True, is_split_into_words=True)


def _align_labels_to_encodings(encodings, ner_tags):
    """Build one label row per example, as long as that example's encoding.

    Args:
        encodings: Batch tokenizer output that exposes ``word_ids(batch_index=i)``.
        ner_tags: One sequence of word-level tags per example, in the same
            order as the examples in ``encodings``.

    Returns:
        A list of label rows. Each token takes the tag of its source word;
        positions whose word id is ``None`` (including padding) get ``-100``.
    """
    return [
        [-100 if word_id is None else tags[word_id]
         for word_id in encodings.word_ids(batch_index=i)]
        for i, tags in enumerate(ner_tags)
    ]


# The labels must be aligned with the *padded* encodings that NERDataset
# serves. The rows in tokenized_*["labels"] were produced without padding, so
# they are shorter than the padded input_ids and cannot be batched or compared
# with the model's per-token output.
train_labels = _align_labels_to_encodings(train_encodings, train_dataset["ner_tags"])
test_labels = _align_labels_to_encodings(test_encodings, test_dataset["ner_tags"])

# From here on `train_dataset` / `test_dataset` are torch Datasets of tensors.
train_dataset = NERDataset(train_encodings, train_labels)
test_dataset = NERDataset(test_encodings, test_labels)


  from .autonotebook import tqdm as notebook_tqdm
Downloading readme: 100%|██████████| 22.5k/22.5k [00:00<?, ?B/s]
Downloading data: 100%|██████████| 4.21M/4.21M [00:03<00:00, 1.33MB/s]
Downloading data: 100%|██████████| 1.06M/1.06M [00:01<00:00, 740kB/s]
Generating train split: 100%|██████████| 21008/21008 [00:00<00:00, 389036.03 examples/s]
Generating validation split: 100%|██████████| 5000/5000 [00:00<00:00, 312504.02 examples/s]
Map: 100%|██████████| 21008/21008 [00:02<00:00, 8903.70 examples/s]
Map: 100%|██████████| 5000/5000 [00:00<00:00, 8865.25 examples/s]
