In [132]:
from datasets import load_dataset

data_files = {
    "train":"train.json",
    "test":"test.json",
    "valid":"valid.json"
}
dataset = load_dataset("json", data_files=data_files)

In [133]:
from transformers import AutoTokenizer
# Define label mappings explicitly
label_list = ["O", "B-DATE", "I-DATE", "B-FATALITY", "I-FATALITY", "B-COUNTRY", "I-COUNTRY"]
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

print(id2label)

{0: 'O', 1: 'B-DATE', 2: 'I-DATE', 3: 'B-FATALITY', 4: 'I-FATALITY', 5: 'B-COUNTRY', 6: 'I-COUNTRY'}


In [134]:
print(dataset["train"][11])

{'sentence': 'The scientists found that the death of a child was associated with a 21 percent increased risk of ischemic heart disease, or reduced blood flow to the heart.', 'tokens': [{'label': 'O', 'token': 'The'}, {'label': 'O', 'token': 'scientists'}, {'label': 'O', 'token': 'found'}, {'label': 'O', 'token': 'that'}, {'label': 'O', 'token': 'the'}, {'label': 'O', 'token': 'death'}, {'label': 'O', 'token': 'of'}, {'label': 'O', 'token': 'a'}, {'label': 'O', 'token': 'child'}, {'label': 'O', 'token': 'was'}, {'label': 'O', 'token': 'associated'}, {'label': 'O', 'token': 'with'}, {'label': 'O', 'token': 'a'}, {'label': 'O', 'token': '21'}, {'label': 'O', 'token': 'percent'}, {'label': 'O', 'token': 'increased'}, {'label': 'O', 'token': 'risk'}, {'label': 'O', 'token': 'of'}, {'label': 'O', 'token': 'ischemic'}, {'label': 'O', 'token': 'heart'}, {'label': 'O', 'token': 'disease'}, {'label': 'O', 'token': ','}, {'label': 'O', 'token': 'or'}, {'label': 'O', 'token': 'reduced'}, {'label':

In [135]:
# Define label mapping
label_map = {"O": 0, "B-COUNTRY":1, "I-COUNTRY":2, "B-DATE":3, "I-DATE":4, "B-FATALITY":5, "I-FATALITY":6}  # Update with actual labels

def convert_dataset_format(example):
    return {
        "tokens": [token["token"] for token in example["tokens"]],  # Extracting tokens
        "labels": [label_map[token["label"]] for token in example["tokens"]]  # Mapping labels
    }

# Apply the conversion
dataset = dataset.map(convert_dataset_format)

In [138]:
print(dataset['train'][10])

{'sentence': 'The study looked not only at the loss of infants and children, but also adolescents and adult children up to age 29.', 'tokens': ['The', 'study', 'looked', 'not', 'only', 'at', 'the', 'loss', 'of', 'infants', 'and', 'children', ',', 'but', 'also', 'adolescents', 'and', 'adult', 'children', 'up', 'to', 'age', '29', '.'], 'labels': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0]}


In [139]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base", add_prefix_space=True)

In [140]:
from transformers import AutoTokenizer

def tokenize_and_align_labels(examples, tokenizer):
    """
    Tokenize input text while properly aligning NER labels.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",  # Ensure uniform sequence length
        max_length=128,  # Adjust as needed
        is_split_into_words=True,
        return_tensors="pt",  # Ensure PyTorch format
        return_attention_mask=True,  # Generate attention masks
    )

    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=0)  # Get word index mapping
        previous_word_idx = None
        aligned_labels = []

        for word_id in word_ids:
            if word_id is None:
                aligned_labels.append(-100)  # Ignore special tokens
            elif word_id != previous_word_idx:
                if word_id < len(label):  # Prevent IndexError
                    aligned_labels.append(label[word_id])  # Assign label for first subword
                else:
                    aligned_labels.append(-100)  # Ignore out-of-range errors
            else:
                aligned_labels.append(-100)  # Mask subword tokens

            previous_word_idx = word_id

        labels.append(aligned_labels)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [124]:
tokenized_dataset = dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer), batched=True)

# Load into DataLoader
from torch.utils.data import DataLoader
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=data_collator)

Map:   0%|          | 0/1801 [00:00<?, ? examples/s]

Map:   0%|          | 0/3483 [00:00<?, ? examples/s]

Map:   0%|          | 0/1763 [00:00<?, ? examples/s]

In [105]:
from transformers import AutoTokenizer

def tokenize_and_align_labels(examples, tokenizer):
    """
    Tokenize input text while properly aligning NER labels.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",  # Ensure uniform sequence length
        max_length=128,  # Adjust as needed
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to word indices
        previous_word_idx = None
        aligned_labels = []

        for word_id in word_ids:
            if word_id is None:
                aligned_labels.append(-100)  # Ignore special tokens
            elif word_id != previous_word_idx:
                aligned_labels.append(label[word_id])  # Assign label for the first subword
            else:
                aligned_labels.append(label[word_id])  # Keep label for subword tokens

            previous_word_idx = word_id

        labels.append(aligned_labels)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [106]:
tokenized_dataset = dataset.map(
    lambda x: tokenize_and_align_labels(x, tokenizer),  # Pass tokenizer explicitly
    batched=True
)

Map:   0%|          | 0/1801 [00:00<?, ? examples/s]

Map:   0%|          | 0/3483 [00:00<?, ? examples/s]

Map:   0%|          | 0/1763 [00:00<?, ? examples/s]

In [125]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label_list),  # Ensure label_list is defined
    label2id=label2id,  # Maps labels to IDs
    id2label={v: k for k, v in label2id.items()}  # Maps IDs back to labels
)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [126]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,  # Keep only last 2 checkpoints
    learning_rate=2e-5,
    lr_scheduler_type="linear",  # Linear decay
    warmup_steps=500,  # Gradually increase LR at start
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    label_smoothing_factor=0.1,  # Helps generalization
    fp16=True,  # Enable mixed precision for faster training
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"],
    tokenizer=tokenizer
)

  trainer = Trainer(


In [127]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.6358,0.573016
2,0.5456,0.541025
3,0.5292,0.535924
4,0.538,0.536112
5,0.5209,0.535974


TrainOutput(global_step=1130, training_loss=0.6223127846169261, metrics={'train_runtime': 2750.5216, 'train_samples_per_second': 3.274, 'train_steps_per_second': 0.411, 'total_flos': 588270915006720.0, 'train_loss': 0.6223127846169261, 'epoch': 5.0})

In [128]:
metrics = trainer.evaluate()
print(metrics)

{'eval_loss': 0.53592449426651, 'eval_runtime': 124.1812, 'eval_samples_per_second': 14.197, 'eval_steps_per_second': 1.78, 'epoch': 5.0}


In [129]:
model.save_pretrained("mic_ner_model_01")
tokenizer.save_pretrained("mic_ner_model_01")

('mic_ner_model_01\\tokenizer_config.json',
 'mic_ner_model_01\\special_tokens_map.json',
 'mic_ner_model_01\\vocab.json',
 'mic_ner_model_01\\merges.txt',
 'mic_ner_model_01\\added_tokens.json',
 'mic_ner_model_01\\tokenizer.json')

In [130]:
tokenizer = AutoTokenizer.from_pretrained("mic_ner_model_01")
tokenizer.add_special_tokens({"bos_token": "<s>", "eos_token": "</s>"})


2