# Train RoBERTa model on NER:

In [2]:
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForTokenClassification,
    TrainingArguments, Trainer, DataCollatorForTokenClassification
)
import numpy as np
import torch
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Read Conll2003 Dataset

In [3]:
def read_conll_file(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read().strip()
        sentences = content.split("\n\n")
        data = []
        for sentence in sentences:
            if not sentence.strip():
                continue
            tokens = [line.split() for line in sentence.split("\n") if line.strip()]
            data.append(tokens)
        return data

base_path = "/content/drive/MyDrive/conll2003/"
train_data = read_conll_file(base_path + "eng.train")
valid_data = read_conll_file(base_path + "eng.testa")
test_data  = read_conll_file(base_path + "eng.testb")

### Build tag mapping

In [4]:
label_list = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

def extract_tokens_and_labels(data):
    tokens_list = []
    labels_list = []
    for sentence in data:
        tokens = [t[0] for t in sentence]
        labels = [t[3] for t in sentence]
        tokens_list.append(tokens)
        labels_list.append([label2id[l] for l in labels])
    return {"tokens": tokens_list, "ner_tags": labels_list}

raw_datasets = DatasetDict({
    "train": Dataset.from_dict(extract_tokens_and_labels(train_data)),
    "validation": Dataset.from_dict(extract_tokens_and_labels(valid_data)),
    "test": Dataset.from_dict(extract_tokens_and_labels(test_data)),
})


### Import Base Model and Tokenizer

In [5]:
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Label alignment function

In [6]:
data_collator = DataCollatorForTokenClassification(tokenizer)

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        max_length=512,
        return_overflowing_tokens=False,
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        prev_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != prev_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx] if label[word_idx] % 2 == 1 else -100)
            prev_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names
)

Map:   0%|          | 0/14987 [00:00<?, ? examples/s]

Map:   0%|          | 0/3466 [00:00<?, ? examples/s]

Map:   0%|          | 0/3684 [00:00<?, ? examples/s]

### Evaluation indicators(Use seqeval)

In [9]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [11]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=799afd1b1eb12de0d95c2cc5c5364604c8cff0d405b0bb0521b65b694e533305
  Stored in directory: /root/.cache/pip/wheels/5f/b8/73/0b2c1a76b701a677653dd79ece07cfabd7457989dbfbdcd8d7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [12]:
import evaluate
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

### Train config

In [13]:
training_args = TrainingArguments(
    output_dir=base_path + "results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    push_to_hub=False,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


### Train and eval

In [14]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1957,0.051216,0.941226,0.949849,0.945518,0.987016
2,0.041,0.04158,0.959007,0.963355,0.961176,0.990182
3,0.024,0.040988,0.961752,0.966566,0.964153,0.991054


TrainOutput(global_step=2811, training_loss=0.06710354130872836, metrics={'train_runtime': 639.1553, 'train_samples_per_second': 70.344, 'train_steps_per_second': 4.398, 'total_flos': 1080835163228142.0, 'train_loss': 0.06710354130872836, 'epoch': 3.0})

In [15]:
test_results = trainer.evaluate(tokenized_datasets["test"])
print(test_results)

{'eval_loss': 0.13540306687355042, 'eval_precision': 0.9203020967902216, 'eval_recall': 0.9291662486204475, 'eval_f1': 0.9247129306040938, 'eval_accuracy': 0.9787780719819554, 'eval_runtime': 14.3729, 'eval_samples_per_second': 256.316, 'eval_steps_per_second': 16.072, 'epoch': 3.0}


### Save trained model

In [16]:
model_save_path = base_path + "ner_roberta_model"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Model saved to: {model_save_path}")

Model saved to: /content/drive/MyDrive/conll2003/ner_roberta_model


### Predict on sentences

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

loaded_model = AutoModelForTokenClassification.from_pretrained(model_save_path)
loaded_tokenizer = AutoTokenizer.from_pretrained(model_save_path)

loaded_model.to(device)
print(f"use device: {device}")

def predict_single_sentence(sentence, model, tokenizer, label_list):
    tokens = sentence.split()
    print(f"Input Sentence: {sentence}")
    print(f"Tokenize result: {tokens}")

    inputs = tokenizer(
        tokens,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        padding=True
    )

    inputs = {key: value.to(device) for key, value in inputs.items()}

    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=2)

    predicted_labels = predictions[0].cpu().numpy()

    tokenized_input = tokenizer(
        tokens,
        is_split_into_words=True,
        truncation=True,
        max_length=512,
        padding=True
    )
    word_ids = tokenized_input.word_ids()

    previous_word_idx = None
    predicted_ner_tags = []

    for i, word_idx in enumerate(word_ids):
        if word_idx is None:
            continue
        elif word_idx != previous_word_idx:
            predicted_ner_tags.append(label_list[predicted_labels[i]])
            previous_word_idx = word_idx

    return tokens, predicted_ner_tags

def print_ner_results(tokens, labels):
    print("\nNER prediction:")
    print("-" * 40)
    for token, label in zip(tokens, labels):
        print(f"{token:15} -> {label}")
    print("-" * 40)

test_sentences = [
    "John Smith works at Google in California",
    "Apple Inc. is located in Cupertino and Tim Cook is the CEO",
    "Microsoft was founded by Bill Gates and Paul Allen"
]

for i, sentence in enumerate(test_sentences, 1):
    print(f"\nSentence {i}:")
    print("=" * 50)

    tokens, labels = predict_single_sentence(sentence, loaded_model, tokenizer, label_list)
    print_ner_results(tokens, labels)

use device: cuda

Sentence 1:
Input Sentence: John Smith works at Google in California
Tokenize result: ['John', 'Smith', 'works', 'at', 'Google', 'in', 'California']

NER prediction:
----------------------------------------
John            -> B-PER
Smith           -> I-PER
works           -> O
at              -> O
Google          -> B-ORG
in              -> O
California      -> B-LOC
----------------------------------------

Sentence 2:
Input Sentence: Apple Inc. is located in Cupertino and Tim Cook is the CEO
Tokenize result: ['Apple', 'Inc.', 'is', 'located', 'in', 'Cupertino', 'and', 'Tim', 'Cook', 'is', 'the', 'CEO']

NER prediction:
----------------------------------------
Apple           -> B-ORG
Inc.            -> I-ORG
is              -> O
located         -> O
in              -> O
Cupertino       -> B-LOC
and             -> O
Tim             -> B-PER
Cook            -> I-PER
is              -> O
the             -> O
CEO             -> O
----------------------------------------