### NER

In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Off-the-shelf baseline: name the checkpoint once, load its tokenizer and
# weights from it, and wrap both in a token-classification pipeline.
baseline_checkpoint = "dslim/bert-base-NER"

tokenizer = AutoTokenizer.from_pretrained(baseline_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(baseline_checkpoint)

nlp = pipeline(task="ner", model=model, tokenizer=tokenizer)


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are ini

[{'entity': 'B-PER', 'score': np.float32(0.9990139), 'index': 4, 'word': 'Wolfgang', 'start': 11, 'end': 19}, {'entity': 'B-LOC', 'score': np.float32(0.999645), 'index': 9, 'word': 'Berlin', 'start': 34, 'end': 40}]


In [2]:
# Smoke-test the pretrained pipeline on one hand-written sentence.
example = "My name is Wolfgang and I live in Berlin"

# The pipeline returns a list with one dict per tagged token
# (entity, score, index, word, start, end), as the printed output shows.
ner_results = nlp(example)
print(ner_results)

[{'entity': 'B-PER', 'score': np.float32(0.9990139), 'index': 4, 'word': 'Wolfgang', 'start': 11, 'end': 19}, {'entity': 'B-LOC', 'score': np.float32(0.999645), 'index': 9, 'word': 'Berlin', 'start': 34, 'end': 40}]


### Imports

In [None]:
import numpy as np
from datasets import Dataset
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer
)
from seqeval.metrics import precision_score, recall_score, f1_score

  from .autonotebook import tqdm as notebook_tqdm


### Load dataset

In [None]:
# Load the three FiNER-ORD CSV splits from the Kaggle input directory.
# Each CSV row is a single token with its label and its document/sentence index
# (columns: gold_label, gold_token, doc_idx, sent_idx).
# NOTE(review): group_sentences has to skip non-string tokens, which suggests the
# CSV reader turns some token values into missing values — confirm whether default
# NA parsing should be disabled so that no real tokens are silently dropped.
dataset = load_dataset(
    "csv",
    data_files={
        "train": "/kaggle/input/finer-ord/train.csv",
        "validation": "/kaggle/input/finer-ord/val.csv",
        "test": "/kaggle/input/finer-ord/test.csv"
    }
)

# Show the split sizes and one raw row before any regrouping.
print(dataset)
print(dataset["train"][0])


def group_sentences(ds_split):
    """Collapse a token-per-row split into one record per sentence.

    Rows are bucketed by their (doc_idx, sent_idx) pair, in first-seen order.
    A row whose gold_token is not a string, or is empty/whitespace-only, is
    skipped together with its label.

    Args:
        ds_split: Iterable of row mappings with the keys 'gold_token',
            'gold_label', 'doc_idx' and 'sent_idx'.

    Returns:
        The result of Dataset.from_list on a list of
        {'tokens': [...], 'ner_tags': [...]} records, one per sentence.
    """
    sentences = {}
    for row in ds_split:
        word = row['gold_token']
        # isinstance() already rejects None, so no separate None check is needed.
        if not (isinstance(word, str) and word.strip()):
            continue
        bucket = sentences.setdefault(
            (row['doc_idx'], row['sent_idx']),
            {'tokens': [], 'ner_tags': []},
        )
        bucket['tokens'].append(word)
        bucket['ner_tags'].append(row['gold_label'])

    # Only include non-empty sentences.
    records = []
    for bucket in sentences.values():
        if bucket['tokens']:
            records.append({'tokens': bucket['tokens'], 'ner_tags': bucket['ner_tags']})
    return Dataset.from_list(records)

# Swap every token-per-row split for its sentence-level counterpart.
for split_name in ("train", "validation", "test"):
    dataset[split_name] = group_sentences(dataset[split_name])

# Show the new split sizes and the first grouped sentence.
print(dataset)
print(dataset["train"][0])

DatasetDict({
    train: Dataset({
        features: ['gold_label', 'gold_token', 'doc_idx', 'sent_idx'],
        num_rows: 80531
    })
    validation: Dataset({
        features: ['gold_label', 'gold_token', 'doc_idx', 'sent_idx'],
        num_rows: 10233
    })
    test: Dataset({
        features: ['gold_label', 'gold_token', 'doc_idx', 'sent_idx'],
        num_rows: 25957
    })
})
{'gold_label': 0, 'gold_token': 'Kenyan', 'doc_idx': 0, 'sent_idx': 0}
DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 3262
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 402
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 1075
    })
})
{'tokens': ['Kenyan', 'Firms', 'Eye', 'Deals', 'During', 'Obama', 'Summit', 'Tagged', ':', 'The', 'Global', 'Entrepreneurship', 'Summit', ',', 'launched', 'by', 'President', 'Obama', 'in', '2009', ',', 'brings', 'together', 'entrepreneurs',

### Tokenize

In [27]:

# Checkpoint to fine-tune; its tokenizer is used for all preprocessing below.
model_name = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tag inventory in id order: a tag's position in this list is its integer id.
label_list = ['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG']

# Both lookup directions are derived from the same list so they stay in sync.
label_to_id = {tag: tag_id for tag_id, tag in enumerate(label_list)}
id_to_label = dict(enumerate(label_list))

print("Labels:", label_list)


def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and project word-level labels onto sub-tokens.

    Args:
        examples: Batch mapping with "tokens" (a list of word lists) and
            "ner_tags" (a parallel list of per-word label ids).

    Returns:
        The tokenizer output with an added "labels" entry. For each sequence,
        the first sub-token of every word carries that word's label; special
        tokens and continuation sub-tokens are set to -100.
    """
    # Diagnostic pass: report (but do not drop) any token that is not a string.
    for i, sent in enumerate(examples["tokens"]):
        for j, tok in enumerate(sent):
            if not isinstance(tok, str):
                print(f"Non-string token in example {i}, position {j}: {tok}")

    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
    )

    def _align(word_ids, word_labels):
        """Map one sequence of word ids to per-sub-token label ids."""
        aligned = []
        prev_word = None
        for word_id in word_ids:
            # A label is emitted only where a new word starts; everything else
            # (special tokens, later pieces of the same word) becomes -100.
            starts_word = word_id is not None and word_id != prev_word
            aligned.append(word_labels[word_id] if starts_word else -100)
            prev_word = word_id
        return aligned

    encoded["labels"] = [
        _align(encoded.word_ids(batch_index=row), tags)
        for row, tags in enumerate(examples["ner_tags"])
    ]
    return encoded


# Tokenize and label-align every split; batched=True passes the function a
# batch of sentences at a time, which is the shape it iterates over.
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

Labels: ['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG']


Map:   0%|          | 0/3262 [00:00<?, ? examples/s]

Map:   0%|          | 0/402 [00:00<?, ? examples/s]

Map:   0%|          | 0/1075 [00:00<?, ? examples/s]

### Model

In [28]:
# Re-load the checkpoint with a classification head sized for label_list.
# The checkpoint's own head has 9 outputs while this label set has 7 (see the
# shape-mismatch message in the output below), so ignore_mismatched_sizes=True
# is passed and the classifier weights are newly initialized.
model = AutoModelForTokenClassification.from_pretrained(
    model_name, 
    num_labels=len(label_list), 
    id2label=id_to_label, 
    label2id=label_to_id,
    ignore_mismatched_sizes=True
)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at dslim/bert-base-NER and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([7]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768]) 

### Train

In [29]:
# Fine-tuning hyperparameters. Evaluation and checkpointing both run once per
# epoch; with load_best_model_at_end=True the checkpoint selected by
# metric_for_best_model ("f1", a key returned by compute_metrics) is the one
# kept at the end of training.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    disable_tqdm=False,
    report_to="none"  # turns off wandb and any other logging integrations
)


def compute_metrics(p):
    """Compute precision, recall and F1 from an evaluation prediction.

    Args:
        p: A (predictions, labels) pair. predictions holds per-token class
            scores and is arg-maxed over axis 2; labels holds label ids, with
            -100 marking positions to ignore.

    Returns:
        Dict with the keys "precision", "recall" and "f1".
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    # Gold tag names, with ignored (-100) positions removed.
    true_labels = []
    for gold_row in gold_ids:
        true_labels.append([id_to_label[gold] for gold in gold_row if gold != -100])

    # Predicted tag names at exactly the positions kept above.
    true_predictions = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        true_predictions.append(
            [id_to_label[guess] for guess, gold in zip(pred_row, gold_row) if gold != -100]
        )

    scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {name: scorer(true_labels, true_predictions) for name, scorer in scorers}


# Collator that batches the tokenized examples (inputs and labels) for training.
data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # `tokenizer=` is deprecated for Trainer.__init__ (the truncated warning in
    # this cell's output appears to be that notice); `processing_class=` is the
    # replacement argument and receives the same tokenizer object.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


# Run fine-tuning; the per-epoch metrics table is rendered in the cell output.
print("Начинаем обучение...")
trainer.train()
print("Обучение завершено.")

  trainer = Trainer(


Начинаем обучение...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.0475,0.0449,0.883607,0.885057,0.884331
2,0.0226,0.041599,0.912252,0.904762,0.908491
3,0.0176,0.042218,0.907743,0.904762,0.90625


Обучение завершено.


### Evaluate

In [30]:
# Score the trained model on the held-out test split.
results = trainer.evaluate(tokenized_datasets["test"])
print(results)

# Write model and tokenizer files into the same directory.
export_dir = "/kaggle/working/models/ner_model"
trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)

{'eval_loss': 0.06925616413354874, 'eval_precision': 0.7867892976588629, 'eval_recall': 0.8261633011413521, 'eval_f1': 0.8059957173447537, 'eval_runtime': 2.3791, 'eval_samples_per_second': 451.856, 'eval_steps_per_second': 28.583, 'epoch': 3.0}


('/kaggle/working/models/ner_model/tokenizer_config.json',
 '/kaggle/working/models/ner_model/special_tokens_map.json',
 '/kaggle/working/models/ner_model/vocab.txt',
 '/kaggle/working/models/ner_model/added_tokens.json',
 '/kaggle/working/models/ner_model/tokenizer.json')

In [32]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Directory the fine-tuned model and tokenizer were saved to.
model_path = "/kaggle/working/models/ner_model"

# Reload both artifacts from disk to confirm the saved directory is usable.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

# With aggregation_strategy="simple" each result is a merged entity span that
# exposes an 'entity_group' key (e.g. "Barack Obama" is one PER entity in the
# printed output below).
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

text = "Barack Obama was born in Hawaii."

ner_results = ner_pipeline(text)

# Print one line per detected entity with its label and confidence score.
for entity in ner_results:
    print(f"Entity: {entity['word']}, Label: {entity['entity_group']}, Score: {entity['score']:.4f}")


Device set to use cuda:0


Entity: Barack Obama, Label: PER, Score: 0.9935
Entity: Hawaii, Label: LOC, Score: 0.9948
