In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = "My name is Wolfgang and I live in Berlin"

ner_results = nlp(example)
print(ner_results)

Downloading tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'entity': 'B-PER', 'score': 0.9990139, 'index': 4, 'word': 'Wolfgang', 'start': 11, 'end': 19}, {'entity': 'B-LOC', 'score': 0.999645, 'index': 9, 'word': 'Berlin', 'start': 34, 'end': 40}]


In [66]:

from datasets import load_dataset
import pandas as pd
language = 'en'

dataset = load_dataset('wikiann','en', split='test')
# Load the WikiANN dataset


# Display basic information about the dataset
print(dataset)

# Access the first example in the dataset
example = dataset['tokens'][0]
print("Example:", example)

dataset = dataset.rename_column("ner_tags", "labels")
dataset = dataset.rename_column("tokens", "text")

Found cached dataset wikiann (/home/s6amalia/.cache/huggingface/datasets/wikiann/en/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e)


Dataset({
    features: ['tokens', 'ner_tags', 'langs', 'spans'],
    num_rows: 10000
})
Example: ['Shortly', 'afterward', ',', 'an', 'encouraging', 'response', 'influenced', 'him', 'to', 'go', 'to', 'India', ';', 'he', 'arrived', 'at', 'Adyar', 'in', '1884', '.']


In [None]:
from transformers import XLMRobertaTokenizer, XLMRobertaForTokenClassification
import torch

# Load the pre-trained BERT model and tokenizer
model_name = "/home/s6amalia/xlmroberta-wikiann-en.pt"
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
model = XLMRobertaForTokenClassification.from_pretrained(model_name,num_labels=7)

In [117]:
dataset['text'][2]

['Blacktown', 'railway', 'station']

In [116]:
from transformers import pipeline
classifier = pipeline( model="/home/s6amalia/xlmroberta-wikiann-en.pt", truncation=True,tokenizer = 'xlm-roberta-base')

classifier(' '.join(dataset['text'][1]))

RuntimeError: Instantiating a pipeline without a task set raised an error: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/home/s6amalia/xlmroberta-wikiann-en.pt'. Use `repo_type` argument if needed.

In [118]:
# List of tokens to predict NER tags for
tokens = dataset['text'][2]
# Tokenize the input
inputs = tokenizer(tokens,  is_split_into_words=True,truncation=True,padding="max_length", return_tensors="pt")
model.eval()
# Make the prediction
outputs = model(**inputs).logits
predictions = torch.argmax(outputs, dim=-1)
label_map = {0: "O", 1: "B-PER", 2: "I-PER", 5: "B-LOC", 6: "I-LOC", 3: "B-ORG", 4: "I-ORG"}
predictions[inputs['attention_mask']==1]

tensor([3, 4, 4, 4, 4, 4, 4, 3])

In [119]:
dataset['labels'][2]

[3, 4, 4]

In [42]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "huggingface-course/bert-finetuned-ner"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
token_classifier(" ".join(dataset[0]['tokens']))

[{'entity_group': 'LOC',
  'score': 0.9995029,
  'word': 'India',
  'start': 68,
  'end': 73},
 {'entity_group': 'LOC',
  'score': 0.99907565,
  'word': 'Adyar',
  'start': 90,
  'end': 95}]

In [43]:
dataset[0]['spans']

['LOC: India', 'LOC: Adyar']