In [140]:
from datasets import load_dataset
language = 'en'

dataset = load_dataset('wikiann','en')
dataset["train"][2]

Found cached dataset wikiann (/home/s6amalia/.cache/huggingface/datasets/wikiann/en/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e)


  0%|          | 0/3 [00:00<?, ?it/s]

{'tokens': ['Karl', 'Ove', 'Knausgård', '(', 'born', '1968', ')'],
 'ner_tags': [1, 2, 2, 0, 0, 0, 0],
 'langs': ['en', 'en', 'en', 'en', 'en', 'en', 'en'],
 'spans': ['PER: Karl Ove Knausgård']}

In [141]:
dataset

DatasetDict({
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'spans'],
        num_rows: 20000
    })
})

In [142]:
label_list = dataset["train"].features[f"ner_tags"].feature.names
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

In [143]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [144]:
example = dataset["train"][2]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['[CLS]',
 'karl',
 'o',
 '##ve',
 'kn',
 '##aus',
 '##gard',
 '(',
 'born',
 '1968',
 ')',
 '[SEP]']

In [145]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [146]:
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Loading cached processed dataset at /home/s6amalia/.cache/huggingface/datasets/wikiann/en/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e/cache-7431a44ae42d950d.arrow


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Loading cached processed dataset at /home/s6amalia/.cache/huggingface/datasets/wikiann/en/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e/cache-35796646de0bdc3d.arrow


In [147]:
tokenized_inputs = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
word_ids = tokenized_inputs.word_ids(batch_index=0)
word_ids

[None, 0, 1, 1, 2, 2, 2, 3, 4, 5, 6, None]

In [148]:
tokens = tokenizer.convert_ids_to_tokens(tokenized_dataset['test']["input_ids"][0])
tokenized_dataset['test']["tokens"][0].word_ids(batch_index=0)

AttributeError: 'list' object has no attribute 'word_ids'

In [149]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [150]:
import evaluate

seqeval = evaluate.load("seqeval")

In [151]:
import numpy as np

labels = [label_list[i] for i in example[f"ner_tags"]]


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [152]:
labels = [label_list[i] for i in example[f"ner_tags"]]
labels

['B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O']

In [153]:

id2label = {
    0: "O",
    1: 'B-PER',
    2: 'I-PER',
    3: 'B-ORG',
    4: 'I-ORG',
    5: 'B-LOC',
    6: 'I-LOC',
}
label2id = {
    "O": 0,
    'B-PER': 1,
    'I-PER': 2,
    'B-ORG': 3,
    'I-ORG': 4,
    'B-LOC': 5,
    'I-LOC': 6,
}

In [16]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    'xlm-roberta-base', num_labels=7, id2label=id2label, label2id=label2id
)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
training_args = TrainingArguments(
    output_dir="./wikiann_fine_tuned",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="no",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

In [155]:
from transformers import pipeline
classifier = pipeline("ner", model="/home/s6amalia/xlmroberta-wikiann-en.pt", tokenizer = 'xlm-roberta-base')

classifier(' '.join(dataset["test"]['tokens'][0]))

[{'entity': 'B-LOC',
  'score': 0.98101395,
  'index': 18,
  'word': '▁India',
  'start': 68,
  'end': 73},
 {'entity': 'B-LOC',
  'score': 0.9404207,
  'index': 23,
  'word': '▁Ad',
  'start': 90,
  'end': 92},
 {'entity': 'I-LOC',
  'score': 0.8774265,
  'index': 24,
  'word': 'yar',
  'start': 92,
  'end': 95}]

In [291]:
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification
import pandas as pd
import torch
model = AutoModelForTokenClassification.from_pretrained("/home/s6amalia/xlmroberta-wikiann-de.pt")
tokenizer = AutoTokenizer.from_pretrained("/home/s6amalia/xlmroberta-wikiann-de.tk")


In [313]:
dataset_typo =load_dataset("json",data_files = '/home/s6amalia/thesis/Wikiann_noisy_dataset/test_es_typos_0.1.jsonl')
dataset_typo = dataset_typo['train']
dataset_typo

Downloading and preparing dataset json/default to /home/s6amalia/.cache/huggingface/datasets/json/default-9dacbee0a4700c15/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/s6amalia/.cache/huggingface/datasets/json/default-9dacbee0a4700c15/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['tokens', 'ner_tags', 'langs'],
    num_rows: 7831
})

In [319]:
from datasets import load_dataset
language = 'es'

dataset = load_dataset('wikiann','es')

Found cached dataset wikiann (/home/s6amalia/.cache/huggingface/datasets/wikiann/es/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e)


  0%|          | 0/3 [00:00<?, ?it/s]

In [308]:
def cal_accuracy(dataset):
    right_count = 0
    wrong_count = 0
    for i in range(len(dataset)):
        print(int(i*100/len(dataset)),'%', end='\r')
        inputs = dataset['tokens'][i]
        inputs = tokenizer(inputs,truncation=True, is_split_into_words=True,return_tensors="pt")
        with torch.no_grad():
            logits = model(**inputs).logits
        predictions = torch.argmax(logits, dim=-1)
#         predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]]
        res=pd.DataFrame([inputs.word_ids(batch_index=0),predictions[0].tolist()])

    # tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        res=res.transpose()
        res.columns=['idx','label']
        res=res.drop_duplicates(subset=['idx'], keep='first')[1:]
        res['true_label']  = dataset[i]['ner_tags']
        r = 0
        w = 0
        for j in range(len(res)):
            if (res['true_label'].iloc[j] == res['label'].iloc[j]):
                if res['true_label'].iloc[j] != 0 :
                    r = r+1
            else:         
                w = w +1 

        right_count = right_count + r
        wrong_count = wrong_count + w
        
    return right_count,wrong_count

In [324]:
right_count , wrong_count = cal_accuracy(dataset_typo)


11 %

KeyboardInterrupt: 

In [None]:
print(right_count)
print(wrong_count)
print('Accuracy = ',100*right_count/(right_count+wrong_count))

In [211]:
' '.join(dataset["test"]['tokens'][3])

"'' Mycalesis perseus lalassis '' ( Hewitson , 1864 )"

In [323]:
len(dataset["test"])

10000

In [225]:
tokenized_dataset['test']['labels'][3]

22

In [227]:
dataset["test"][3]['ner_tags']

[0, 5, 6, 6, 0, 0, 0, 0, 0, 0]