In [None]:
!pip install -q datasets
!pip install -q evaluate
!pip install -q seqeval
!pip install -q -U transformers
!pip install -q transformers[torch]
!pip install -q -U accelerate

In [57]:
model_name = "Evolett/rubert-tiny2-finetuned-ner"  # Name of the BERT model
max_len = 512  # Maximum length of input sequences for the model

In [58]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Initialize tokenizer with specified model name and maximum length
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=max_len)

In [60]:
from datasets import load_dataset

# Load dataset from "iluvvatar/RuNNE"
dataset = load_dataset("iluvvatar/RuNNE")
dataset

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'entities'],
        num_rows: 461
    })
    test: Dataset({
        features: ['id', 'text', 'entities'],
        num_rows: 93
    })
    dev: Dataset({
        features: ['id', 'text', 'entities'],
        num_rows: 323
    })
})

In [61]:
entities = set()  # Initialize set to store unique entity labels

# Extract unique entity labels from the 'entities' field in the train split
for sent in dataset['train']['entities']:
    entities = entities.union(set(ent.split()[2] for ent in sent))

# Initialize mappings for entity labels to IDs and vice versa
entity2id = {'O': 0}
id2entity = {0: 'O'}

# Assign IDs to each entity label (BIO format)
for idx, entity in enumerate(entities):
    entity2id[f'B-{entity}'] = 2 * idx + 1
    id2entity[2 * idx + 1] = f'B-{entity}'
    entity2id[f'I-{entity}'] = 2 * idx + 2
    id2entity[2 * idx + 2] = f'I-{entity}'

print("Entity to ID mapping:")
print(entity2id)
print("\nID to Entity mapping:")
print(id2entity)

Entity to ID mapping:
{'O': 0, 'B-ORGANIZATION': 1, 'I-ORGANIZATION': 2, 'B-AWARD': 3, 'I-AWARD': 4, 'B-FAMILY': 5, 'I-FAMILY': 6, 'B-STATE_OR_PROVINCE': 7, 'I-STATE_OR_PROVINCE': 8, 'B-ORDINAL': 9, 'I-ORDINAL': 10, 'B-PERSON': 11, 'I-PERSON': 12, 'B-EVENT': 13, 'I-EVENT': 14, 'B-PERCENT': 15, 'I-PERCENT': 16, 'B-NUMBER': 17, 'I-NUMBER': 18, 'B-DATE': 19, 'I-DATE': 20, 'B-IDEOLOGY': 21, 'I-IDEOLOGY': 22, 'B-TIME': 23, 'I-TIME': 24, 'B-PRODUCT': 25, 'I-PRODUCT': 26, 'B-MONEY': 27, 'I-MONEY': 28, 'B-RELIGION': 29, 'I-RELIGION': 30, 'B-LOCATION': 31, 'I-LOCATION': 32, 'B-NATIONALITY': 33, 'I-NATIONALITY': 34, 'B-PROFESSION': 35, 'I-PROFESSION': 36, 'B-WORK_OF_ART': 37, 'I-WORK_OF_ART': 38, 'B-AGE': 39, 'I-AGE': 40, 'B-PENALTY': 41, 'I-PENALTY': 42, 'B-FACILITY': 43, 'I-FACILITY': 44, 'B-DISTRICT': 45, 'I-DISTRICT': 46, 'B-CRIME': 47, 'I-CRIME': 48, 'B-COUNTRY': 49, 'I-COUNTRY': 50, 'B-LAW': 51, 'I-LAW': 52, 'B-CITY': 53, 'I-CITY': 54, 'B-LANGUAGE': 55, 'I-LANGUAGE': 56, 'B-DISEASE': 57, '

In [62]:
import pandas as pd
from datasets import Dataset, concatenate_datasets


def dataset_maker(file_name='/kaggle/input/nlp-a3-data/train.jsonl', is_train=True):
    """
    Function to create a dataset from a JSON file.

    Args:
        file_name (str, optional): Path to the JSON file.
        is_train (bool, optional): Dataset is for training or not.

    Returns:
        Dataset: Dataset created from the JSON file.
    """
    df = pd.read_json(file_name, lines=True)    
    if is_train:
        df = df.rename({'sentences': 'text', 'ners': 'entities'}, axis=1)[['text', 'entities']]
        entities = []
        for row in df.iterrows():
            entities.append([f'{ent[0]} {ent[1]} {ent[2]}' for ent in row[1]['entities']])
        df['entities'] = entities
    else:
        df = df.rename({'senences': 'text'}, axis=1)
    dataset = Dataset.from_pandas(df, preserve_index=False)
    return dataset

json_dataset = dataset_maker()
json_dataset

Dataset({
    features: ['text', 'entities'],
    num_rows: 519
})

In [None]:
def format_hf(example):
    """
    Function to format entity annotations for Hugging Face datasets.

    Args:
        example (dict): Example containing 'entities' field.

    Returns:
        dict: Example with formatted entity annotations.
    """
    for j in range(len(example['entities'])):
        s, e, E = example['entities'][j].split()
        example['entities'][j] = f'{s} {int(e) - 1} {E}'  # Format end position by subtracting 1
    return example


dataset['train'] = dataset['train'].map(format_hf)
dataset['test'] = dataset['test'].map(format_hf)

In [63]:
dataset['train'] = concatenate_datasets([json_dataset, dataset['train']])
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'entities', 'id'],
        num_rows: 980
    })
    test: Dataset({
        features: ['id', 'text', 'entities'],
        num_rows: 93
    })
    dev: Dataset({
        features: ['id', 'text', 'entities'],
        num_rows: 323
    })
})

## 29 models

In [None]:
import evaluate

seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [id2entity[p] for (p, l) in zip(prediction, label)]# if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2entity[l] for (p, l) in zip(prediction, label)]# if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [64]:
import numpy as np
from transformers import TrainingArguments, Trainer

def create_dataset_model(b_entity_id):
    def mapper_2(example):
        """
        Function to map text and entity annotations to tokenized inputs and NER tags.

        Args:
            example (dict): Example containing 'text' and 'entities' fields.

        Returns:
            dict: Tokenized inputs with NER tags.
        """
        text = example['text']  # Extract text from example
        entities = example['entities']  # Extract entity annotations
        ner_tags = [0] * max_len  # Initialize NER tags with zeros
         # Iterate over entity annotations and assign BIO tags to token positions
        for entity_str in entities:
            start, end, entity = entity_str.split()
            start, end = int(start), int(end)
            if entity not in id2entity[b_entity_id] and entity not in id2entity[b_entity_id + 1]:
                continue
            token_start = len(tokenizer.tokenize(text[0:start]))  # Start token index
            subtext = tokenizer.tokenize(text[start:end + 1])  # Tokenize subtext containing entity
            for token_id in range(token_start, min(len(ner_tags), token_start + len(subtext) + 1)):
                if token_id == token_start:
                    if ner_tags[token_id] == 0:
                        ner_tags[token_id] = 1 # Assign B-tag for beginning of entity
                else:
                    if ner_tags[token_id] == 0:
                        ner_tags[token_id] = 2 # Assign I-tag for inside of entity
        # Tokenize text and assign NER tags to corresponding token positions
        result = tokenizer(text, max_length=max_len, padding='max_length', truncation=True)
        result['labels'] = ner_tags

        # Check if the length of labels and input_ids doesn't match
        assert len(result['labels']) != len(result['input_ids'])
        return result
    
    train_dataset = dataset['train'].map(mapper_2)
    test_dataset = dataset['test'].map(mapper_2)
    
    model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=3, 
                                                            id2label={0: 'O', 1: id2entity[b_entity_id], 2: id2entity[b_entity_id + 1]}, 
                                                            label2id={'O': 0, id2entity[b_entity_id]: 1, id2entity[b_entity_id + 1]: 2}, 
                                                            ignore_mismatched_sizes=True)

    training_args = TrainingArguments(
        output_dir=id2entity[b_entity_id],
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=15,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=True,
        report_to='tensorboard',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
    )
    
    trainer.train()

In [65]:
for i in range(1, 58, 2):
    create_dataset_model(i)

Map:   0%|          | 0/980 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (660 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/93 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at tesemnikov-av/rubert-ner-toxicity and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([2, 312]) in the checkpoint and torch.Size([3, 312]) in the model instantiated
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([3]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.265828,0.0,0.0,0.0,0.938298
2,No log,0.237086,0.0,0.0,0.0,0.93813
3,No log,0.219985,0.0,0.0,0.0,0.939306
4,No log,0.208101,0.021978,0.008097,0.011834,0.939915
5,No log,0.20099,0.014881,0.010121,0.012048,0.940587
6,No log,0.194649,0.018717,0.01417,0.016129,0.941385
7,No log,0.189918,0.016746,0.01417,0.015351,0.942456
8,No log,0.186896,0.025591,0.026316,0.025948,0.942687
9,No log,0.183778,0.026639,0.026316,0.026477,0.943695
10,No log,0.181914,0.030741,0.034413,0.032474,0.94338


  _warn_prf(average, modifier, msg_start, len(result))


KeyboardInterrupt: 

In [50]:
from transformers import pipeline

class megamodel:
    def __init__(self):
        self.pipelines = {}
        for i in range(1, len(id2entity), 2):
            self.pipelines[i] = pipeline('ner', 
                                    tokenizer=tokenizer, 
                                    model=AutoModelForTokenClassification.from_pretrained(f'/kaggle/working/{id2entity[i]}/checkpoint-465', 
                                                                                          local_files_only=True),
                                    aggregation_strategy='simple')
    def predict(self, text):
        predictions = []
        for idx in self.pipelines:
            for i in range(0, len(text), 512):
                predictions += self.pipelines[idx](text[i:min(i+512, len(text))])
        result = []
        for token in predictions:
            result += [[token['start'], token['end'], token['entity_group']]]
        return result

In [51]:
mega_pipeline = megamodel()

In [53]:
def get_answer(text, id):
    res = mega_pipeline.predict(text)
    return {'id': id, 'ners': res}

In [54]:
dev_dataset = dataset_maker('/kaggle/input/nlp-a3-data/dev.jsonl', is_train=False)
dev_dataset

Dataset({
    features: ['text', 'id'],
    num_rows: 65
})

In [55]:
dev_answers = []
for sample in dev_dataset:
    dev_answers += [get_answer(sample['text'], sample['id'])]

In [56]:
import json

def save_jsonl(data, filename):
    with open(filename, 'w') as file:
        for item in data:
            json.dump(item, file)
            file.write('\n')

save_jsonl(dev_answers, 'test.jsonl')