In [1]:
import json
import re
import random
from datasets import load_dataset
import pandas as pd

import os
import pathlib
import re
import label_studio_sdk
import logging

from typing import List, Dict, Optional
from label_studio_ml.model import LabelStudioMLBase
from label_studio_ml.response import ModelResponse

  "cipher": algorithms.TripleDES,
  "class": algorithms.Blowfish,
  "class": algorithms.TripleDES,


In [2]:

from transformers import pipeline, Pipeline
from itertools import groupby
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, AutoTokenizer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, ClassLabel, Value, Sequence, Features
from functools import partial


In [3]:
logger = logging.getLogger(__name__)
_model: Optional[Pipeline] = None
MODEL_DIR = os.getenv('MODEL_DIR', './results')
BASELINE_MODEL_NAME = os.getenv('BASELINE_MODEL_NAME', 'Babelscape/wikineural-multilingual-ner')
BASELINE_MODEL_NAME = os.getenv('BASELINE_MODEL_NAME', 'DmitryPogrebnoy/distilbert-base-russian-cased')
FINETUNED_MODEL_NAME = os.getenv('FINETUNED_MODEL_NAME', 'finetuned_model')

In [4]:
label_config = '''<View style="display:flex;align-items:start;gap:8px;flex-direction:row">
                   <Text name="text" value="$clearText" granularity="word"/>
                   <Labels name="label" toName="text" showInline="false">
                    <Label value="DIR" background="#4824f9"/>
                    <Label value="SLD" background="#00ff1e"/>
                    <Label value="WP" background="#ff0000"/>
                  	<Label value="LOC" background="#57fff4"/>
                    <Label value="UNIT" background="green"/>
                    <Label value="COUNT" background="#000000"/>
                    <Label value="FREE" background="#0008ff"/>
                    <Label value="LOST" background="#ff0000"/>
                    <Label value="CAPT" background="#ffbb00"/>
                  </Labels>
                </View> '''
project_id = 1

In [5]:
def reload_model():
    global _model
    _model = None
    try:
        chk_path = str(pathlib.Path(MODEL_DIR) / FINETUNED_MODEL_NAME)
        logger.info(f"Loading finetuned model from {chk_path}")
        _model = pipeline("ner", model=chk_path, tokenizer=chk_path)
    except:
        # if finetuned model is not available, use the baseline model with the original labels
        logger.info(f"Loading baseline model {BASELINE_MODEL_NAME}")
        _model = pipeline("ner", model=BASELINE_MODEL_NAME, tokenizer=BASELINE_MODEL_NAME)

In [6]:
reload_model()

In [7]:
class HuggingFaceNER(LabelStudioMLBase):
    """Custom ML Backend model
    """
    LABEL_STUDIO_HOST = os.getenv('LABEL_STUDIO_HOST', 'http://localhost:8080')
    LABEL_STUDIO_API_KEY = os.getenv('LABEL_STUDIO_API_KEY', 'c35a2f5689358d1e9d7522309643ba5b9cfca062')
    START_TRAINING_EACH_N_UPDATES = int(os.getenv('START_TRAINING_EACH_N_UPDATES', 10))
    LEARNING_RATE = float(os.getenv('LEARNING_RATE', 1e-3))
    NUM_TRAIN_EPOCHS = int(os.getenv('NUM_TRAIN_EPOCHS', 10))
    WEIGHT_DECAY = float(os.getenv('WEIGHT_DECAY', 0.01))

    def get_labels(self):
        li = self.label_interface
        from_name, _, _ = li.get_first_tag_occurence('Labels', 'Text')
        tag = li.get_tag(from_name)
        return tag.labels
    
    def _get_tasks(self, project_id):
        # download annotated tasks from Label Studio
        ls = label_studio_sdk.Client(self.LABEL_STUDIO_HOST, self.LABEL_STUDIO_API_KEY)
        project = ls.get_project(id=project_id)
        tasks = project.get_labeled_tasks()
        return tasks

In [8]:
ner =  HuggingFaceNER (project_id = project_id, label_config = label_config)

In [9]:
ner.get_labels()

['DIR', 'SLD', 'WP', 'LOC', 'UNIT', 'COUNT', 'FREE', 'LOST', 'CAPT']

In [10]:
tasks = ner._get_tasks(1)

In [11]:
 # we need to convert Label Studio NER annotations to hugingface NER format in datasets
        # for example:
        # {'id': '0',
        #  'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0],
        #  'tokens': ['@paulwalk', 'It', "'s", 'the', 'view', 'from', 'where', 'I', "'m", 'living', 'for', 'two', 'weeks', '.', 'Empire', 'State', 'Building', '=', 'ESB', '.', 'Pretty', 'bad', 'storm', 'here', 'last', 'evening', '.']

In [12]:
ds_raw = []
from_name, to_name, value = ner.label_interface.get_first_tag_occurence('Labels', 'Text')

In [13]:
tokenizer = AutoTokenizer.from_pretrained(BASELINE_MODEL_NAME)



In [14]:
no_label = 'O'
label_to_id = {no_label: 0}

In [15]:
task = tasks[0]

In [16]:

#task

In [17]:
annotation = task['annotations'][0]

In [18]:
#annotation['result']

In [19]:
# фильтруем результат - что бы была только разметака (value)
list_values = list(filter(lambda r: r['type'] == 'labels', annotation['result']))

In [20]:
# Метка, начало, конец.
spans = [{'label': r['value']['labels'][0], 'start': r['value']['start'], 'end': r['value']['end']} for r in list_values]

In [21]:
#spans

In [22]:
spans = sorted(spans, key=lambda x: x['start'])

In [23]:
# Получаем текст
text = ner.preload_task_data(task, task['data'][value])

In [24]:
#text

In [25]:
 # insert tokenizer.pad_token to the unlabeled chunks of the text in-between the labeled spans, as well as to the beginning and end of the text
last_end = 0
all_spans = []
for span in spans:
    if last_end < span['start']:
        all_spans.append({'label': no_label, 'start': last_end, 'end': span['start']})
    all_spans.append(span)
    last_end = span['end']
if last_end < len(text):
    all_spans.append({'label': no_label, 'start': last_end, 'end': len(text)})

In [26]:
 # now tokenize chunks separately and add them to the dataset
item = {'id': task['id'], 'tokens': [], 'ner_tags': []}
for span in all_spans:
    tokens = tokenizer.tokenize(text[span['start']:span['end']])
    item['tokens'].extend(tokens)
    if span['label'] == no_label:
        item['ner_tags'].extend([label_to_id[no_label]] * len(tokens))
    else:
        label = 'B-' + span['label']
        if label not in label_to_id:
            label_to_id[label] = len(label_to_id)
        item['ner_tags'].append(label_to_id[label])
        if len(tokens) > 1:
            label = 'I-' + span['label']
            if label not in label_to_id:
                label_to_id[label] = len(label_to_id)
            item['ner_tags'].extend([label_to_id[label] for _ in range(1, len(tokens))])

In [27]:
#item

In [28]:
ds_raw = []
for task in tasks:
    for annotation in task['annotations']:
        if not annotation.get('result'):
            continue
        # фильтруем результат - что бы была только разметака (value)
        list_values = list(filter(lambda r: r['type'] == 'labels', annotation['result']))
        spans = [{'label': r['value']['labels'][0], 'start': r['value']['start'], 'end': r['value']['end']} for r in list_values]
        spans = sorted(spans, key=lambda x: x['start'])
        text = ner.preload_task_data(task, task['data'][value])
        #text = text[:512]
        # insert tokenizer.pad_token to the unlabeled chunks of the text in-between the labeled spans, as well as to the beginning and end of the text
        last_end = 0
        all_spans = []
        for span in spans:
            if last_end < span['start']:
                all_spans.append({'label': no_label, 'start': last_end, 'end': span['start']})
            all_spans.append(span)
            last_end = span['end']
        if last_end < len(text):
            all_spans.append({'label': no_label, 'start': last_end, 'end': len(text)})
    
        # now tokenize chunks separately and add them to the dataset
        item = {'id': task['id'], 'tokens': [], 'ner_tags': []}
        for span in all_spans:
            tokens = tokenizer.tokenize(text[span['start']:span['end']])
            tokens_ids = tokenizer.encode(text[span['start']:span['end']], add_special_tokens=False)
            item['tokens'].extend(tokens)
            if span['label'] == no_label:
                item['ner_tags'].extend([label_to_id[no_label]] * len(tokens))
            else:
                label = 'B-' + span['label']
                if label not in label_to_id:
                    label_to_id[label] = len(label_to_id)
                item['ner_tags'].append(label_to_id[label])
                if len(tokens) > 1:
                    label = 'I-' + span['label']
                    if label not in label_to_id:
                        label_to_id[label] = len(label_to_id)
                    item['ner_tags'].extend([label_to_id[label] for _ in range(1, len(tokens))])
        ds_raw.append(item)


Token indices sequence length is longer than the specified maximum sequence length for this model (679 > 512). Running this sequence through the model will result in indexing errors


In [29]:
len(ds_raw[0]["tokens"])

978

In [30]:
# convert to huggingface dataset
# Define the features of your dataset
features = Features({
    'id': Value('string'),
    'tokens': Sequence(Value('string')),
    'ner_tags': Sequence(ClassLabel(names=list(label_to_id.keys())))
})

In [31]:
 def tokenize_and_align_labels( examples, tokenizer):
    """
    From example https://huggingface.co/docs/transformers/en/tasks/token_classification#preprocess
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [32]:
hf_dataset = Dataset.from_list(ds_raw, features=features)

In [33]:
tokenized_dataset = hf_dataset.map(partial(tokenize_and_align_labels, tokenizer=tokenizer), batched=True)

Map:   0%|          | 0/164 [00:00<?, ? examples/s]

In [34]:
tokenized_dataset

Dataset({
    features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 164
})

In [35]:
#tokenized_dataset[0]["ner_tags"]

In [36]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
id_to_label = {i: label for label, i in label_to_id.items()}

In [37]:
data_collator

DataCollatorForTokenClassification(tokenizer=DistilBertTokenizerFast(name_or_path='DmitryPogrebnoy/distilbert-base-russian-cased', vocab_size=13982, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, padding=True, max_length=None, pad_to_multiple_

In [38]:
id_to_label

{0: 'O',
 1: 'B-DIR',
 2: 'I-DIR',
 3: 'B-LOC',
 4: 'I-LOC',
 5: 'B-COUNT',
 6: 'B-SLD',
 7: 'I-SLD',
 8: 'B-WP',
 9: 'I-WP',
 10: 'B-UNIT',
 11: 'I-UNIT',
 12: 'I-COUNT',
 13: 'B-FREE',
 14: 'I-FREE',
 15: 'B-CAPT',
 16: 'I-CAPT'}

In [39]:
label_to_id

{'O': 0,
 'B-DIR': 1,
 'I-DIR': 2,
 'B-LOC': 3,
 'I-LOC': 4,
 'B-COUNT': 5,
 'B-SLD': 6,
 'I-SLD': 7,
 'B-WP': 8,
 'I-WP': 9,
 'B-UNIT': 10,
 'I-UNIT': 11,
 'I-COUNT': 12,
 'B-FREE': 13,
 'I-FREE': 14,
 'B-CAPT': 15,
 'I-CAPT': 16}

In [40]:
model = AutoModelForTokenClassification.from_pretrained(
            BASELINE_MODEL_NAME, num_labels=len(id_to_label),
            id2label=id_to_label, label2id=label_to_id)

# model = AutoModelForTokenClassification.from_pretrained(
#             BASELINE_MODEL_NAME, 
#             id2label=id_to_label, label2id=label_to_id)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at DmitryPogrebnoy/distilbert-base-russian-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
 training_args = TrainingArguments(
    output_dir=str(pathlib.Path(MODEL_DIR) / FINETUNED_MODEL_NAME),
    learning_rate=ner.LEARNING_RATE,
    per_device_train_batch_size=8,
    num_train_epochs=ner.NUM_TRAIN_EPOCHS,
    weight_decay=ner.WEIGHT_DECAY,
    evaluation_strategy="epoch",
)


In [42]:
#pip install transformers==4.39.1

In [43]:
#pip install accelerate==0.27.2

In [44]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    
)
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,1.005733
2,No log,1.004756
3,No log,1.00607
4,No log,1.013386
5,No log,1.002655
6,No log,1.014303
7,No log,1.005023
8,No log,1.002125
9,No log,1.001972
10,No log,1.002353


TrainOutput(global_step=210, training_loss=1.0811178298223587, metrics={'train_runtime': 215.3412, 'train_samples_per_second': 7.616, 'train_steps_per_second': 0.975, 'total_flos': 214329198305280.0, 'train_loss': 1.0811178298223587, 'epoch': 10.0})

In [45]:
chk_path = str(pathlib.Path(MODEL_DIR) / FINETUNED_MODEL_NAME)
logger.info(f"Model is trained and saved as {chk_path}")
trainer.save_model(chk_path)

In [46]:
chk_path

'results\\finetuned_model'

In [47]:
reload_model()

In [48]:
li = ner.label_interface
from_name, to_name, value = li.get_first_tag_occurence('Labels', 'Text')
texts = [ner.preload_task_data(task, task['data'][value]) for task in tasks]

In [49]:
text = task['data'][value]

In [50]:
text

'Сводка Министерства обороны Российской Федерации о ходе проведения специальной военной операциипо состоянию на 30 июня 2024 г.Часть 2  Подразделения группировки войск Восток заняли более выгодные рубежи и нанесли поражение живой силе и технике 115-й, 123-й, 128-й бригад теробороны в районах населенных пунктов Ровнополь, Времевка, Новоукраинка и Великая Новоселка Донецкой Народной Республики. Отражена контратака штурмовой группы 123-й бригады теробороны противника.ВСУ потеряли до 130 военнослужащих, две боевые бронированные машины, семь автомобилей, 155-мм гаубицу FH-70 производства Великобритании, 155-мм гаубицу М198 производства США, две 122-мм гаубицы Д-30.Уничтожены два полевых склада боеприпасов и склад военно-технического имущества ВСУ. Подразделениями группировки войск Днепр нанесено поражение живой силе и технике 35-й бригады морской пехоты, 121-й бригады теробороны и 22-й бригады нацгвардии в районах населенных пунктов Ильинка Днепропетровской области, Золотая Балка, Ольговка 

In [51]:
model_predictions = _model(text)

In [52]:
model_predictions

[]

In [121]:
predictions = []
for prediction in model_predictions:
    # prediction returned in the format: [{'entity': 'B-ORG', 'score': 0.999, 'index': 1, 'start': 0, 'end': 7, 'word': 'Google'}, ...]
    # we need to group them by 'B-' and 'I-' prefixes to form entities
    results = []
    avg_score = 0
    for label, group in groupby(prediction, key=lambda x: re.sub(r'^[BI]-', '', x['entity'])):
        entities = list(group)
        start = entities[0]['start']
        end = entities[-1]['end']
        score = float(sum([entity['score'] for entity in entities]) / len(entities))
        results.append({
            'from_name': from_name,
            'to_name': to_name,
            'type': 'labels',
            'value': {
                'start': start,
                'end': end,
                'labels': [label]
            },
            'score': score
        })
        avg_score += score
    if results:
        predictions.append({
            'result': results,
            'score': avg_score / len(results),
            'model_version': ner.get('model_version')
        })

In [130]:
model

DistilBertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(13982, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
   

In [133]:
id_to_label

{0: 'O',
 1: 'B-DIR',
 2: 'I-DIR',
 3: 'B-LOC',
 4: 'I-LOC',
 5: 'B-COUNT',
 6: 'B-SLD',
 7: 'I-SLD',
 8: 'B-WP',
 9: 'I-WP',
 10: 'B-UNIT',
 11: 'I-UNIT',
 12: 'I-COUNT',
 13: 'B-FREE',
 14: 'I-FREE',
 15: 'B-CAPT',
 16: 'I-CAPT'}

In [112]:
label_to_id

{'O': 0,
 'B-DIR': 1,
 'I-DIR': 2,
 'B-LOC': 3,
 'I-LOC': 4,
 'B-COUNT': 5,
 'B-SLD': 6,
 'I-SLD': 7,
 'B-WP': 8,
 'I-WP': 9,
 'B-UNIT': 10,
 'I-UNIT': 11}

In [40]:
for task in tasks:
    print(task)
    # for annotation in task['annotations']:
    #     print (annannotation)
        #if not annotation.get('result'):
            

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [40]:
#dataset = load_dataset("json", data_files="RMDR_ANATATION_3_MONTH.json")

In [None]:
ner.

In [46]:
pip



Usage:   
  C:\ProgramData\anaconda3\python.exe -m pip <command> [options]

Commands:
  install                     Install packages.
  download                    Download packages.
  uninstall                   Uninstall packages.
  freeze                      Output installed packages in requirements format.
  inspect                     Inspect the python environment.
  list                        List installed packages.
  show                        Show information about installed packages.
  check                       Verify installed packages have compatible dependencies.
  config                      Manage local and global configuration.
  search                      Search PyPI for packages.
  cache                       Inspect and manage pip's wheel cache.
  index                       Inspect information available from package indexes.
  wheel                       Build wheels from your requirements.
  hash                        Compute hashes of package archives.
  

In [41]:
with open('RMDR_ANATATION_3_MONTH.json',  encoding="utf8") as f:
    messages = json.load(f)
    

In [42]:
random.shuffle(messages)

In [43]:
len(messages)

164

In [4]:
#messages[0]

In [32]:
lst = list(map(lambda msg: {"id": msg["id"], "date": msg["date"], "clearText": msg["clearText"], "label": msg["label"] }, messages))

In [5]:
#lst[0]

In [34]:
#data_df = pd.read_json('RMDR_ANATATION_3_MONTH.json', lines=True)

In [35]:
df = pd.DataFrame(lst)

In [36]:
df

Unnamed: 0,id,date,clearText,label
0,1602,2024-06-13T13:51:35,Сводка Министерства обороны Российской Федерац...,"[{'start': 259, 'end': 264, 'text': 'Север', '..."
1,1586,2024-06-05T13:11:37,Сводка Министерства обороны Российской Федерац...,"[{'start': 166, 'end': 172, 'text': 'Восток', ..."
2,45,2022-07-07T11:51:51,Сводка Министерства обороны Российской Федера...,"[{'start': 272, 'end': 275, 'text': '142', 'la..."
3,1580,2024-06-02T11:55:23,Сводка Министерства обороны Российской Федерац...,"[{'start': 255, 'end': 260, 'text': 'Север', '..."
4,44,2022-07-07T11:51:48,Сводка Министерства обороны Российской Федера...,"[{'start': 474, 'end': 483, 'text': '2,5 тысяч..."
...,...,...,...,...
159,865,2023-06-26T14:57:45,Сводка Министерства обороны Российской Федера...,"[{'start': 264, 'end': 266, 'text': '83', 'lab..."
160,1626,2024-06-24T13:12:34,Сводка Министерства обороны Российской Федерац...,"[{'start': 258, 'end': 263, 'text': 'Север', '..."
161,1627,2024-06-24T13:12:34,Сводка Министерства обороны Российской Федерац...,"[{'start': 167, 'end': 173, 'text': 'Восток', ..."
162,1639,2024-06-30T13:14:00,Сводка Министерства обороны Российской Федерац...,"[{'start': 167, 'end': 173, 'text': 'Восток', ..."
