In [2]:
import json
from pathlib import Path

import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

In [12]:
# Expand base_config.cfg into a complete training config, saved as config.cfg,
# with every unspecified setting auto-filled by spaCy.
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [3]:
# Load the annotated examples. The context manager closes the file handle, and
# the explicit encoding keeps decoding independent of the OS locale (the entity
# annotations are character offsets, so a mis-decoded text would shift them).
# NOTE(review): assumes the annotation file is UTF-8 encoded — confirm.
with open('annotations_learn.json', 'r', encoding='utf-8') as annotations_file:
    ds_data = json.load(annotations_file)
len(ds_data)

26

In [4]:
# Define a function to create spaCy DocBin objects from the annotated data
def get_spacy_doc(file, data):
  """Convert annotated examples into a spaCy ``DocBin``.

  Args:
    file: Writable text handle. One line is written for every annotation
      that could not be turned into a span and for every document that could
      not be stored.
    data: Iterable of ``(text, annot)`` pairs, where ``annot['entities']`` is
      a sequence of ``(start, end, label)`` character-offset triples.

  Returns:
    A ``DocBin`` containing one ``Doc`` per example whose entities could be
    assigned.
  """
  # Create a blank spaCy pipeline
  nlp = spacy.blank('ru')
  db = DocBin()

  # Iterate through the data
  for text, annot in tqdm(data):
    doc = nlp.make_doc(text)

    ents = []
    # Character offsets already claimed by an earlier annotation of this text.
    # A set keeps the overlap test cheap instead of scanning a growing list.
    taken_offsets = set()

    # Extract entities from the annotations
    for start, end, label in annot['entities']:
      offsets = range(start, end)
      # Skip annotations that overlap one seen earlier in the same text.
      if not taken_offsets.isdisjoint(offsets):
        continue

      # Offsets are claimed before alignment is attempted, so an annotation
      # that is rejected below still blocks later overlapping annotations.
      taken_offsets.update(offsets)

      try:
        span = doc.char_span(start, end, label=label, alignment_mode='strict')
      except Exception as exc:
        # Best effort: record the failure and move on to the next annotation.
        # Catching Exception (not a bare except) lets Ctrl-C still interrupt.
        file.write(str([start, end]) + "    " + str(text) + "    " + repr(exc) + "\n")
        continue

      if span is None:
        # Log errors for annotations that couldn't be processed
        err_data = str([start, end]) + "    " + str(text) + "\n"
        file.write(err_data)
      else:
        ents.append(span)

    try:
      doc.ents = ents
      db.add(doc)
    except Exception as exc:
      # The document is still left out of the DocBin, but no longer silently.
      file.write("doc skipped: " + repr(exc) + "    " + str(text) + "\n")

  return db

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the annotated examples. The fixed seed makes the shuffle
# reproducible, so the serialized splits and the training scores do not change
# from one run of the notebook to the next.
train, test = train_test_split(ds_data, test_size=0.2, random_state=42)

In [72]:
# Make sure the output directory exists so this cell also runs on a fresh checkout.
Path('trained').mkdir(parents=True, exist_ok=True)

# Conversion problems for both splits go to a single log file. The explicit
# encoding keeps writing the logged texts independent of the OS locale.
with open('trained/train_file.txt', 'w', encoding='utf-8') as wfile:
    db = get_spacy_doc(wfile, train)
    db.to_disk('trained/train_data.spacy')
    db = get_spacy_doc(wfile, test)
    db.to_disk('trained/test_data.spacy')

100%|██████████| 20/20 [00:00<00:00, 689.65it/s]
100%|██████████| 6/6 [00:00<00:00, 597.22it/s]


In [1]:
from transformers.tokenization_utils import BatchEncoding

In [5]:
# Train the pipeline described by config.cfg on the serialized training DocBin,
# on GPU 0, saving the output under trained/.
# NOTE(review): the held-out split (test_data.spacy) is passed as --paths.dev, so the
# scores printed during training are measured on it rather than on a separate dev set.
!python -m spacy train config.cfg --paths.train trained/train_data.spacy  --paths.dev trained/test_data.spacy --gpu-id 0 --output trained

[38;5;4mℹ Saving to output directory: trained[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  -------------  --------  ------  ------  ------  ------
  0       0        2122.45    771.30    0.00    0.00    0.00    0.00
100     200       27384.31  44104.22   75.47   73.17   77.92    0.75
200     400          24.58  16274.54   79.75   77.78   81.82    0.80
300     600          34.09  15720.07   79.25   76.83   81.82    0.79
400     800           9.39  15059.79   76.00   78.08   74.03    0.76
500    1000           0.00  14364.88   76.00   78.08   74.03    0.76
600    1200           0.00  13644.78   77.12   77.63   76.62    0.77
700    1400           0.00  12849.41   78.21   77.22   79.22    0.78
800    1600           0.00  11913.47   72.60   76.81   68.83    0.73
900    1800           0.00  1


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]
tokenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<?, ?B/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]
config.json: 100%|██████████| 625/625 [00:00<?, ?B/s] 

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]
vocab.txt: 100%|██████████| 872k/872k [00:00<00:00, 1.82MB/s]
vocab.txt: 100%|██████████| 872k/872k [00:00<00:00, 1.82MB/s]

tokenizer.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]
tokenizer.json: 100%|██████████| 1.72M/1.72M [00:00<00:00, 2.62MB/s]
tokenizer.json: 100%|██████████| 1.72M/1.72M [00:00<00:00, 2.61MB/s]

model.safetensors:   0%|          | 0.00/672M [00:00<?, ?B/s]
model.safetensors:   2%|▏         | 10.5M/672M [00