In [1]:
# Download the large Russian spaCy pipeline (ru_core_news_lg); -q keeps pip quiet.
# As the command output warns, the runtime may need a restart before the
# freshly installed package can be imported.
!python3 -m spacy download ru_core_news_lg -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m513.4/513.4 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.2/53.2 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('ru_core_news_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


# Imports

In [6]:
import json
import random, spacy
from spacy.training import Example
import ru_core_news_lg
import math
import warnings
warnings.filterwarnings("ignore")

# Functions

In [7]:
def read_file(file_path):
    """Load a JSON Lines file into a list.

    Args:
        file_path: path of a UTF-8 encoded file holding one JSON document
            per line.

    Returns:
        A list with the decoded value of every line, in file order.

    Raises:
        json.JSONDecodeError: if a line (including a blank one) is not
            valid JSON.
    """
    records = []
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            records.append(json.loads(raw_line))
    return records

# greedy filter that drops entities overlapping an earlier, already kept entity
def update_entities(ents):
    """Return the entities of ``ents`` that do not overlap an earlier kept one.

    Entities are visited in the given order, so earlier entries win.
    An entity occupies every index from ``ent[0]`` to ``ent[1]``, with
    BOTH bounds treated as inclusive.

    Args:
        ents: iterable of indexable items whose first two elements are the
            integer start and end of the span (further elements, such as
            a label, are carried along untouched).

    Returns:
        A new list holding the kept entities in their original order.
    """
    taken = set()
    kept = []

    for ent in ents:
        # every index this entity occupies (end index included)
        span = range(ent[0], ent[1] + 1)
        if taken.isdisjoint(span):
            # nothing in this span was claimed before: keep it and claim the span
            kept.append(ent)
            taken.update(span)

    return kept

# Prepare input data

In [8]:
train = read_file("/content/drive/MyDrive/Colab Notebooks/Assignment 3/test/train.jsonl")

# prepare the data in the format needed for model training:
# a list of (text, {"entities": [(start, end_exclusive, label), ...]}) tuples
prepared_data = []
for data in train:
    # shortest spans first, so when spans overlap the shorter one is kept
    sorted_ners = sorted(data['ners'], key=lambda x: x[1]-x[0])
    # remove overlapping entities while the end offsets are still inclusive:
    # update_entities treats both bounds as inclusive, so filtering after the
    # +1 shift below would also throw away entities that merely touch
    kept_ners = update_entities(sorted_ners)
    # spaCy expects character offsets with an exclusive end
    entities = [(ner[0], ner[1]+1, ner[2]) for ner in kept_ners]
    prepared_data.append((data['sentences'], {"entities": entities}))

# Train the model

In [9]:
nlp = ru_core_news_lg.load()

# make sure the NER component knows every label used in the training data
ner_pipe = nlp.get_pipe("ner")
train_labels = sorted({label for _, annotations in prepared_data
                       for _, _, label in annotations["entities"]})
for label in train_labels:
    ner_pipe.add_label(label)

# resume_training() returns an optimizer and keeps the pretrained weights;
# nlp.initialize() would re-initialize the loaded pipeline from scratch
optimizer = nlp.resume_training()

n_examples = len(prepared_data)
for itn in range(70):
    # shuffle the data for better generalization
    random.shuffle(prepared_data)
    # count from 1 so the bar reaches 100% on the last example of an epoch
    for i, (text, offsets) in enumerate(prepared_data, start=1):

        # compute some stuff for a beautiful progress bar
        progress = i / n_examples * 100
        filled_length = math.ceil(progress * 50 / 100)
        print(f'\rEpoch {itn}: [{"█" * filled_length}{"░" * (50 - filled_length)}] {progress:.2f}%', end='')

        # update the model with a single-example batch
        nlp.update([Example.from_dict(nlp.make_doc(text), offsets)], sgd=optimizer)
    print()

# persist the fine-tuned pipeline to the local "model" directory
nlp.to_disk("model")

Epoch 0: [██████████████████████████████████████████████████] 99.81%
Epoch 1: [██████████████████████████████████████████████████] 99.81%
Epoch 2: [██████████████████████████████████████████████████] 99.81%
Epoch 3: [██████████████████████████████████████████████████] 99.81%
Epoch 4: [██████████████████████████████████████████████████] 99.81%
Epoch 5: [██████████████████████████████████████████████████] 99.81%
Epoch 6: [██████████████████████████████████████████████████] 99.81%
Epoch 7: [██████████████████████████████████████████████████] 99.81%
Epoch 8: [██████████████████████████████████████████████████] 99.81%
Epoch 9: [██████████████████████████████████████████████████] 99.81%
Epoch 10: [██████████████████████████████████████████████████] 99.81%
Epoch 11: [██████████████████████████████████████████████████] 99.81%
Epoch 12: [██████████████████████████████████████████████████] 99.81%
Epoch 13: [██████████████████████████████████████████████████] 99.81%
Epoch 14: [███████████████████

# Test

In [10]:
# get the set of ners in the training example
set_of_ners = set(n[2] for item in train for n in item["ners"])
text = read_file('/content/drive/MyDrive/Colab Notebooks/Assignment 3/test/test.jsonl')

# get the ners from the test text
text_ners = []
for data in text:
    # NOTE(review): the original code read the misspelled key 'senences' while
    # the training records use 'sentences'; accept either spelling so the cell
    # works whichever one the test file carries — confirm against the data
    sentence = data['sentences'] if 'sentences' in data else data['senences']
    # keep only labels seen in training; end offsets are converted back to
    # inclusive (end_char - 1) to mirror the format of the training 'ners'
    ners = [[e.start_char, e.end_char-1, e.label_]
            for e in nlp(sentence).ents if e.label_ in set_of_ners]
    text_ners.append({'ners': ners, 'id': data['id']})

# Save

In [11]:
# create the file and its zip
with open('/content/drive/MyDrive/Colab Notebooks/Assignment 3/output_spacy/test.jsonl', 'w') as outfile:
    for entry in text_ners:
        json.dump(entry, outfile)
        outfile.write('\n')

# for convenient folder structure in colab
!zip -r /content/drive/MyDrive/Colab\ Notebooks/Assignment\ 3/output_spacy/test.zip /content/drive/MyDrive/Colab\ Notebooks/Assignment\ 3/output_spacy/test.jsonl

updating: content/drive/MyDrive/Colab Notebooks/Assignment 3/output_spacy/test.jsonl (deflated 76%)


In [13]:
# for submission: pack test.jsonl into test.zip in the current working directory
# NOTE(review): no visible cell writes test.jsonl to the working directory (the
# save cell writes to the Drive folder) — presumably it was copied by hand; confirm
!zip test test.jsonl

  adding: test.jsonl (deflated 76%)
