### Convert the Label Studio export into a spaCy-compatible format

In [34]:
import json

In [35]:
# Label names carried along in the converted payload.
# NOTE(review): presumably the entity labels used in the Label Studio project;
# nothing here validates the export against this list — confirm.
classes = ["Shape", "ShapeSubType", "Unit", "Dimension"]

# Container for the converted data: the label list plus one entry per example,
# filled in from the export further down.
converted = dict(classes=classes, annotations=[])

In [36]:
# Path to the Label Studio export that is converted below.
input_file = "labelstudio_export.json"

# Pin the encoding: without it open() falls back to the platform's locale
# default (e.g. cp1252 on Windows), which can fail on or mis-decode non-ASCII
# text and thereby shift the character offsets of the labelled entities.
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

In [37]:
# Rebuild the annotation list from scratch rather than appending to it, so that
# re-running this cell does not duplicate every example in `converted`.
# Each entry has the shape [text, {"entities": [[start, end, label], ...]}].
# NOTE(review): assumes every exported record keeps its text under "value" and
# its labelled spans under "lbl" — confirm against the labeling config.
converted["annotations"] = [
    [
        item["value"],
        {
            "entities": [
                # Only the first label attached to a span is kept.
                [lbl["start"], lbl["end"], lbl["labels"][0]]
                for lbl in item["lbl"]
            ]
        },
    ]
    for item in data
]

### Serialize the converted annotations to spaCy's binary `.spacy` (DocBin) format

In [38]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

In [39]:
def convert(path, dataset):
    """Serialise annotated texts into a spaCy ``DocBin`` file.

    Args:
        path: Destination of the serialised ``DocBin`` (e.g. ``"train.spacy"``).
        dataset: Iterable of ``(text, annot)`` pairs, where
            ``annot["entities"]`` yields ``(start, end, label)`` triples given
            as character offsets into ``text``.

    Returns:
        None. The result is written to ``path``.

    Entities for which ``Doc.char_span`` finds no token span are left out and
    reported with their label, offsets and text, so the offending annotation
    can be located and corrected.
    """
    nlp = spacy.blank("en")  # blank English pipeline: used for tokenisation only
    db = DocBin()
    for text, annot in tqdm(dataset):
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annot["entities"]:
            # "contract" keeps only the tokens that lie completely inside the
            # character range; None means no token span could be built.
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                # Say which annotation was dropped; a bare "Skipping entity"
                # gives no way to find it in the export.
                print(
                    f"Skipping entity {label!r} at [{start}:{end}] "
                    f"{text[start:end]!r}: no token-aligned span"
                )
            else:
                ents.append(span)
        doc.ents = ents
        db.add(doc)
    db.to_disk(path)

In [40]:
# The first converted example is the training set; everything after it is
# held out as the dev set.
TRAIN_DATA, DEV_DATA = (
    converted["annotations"][:1],
    converted["annotations"][1:],
)

# Write each split to its own DocBin file, train first and then dev.
for out_path, examples in (("train.spacy", TRAIN_DATA), ("dev.spacy", DEV_DATA)):
    convert(out_path, examples)

100%|██████████| 1/1 [00:00<00:00, 929.79it/s]
100%|██████████| 1/1 [00:00<00:00, 1806.33it/s]


### Create the config file

In [41]:
# Generate a training config (config.cfg) for an English NER-only pipeline,
# optimised for efficiency and targeting GPU; --force overwrites any existing
# config.cfg.
!python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency --gpu --force

[38;5;4m[i] Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: GPU
- Transformer: roberta-base
[38;5;2m[+] Auto-filled config with all values[0m
[38;5;2m[+] Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


### Train the model

In [2]:
# Train from config.cfg using train.spacy / dev.spacy as the training and dev
# corpora, on GPU 0; the output is written under ./output.
!python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy --gpu-id 0 --output ./output

^C
