### Import the necessary libraries.

In [18]:
import spacy
import pathlib
import json
import os
from spacy.tokens import DocBin
from spacy.util import filter_spans
import re

#### Load the JSON data

In [20]:
# Load every annotated resume (*.json) found in ./ResumesJsonAnnotated
# (relative to the current working directory) into the `data` list.
root_dir = os.getcwd()
directory_path = os.path.join(root_dir, "ResumesJsonAnnotated")

data = []
# os.listdir returns entries in arbitrary order; sorting makes the order of
# `data` identical on every run and on every platform.
for filename in sorted(os.listdir(directory_path)):
    if filename.endswith(".json"):
        # errors="replace" substitutes a replacement character for bytes that
        # are not valid UTF-8 instead of raising, so such files still load.
        with open(os.path.join(directory_path, filename), "r", encoding="utf-8", errors="replace") as file:
            data.append(json.load(file))

_SURROGATE_PATTERN = re.compile(r'[\ud800-\udfff]')


def surrogate_remover(text):
    """Return *text* with every code point in U+D800 through U+DFFF deleted.

    Characters outside that range are left untouched; the result is shorter
    than the input by exactly the number of surrogates removed.
    """
    return _SURROGATE_PATTERN.sub('', text)

# Reshape each loaded record into a (text, {"entities": [(start, end, label), ...]})
# pair, with surrogate code points stripped from the text.
# NOTE(review): surrogate_remover deletes characters, so annotation offsets that
# point past a removed character may no longer line up with the cleaned text --
# confirm against the annotation tool's offset basis.
formatted_data = []
for record in data:
    cleaned_text = surrogate_remover(record['text'])
    # Unpacking into three names rejects annotations that do not have exactly
    # three fields and normalizes each one to a tuple.
    entity_offsets = [
        (begin, stop, tag) for begin, stop, tag in record["annotations"]
    ]
    formatted_data.append((cleaned_text, {"entities": entity_offsets}))

# Tokenize every text with a blank English pipeline, attach its entity spans,
# and serialize all resulting docs to "train.spacy".
nlp = spacy.blank("en")
doc_bin = DocBin()

for raw_text, meta in formatted_data:
    doc = nlp.make_doc(raw_text)
    # Lazily map each (start, end, label) triple to a span on this doc.
    candidate_spans = (
        doc.char_span(begin, stop, label=tag)
        for begin, stop, tag in meta["entities"]
    )
    # Falsy results from char_span (e.g. None) are dropped silently, so such
    # annotations never reach the training data.
    usable_spans = [span for span in candidate_spans if span]
    # filter_spans runs before assignment -- presumably to resolve overlapping
    # spans, which doc.ents would otherwise reject; see the spaCy docs.
    doc.ents = filter_spans(usable_spans)
    doc_bin.add(doc)

doc_bin.to_disk("train.spacy")