In [18]:
import spacy
import pathlib
import json
import os
from spacy.tokens import DocBin
from spacy.util import filter_spans
import re

#### Load the JSON data

In [None]:
# Annotated resumes are expected in a "ResumesJsonAnnotated" folder under the
# current working directory.
root_dir = os.getcwd()
directory_path = os.path.join(root_dir, "ResumesJsonAnnotated")

# os.listdir() yields entries in arbitrary, filesystem-dependent order. The
# train/dev split later in this notebook slices the loaded examples by
# position, so the filenames are sorted here to keep that split reproducible
# across runs and machines.
data = []
for filename in sorted(os.listdir(directory_path)):
    if filename.endswith(".json"):
        # errors="replace" makes undecodable bytes decode to U+FFFD instead of raising.
        with open(os.path.join(directory_path, filename), "r", encoding="utf-8", errors="replace") as file:
            data.append(json.load(file))

# Matches any single code point in the UTF-16 surrogate range U+D800..U+DFFF.
_SURROGATE_PATTERN = re.compile(r'[\ud800-\udfff]')


def surrogate_remover(text):
    """Return ``text`` with every code point in U+D800..U+DFFF deleted.

    Args:
        text: The string to clean.

    Returns:
        A new string containing all other characters of ``text`` in their
        original order.

    NOTE(review): deleting characters shortens the string, so character
    offsets computed against the uncleaned text would no longer line up
    after a removal -- confirm which version the annotation offsets refer to.
    """
    return _SURROGATE_PATTERN.sub('', text)

# Reshape every loaded record into a (text, {"entities": [...]}) pair, where
# each entity is a (start, end, label) triple taken from record["annotations"].
# NOTE(review): the text is cleaned with surrogate_remover() but the offsets are
# passed through unchanged -- confirm they were computed against cleaned text.
formatted_data = []
for record in data:
    cleaned_text = surrogate_remover(record['text'])
    entity_triples = [
        (start, end, label) for start, end, label in record["annotations"]
    ]
    formatted_data.append((cleaned_text, {"entities": entity_triples}))

# Positional split: the leading 80% of the examples train, the rest evaluate.
split_idx = int(len(formatted_data) * 0.8)
train_data, dev_data = formatted_data[:split_idx], formatted_data[split_idx:]

print(f"Total examples: {len(formatted_data)}")
print(f"Training examples: {len(train_data)}")
print(f"Dev examples: {len(dev_data)}")

def _build_doc_bin(nlp, examples):
    """Convert ``(text, {"entities": [...]})`` pairs into a ``DocBin``.

    Args:
        nlp: Pipeline object; only ``nlp.make_doc`` is used, to tokenize text.
        examples: Iterable of ``(text, annotation)`` pairs in which
            ``annotation["entities"]`` yields ``(start, end, label)`` triples
            of character offsets into ``text``.

    Returns:
        A ``(doc_bin, n_unaligned, n_filtered)`` tuple. ``n_unaligned`` counts
        annotations for which ``doc.char_span`` produced no usable span;
        ``n_filtered`` counts spans that ``filter_spans`` subsequently removed.
    """
    doc_bin = DocBin()
    n_unaligned = 0
    n_filtered = 0
    for text, annotation in examples:
        doc = nlp.make_doc(text)
        spans = []
        for start, end, label in annotation["entities"]:
            span = doc.char_span(start, end, label=label)
            if span:
                spans.append(span)
            else:
                # No usable span for these offsets: count it instead of
                # losing the annotation without a trace.
                n_unaligned += 1
        kept = filter_spans(spans)
        n_filtered += len(spans) - len(kept)
        doc.ents = kept
        doc_bin.add(doc)
    return doc_bin, n_unaligned, n_filtered


nlp = spacy.blank("en")

train_doc_bin, train_unaligned, train_filtered = _build_doc_bin(nlp, train_data)
train_doc_bin.to_disk("train.spacy")

dev_doc_bin, dev_unaligned, dev_filtered = _build_doc_bin(nlp, dev_data)
dev_doc_bin.to_disk("dev.spacy")

# Surface dropped annotations so silent label loss is visible before training.
if train_unaligned or train_filtered or dev_unaligned or dev_filtered:
    print(
        "Warning: dropped annotations - "
        f"train: {train_unaligned} unaligned, {train_filtered} filtered; "
        f"dev: {dev_unaligned} unaligned, {dev_filtered} filtered"
    )
print("✅ Created train.spacy and dev.spacy")

Total examples: 5029
Training examples: 4023
Dev examples: 1006
✅ Created train.spacy and dev.spacy
✅ Created train.spacy and dev.spacy
