In [57]:
import spacy
import random
import csv

# Entity types the recognizer will be taught to tag
LABELS = [
    "Product",
    "Quantity",
    "Price",
    "Supplier",
]

# Start from a blank English pipeline (created via spacy.blank, not loaded
# from a packaged model)
nlp = spacy.blank("en")

# Attach an NER component to the pipeline, then register each entity type
# from LABELS on it
ner = nlp.add_pipe("ner")
for label in LABELS:
    ner.add_label(label)


In [76]:
# Load and preprocess the training data.
# Every CSV row is read for its "Statement" text plus one column per entry in
# LABELS. Each label value found in the statement becomes a
# (start, end, label) character span, and rows are collected in train_data as
# (text, {"entities": [...]}) tuples.
train_data = []
# newline="" follows the csv module's guidance so that quoted fields
# containing line breaks are parsed correctly.
with open("data/training-data.csv", "r", encoding="utf-8", newline="") as file:
    reader = csv.DictReader(file)
    for row in reader:
        text = row["Statement"]
        entities = []
        for label in LABELS:
            value = row[label]
            # Blank or missing cells carry no entity. Without this guard,
            # text.find("") returns 0 and a zero-length (0, 0, label) span
            # would be recorded, and a None cell would make find() raise.
            if not value:
                continue
            # Only the first occurrence of the value in the statement is used.
            start_idx = text.find(value)
            if start_idx == -1:
                continue
            end_idx = start_idx + len(value)
            # Keep a span only if it does not overlap one already accepted
            # for this row (earlier labels in LABELS win).
            # NOTE(review): spaCy is expected to reject overlapping entity
            # offsets when the Example is built - confirm against the
            # installed version.
            overlaps = any(
                start_idx < prev_end and prev_start < end_idx
                for prev_start, prev_end, _ in entities
            )
            if overlaps:
                continue
            entities.append((start_idx, end_idx, label))
        train_data.append((text, {"entities": entities}))

In [77]:
# Train the model
# Runs n_iter passes over the training tuples, updating the pipeline one
# example at a time and printing the accumulated NER loss after each pass.
n_iter = 500
random.seed(0)
# initialize() is the spaCy v3 name for the deprecated begin_training();
# it returns the optimizer that is passed to nlp.update below.
optimizer = nlp.initialize()
# Shuffle a private copy so train_data keeps the order produced by the
# loading cell; re-running this cell then starts from the same data order.
shuffled_data = list(train_data)
for i in range(n_iter):
    random.shuffle(shuffled_data)
    losses = {}
    for text, annotations in shuffled_data:
        # Build a fresh Example from the raw text and its entity offsets
        doc = nlp.make_doc(text)
        example = spacy.training.Example.from_dict(doc, annotations)
        nlp.update([example], sgd=optimizer, drop=0.35, losses=losses)
    # .get() keeps the report from raising KeyError when no update ran
    # (e.g. train_data is empty) and "ner" was never written to losses.
    print("Iteration {:d} - Loss: {:.4f}".format(i, losses.get("ner", 0.0)))

# Save the trained model
nlp.to_disk("data/en_pr_vroozi_new")

Iteration 0 - Loss: 567.6218
Iteration 1 - Loss: 256.4415
Iteration 2 - Loss: 148.6483
Iteration 3 - Loss: 132.9558
Iteration 4 - Loss: 107.6068
