In [None]:
import json
import random
import spacy
from spacy.util import minibatch, compounding
from pathlib import Path
from tqdm import tqdm
from spacy.training.example import Example
def remove_overlapping_entities(data):
    """Drop entity spans that overlap an earlier-starting span in each example.

    For every ``(text, annotations)`` pair, the entity spans are ordered by
    start offset (ties keep their input order) and scanned left to right.
    A span is kept only when it starts at or after the end of the last span
    that was kept, so touching spans survive and overlapping ones are dropped.

    Args:
        data: Iterable of ``(text, annotations)`` pairs. ``annotations`` is a
            mapping whose ``"entities"`` value is a sequence of
            ``(start, end, label)`` triples. A missing ``"entities"`` key is
            treated as "no entities".

    Returns:
        A new list of ``(text, annotations)`` tuples. Each returned
        ``annotations`` dict is a shallow copy of the input one with
        ``"entities"`` replaced by the filtered list of
        ``[start, end, label]`` lists. The input is not modified.
    """
    cleaned_data = []

    for text, annot in data:
        spans = sorted(annot.get("entities", ()), key=lambda span: span[0])
        kept = []
        last_end = -1  # below any valid offset, so the first span always passes
        for start, end, label in spans:
            if start >= last_end:
                kept.append([start, end, label])
                last_end = end
        # Copy the remaining annotation keys through instead of discarding
        # them; only the "entities" value is rewritten.
        cleaned_data.append((text, {**annot, "entities": kept}))

    return cleaned_data
# Load training data.
# NOTE(review): each record is unpacked below as a (text, annotations) pair
# whose annotations carry an "entities" list — confirm against train_data.json.
with open("train_data.json", "r", encoding="utf-8") as f:
    TRAIN_DATA = json.load(f)

# Seed Python's RNG so the per-epoch shuffle order is repeatable across runs.
# Model-side randomness (e.g. dropout) is not seeded here.
SEED = 0
random.seed(SEED)

# Load base model (can use en_core_web_md or lg too)
nlp = spacy.load("en_core_web_sm")
ner = nlp.get_pipe("ner")

# Add the custom label
LABEL = "INGREDIENT"
ner.add_label(LABEL)

# Every pipeline component other than NER is switched off during training.
pipe_exceptions = ["ner"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

# Overlapping entity spans are filtered out before examples are built.
TRAIN_DATA = remove_overlapping_entities(TRAIN_DATA)

# Start training.
# `select_pipes` is the spaCy v3 replacement for the deprecated `disable_pipes`;
# the disabled components are restored when the `with` block exits.
with nlp.select_pipes(disable=unaffected_pipes):
    optimizer = nlp.resume_training()
    n_iter = 10

    for itn in range(n_iter):
        print(f"Iteration {itn + 1}")
        random.shuffle(TRAIN_DATA)
        losses = {}
        # Batch size schedule: starts at 4 and compounds by 1.5x per batch up
        # to 32. A fresh schedule is created every epoch.
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.5))

        for batch in batches:
            examples = [
                Example.from_dict(nlp.make_doc(text), annots)
                for text, annots in batch
            ]
            # Hand over the optimizer from resume_training() explicitly so it
            # is clear which optimizer drives the weight updates.
            nlp.update(examples, sgd=optimizer, drop=0.3, losses=losses)
        print("Losses:", losses)

# Save the trained model
output_dir = Path("ingredient_ner_model")
output_dir.mkdir(parents=True, exist_ok=True)
nlp.to_disk(output_dir)
print(f"Model saved to {output_dir}")


Iteration 1




Losses: {'ner': np.float32(65233.93)}
Iteration 2
Losses: {'ner': np.float32(39954.656)}
Iteration 3
Losses: {'ner': np.float32(36568.15)}
Iteration 4
Losses: {'ner': np.float32(34626.848)}
Iteration 5
Losses: {'ner': np.float32(33231.32)}
Iteration 6
Losses: {'ner': np.float32(32244.283)}
Iteration 7
Losses: {'ner': np.float32(31620.39)}
Iteration 8
Losses: {'ner': np.float32(30830.018)}
Iteration 9
Losses: {'ner': np.float32(30408.396)}
Iteration 10
Losses: {'ner': np.float32(29717.562)}
Model saved to ingredient_ner_model


In [None]:
import shutil
from google.colab import files  # <-- Add this

# Bundle the saved model directory into a zip archive; the call is left as the
# cell's last expression so the resulting archive path is displayed.
# NOTE(review): the source directory is hard-coded under /content, which
# presumes the model was saved with /content as the working directory
# (the usual Colab default) — confirm before running elsewhere.
shutil.make_archive(
    base_name='/content/custom_ner_2',
    format='zip',
    root_dir='/content/ingredient_ner_model',
)

'/content/custom_ner_2.zip'