In [1]:
import json
import random
import spacy
import pandas as pd
from ast import literal_eval
from spacy.util import minibatch, compounding
from pathlib import Path
from tqdm import tqdm
from spacy.training.example import Example

# Demand GPU execution before any pipeline is loaded. Per spaCy's documentation
# require_gpu() raises when no GPU can be allocated, so the notebook stops here
# instead of silently training on CPU.
spacy.require_gpu()
# prefer_gpu() returns whether a GPU was activated; printed as a sanity check.
print("Using GPU:", spacy.prefer_gpu())

def remove_overlapping_entities(data):
    """Return a copy of ``data`` in which no two entity spans overlap.

    For each ``(text, annotations)`` pair the spans in
    ``annotations["entities"]`` are ordered by start offset and scanned
    greedily: a span is kept only when it starts at or after the end of the
    previously kept span. Spans that merely touch (``start == previous end``)
    are both kept.

    Args:
        data: Iterable of ``(text, {"entities": [(start, end, label), ...]})``.

    Returns:
        A new list of ``(text, {"entities": [[start, end, label], ...]})``
        tuples. Kept spans are emitted as lists, sorted by start offset; only
        the ``"entities"`` key is carried over into the new annotation dict.
    """
    result = []
    for text, annot in data:
        kept = []
        boundary = -1  # end offset of the most recently kept span
        ordered_spans = sorted(annot["entities"], key=lambda span: span[0])
        for span_start, span_end, span_label in ordered_spans:
            if span_start < boundary:
                # Begins inside the previously kept span: drop it.
                continue
            kept.append([span_start, span_end, span_label])
            boundary = span_end
        result.append((text, {"entities": kept}))
    return result

# Load training data from CSV
def load_training_data_from_csv(csv_file):
    """Read NER training examples from a CSV file.

    The CSV must provide a ``text`` column and an ``entities`` column. Each
    ``entities`` cell is expected to hold a Python literal list of spans,
    e.g. ``"[(0, 4, 'INGREDIENT')]"``, and is parsed with
    ``ast.literal_eval`` (literals only, no code execution).

    A row is skipped, with a printed message, when:
      * its text is missing (an empty cell is read as NaN and would otherwise
        reach the tokenizer as a float),
      * its ``entities`` cell cannot be parsed as a Python literal, or
      * the parsed value is not a list/tuple.

    Args:
        csv_file: Path or buffer accepted by ``pandas.read_csv``.

    Returns:
        List of ``(text, {"entities": entities})`` tuples in file order.

    Raises:
        KeyError: If the ``text`` or ``entities`` column is absent. This is
            raised once, up front, rather than being reported as a per-row
            "parse error" that silently yields an empty dataset.
    """
    df = pd.read_csv(csv_file)
    data = []
    # Zipping the two columns avoids building a Series per row (iterrows) and
    # makes a missing column fail immediately.
    for text, raw_entities in zip(df["text"], df["entities"]):
        if pd.isna(text):
            print("Skipping row due to missing text")
            continue
        try:
            entities = literal_eval(raw_entities)  # Convert string to list
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
            # These are the exceptions literal_eval documents for bad input;
            # a NaN/empty cell lands here as ValueError.
            print(f"Skipping row due to parse error: {e}")
            continue
        if not isinstance(entities, (list, tuple)):
            print(
                "Skipping row due to parse error: expected a list of entities, "
                f"got {type(entities).__name__}"
            )
            continue
        data.append((text, {"entities": entities}))
    return data

# Build the training set from the annotated CSV, then drop overlapping spans
# so each example carries at most one entity per character range.
TRAIN_DATA = load_training_data_from_csv("train.csv")
TRAIN_DATA = remove_overlapping_entities(TRAIN_DATA)

# Seed the random generators (Python's random, NumPy and, when available,
# CuPy) so shuffle order, dropout masks and the printed losses are repeatable.
SEED = 0
spacy.util.fix_random_seed(SEED)

# Load base spaCy model
nlp = spacy.load("en_core_web_sm")
ner = nlp.get_pipe("ner")

# Add custom label
LABEL = "INGREDIENT"
ner.add_label(LABEL)

# Every component except the NER is disabled while training so only the NER
# weights are updated.
pipe_exceptions = ["ner"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

# Training hyper-parameters
n_iter = 10
DROPOUT = 0.3

# Train the NER model.
# select_pipes() replaces disable_pipes(), which is deprecated in spaCy v3;
# the disabled components are restored when the block exits, so the saved
# model below still contains the full pipeline.
with nlp.select_pipes(disable=unaffected_pipes):
    # resume_training() keeps the pretrained weights instead of re-initialising.
    optimizer = nlp.resume_training()

    for itn in range(n_iter):
        print(f"Iteration {itn + 1}")
        random.shuffle(TRAIN_DATA)
        losses = {}
        # A fresh compounding schedule per epoch: batch size grows from 4 to 32.
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.5))

        for batch in batches:
            examples = [
                Example.from_dict(nlp.make_doc(text), annots)
                for text, annots in batch
            ]
            # Pass the optimizer explicitly so it is clear which one is stepped.
            nlp.update(examples, drop=DROPOUT, sgd=optimizer, losses=losses)
        print("Losses:", losses)

# Save trained model
output_dir = Path("ingredient_ner_model")
output_dir.mkdir(parents=True, exist_ok=True)
nlp.to_disk(output_dir)
print(f"Model saved to {output_dir}")


Using GPU: True
Iteration 1




Losses: {'ner': np.float32(81416.23)}
Iteration 2
Losses: {'ner': np.float32(69627.01)}
Iteration 3
Losses: {'ner': np.float32(67667.03)}
Iteration 4
Losses: {'ner': np.float32(66390.15)}
Iteration 5
Losses: {'ner': np.float32(65498.754)}
Iteration 6
Losses: {'ner': np.float32(64886.38)}
Iteration 7
Losses: {'ner': np.float32(64317.418)}
Iteration 8
Losses: {'ner': np.float32(63921.21)}
Iteration 9
Losses: {'ner': np.float32(63518.723)}
Iteration 10
Losses: {'ner': np.float32(63141.9)}
Model saved to ingredient_ner_model


In [2]:
import shutil
from google.colab import files

# Zip the saved model directory. make_archive() returns the path of the archive
# it wrote, so the download targets that exact file instead of a relative name
# that only resolves when the working directory happens to be /content.
archive_path = shutil.make_archive('/content/custom_ner_3', 'zip', '/content/ingredient_ner_model')
files.download(archive_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>