In [8]:
import spacy
import json
from spacy.training import Example
import random

# Load the annotated data from the JSON file.
# The encoding is stated explicitly: without it, open() uses the platform's
# locale encoding (e.g. cp1252 on Windows), which can fail on, or mis-decode,
# non-ASCII characters and so shift the start/end offsets stored alongside
# each text.
with open(r"D:\SpaCy\spcyenv\merged_annotations.json", 'r', encoding="utf-8") as f:
    data = json.load(f)

# Verify the structure of the loaded data
if not isinstance(data, list):
    raise ValueError("Loaded JSON data is not a list. Please check the file format.")

# Prepare the data for spaCy: every record becomes a
# (text, {"entities": [(start, end, label), ...]}) tuple.
TRAIN_DATA = []

for index, item in enumerate(data):
    if not isinstance(item, dict):
        raise ValueError("Each item in the JSON data should be a dictionary.")
    text = item.get('text')
    entities = item.get('entities')

    if text is None or entities is None:
        raise ValueError("Each item should contain 'text' and 'entities' keys.")

    # Ensure entities is a list of dictionaries
    if not isinstance(entities, list) or not all(isinstance(ent, dict) for ent in entities):
        raise ValueError("'entities' should be a list of dictionaries.")

    # Report an incomplete annotation the same way as the other format
    # problems (ValueError naming the offending record) instead of letting
    # the tuple construction fail with a bare KeyError.
    for ent in entities:
        missing = [key for key in ('start', 'end', 'label') if key not in ent]
        if missing:
            raise ValueError(
                f"Item {index}: entity {ent!r} is missing required key(s): {missing}."
            )

    # Extract and prepare the training data
    entity_list = [(ent['start'], ent['end'], ent['label']) for ent in entities]
    TRAIN_DATA.append((text, {"entities": entity_list}))

# Start from a blank English pipeline
nlp = spacy.blank("en")

# Attach an NER component and register every custom label on it
ner = nlp.add_pipe("ner")
for label_name in ("DOOR_NO", "STREET", "AREA", "DISTRICT", "PINCODE"):
    ner.add_label(label_name)

# Build one spaCy Example per (text, annotations) pair: nlp.make_doc() turns
# the raw text into a Doc, and Example.from_dict() attaches the annotations.
train_examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in TRAIN_DATA
]

# Seed the random number generators before the weights are initialised and
# the examples are shuffled, so that repeated runs are comparable.
spacy.util.fix_random_seed(0)

# Initialisation draws sample documents from the training set, so an empty
# set is reported up front with a clear message.
if not train_examples:
    raise ValueError("No training examples available; cannot train the NER model.")

# Start the training process.
# nlp.initialize() is the spaCy v3 replacement for the deprecated
# nlp.begin_training(); it receives a callable returning the examples and
# hands back the optimizer to use for the updates.
optimizer = nlp.initialize(lambda: train_examples)

# Train the model
for epoch in range(20):  # You can adjust the number of epochs
    random.shuffle(train_examples)
    losses = {}  # accumulates the per-component loss over this epoch

    for example in train_examples:
        # Pass the optimizer explicitly so the one returned above is the one
        # actually applied to the weights.
        nlp.update([example], sgd=optimizer, drop=0.5, losses=losses)

    print(f"Epoch {epoch + 1}, Losses: {losses}")

# Write the trained pipeline to disk, then read it back from the same
# directory so that testing uses the saved copy
model_dir = "custom_ner_model"
nlp.to_disk(model_dir)
nlp = spacy.load(model_dir)

Epoch 1, Losses: {'ner': np.float32(181.48045)}
Epoch 2, Losses: {'ner': np.float32(118.93549)}
Epoch 3, Losses: {'ner': np.float32(93.89517)}
Epoch 4, Losses: {'ner': np.float32(83.23425)}
Epoch 5, Losses: {'ner': np.float32(51.556496)}
Epoch 6, Losses: {'ner': np.float32(38.827915)}
Epoch 7, Losses: {'ner': np.float32(32.61917)}
Epoch 8, Losses: {'ner': np.float32(28.021118)}
Epoch 9, Losses: {'ner': np.float32(27.571054)}
Epoch 10, Losses: {'ner': np.float32(22.102154)}
Epoch 11, Losses: {'ner': np.float32(29.907356)}
Epoch 12, Losses: {'ner': np.float32(24.942862)}
Epoch 13, Losses: {'ner': np.float32(21.905954)}
Epoch 14, Losses: {'ner': np.float32(16.954334)}
Epoch 15, Losses: {'ner': np.float32(22.03499)}
Epoch 16, Losses: {'ner': np.float32(11.817067)}
Epoch 17, Losses: {'ner': np.float32(22.686808)}
Epoch 18, Losses: {'ner': np.float32(21.24961)}
Epoch 19, Losses: {'ner': np.float32(14.538632)}
Epoch 20, Losses: {'ner': np.float32(18.446405)}


In [9]:
# Run the pipeline on a sample text and list each predicted entity with its label
test_text = "111, ABC Street, Areaname , Mumbai, 100000"
doc = nlp(test_text)

for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")


111 DOOR_NO
ABC Street STREET
Areaname AREA
Mumbai AREA
100000 PINCODE 
