In [1]:
! pip install -U spacy -q

In [2]:
!python -m spacy info

[1m

spaCy version    3.8.7                         
Location         /usr/local/lib/python3.12/dist-packages/spacy
Platform         Linux-6.1.123+-x86_64-with-glibc2.35
Python version   3.12.11                       
Pipelines        en_core_web_sm (3.8.0)        



In [3]:
# FOR LOCAL MACHINE, USE A COMMAND SIMILAR TO THIS TO SPECIFY THE PATH FOR training_data.json
# !wget https://raw.githubusercontent.com/amrrs/custom-ner-with-spacy/main/pvr_training_data.json

In [4]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
import json

nlp = spacy.blank("en") # load a new spacy model
db = DocBin() # create a DocBin object

TRAIN DATA

In [5]:
f = open('annotations.json')
TRAIN_DATA = json.load(f)

In [6]:
TRAIN_DATA

{'classes': ['ORG', 'DATE', 'NAMES', 'LOC', 'OCC'],
 'annotations': [['On September 10, 2023, Anita Sharma, a journalist with The Times of India, reported from New Delhi about a major conference organized by the United Nations Development Programme (UNDP) in partnership with Harvard University. The conference was attended by Dr. Michael Roberts, a professor of economics at London School of Economics, and Suresh Iyer, the Managing Director of Reliance Industries Limited. Earlier that week, on September 7, 2023, Prime Minister Raghav Verma of India met with CEO Emily Zhang of Tesla Inc. in Mumbai to discuss renewable energy investments. During the same month, Professor Kavita Nair from IIT Bombay presented her team’s research in San Francisco, California, at a summit hosted by the World Economic Forum, where several data scientists and policy analysts shared insights on sustainable technology.',
   {'entities': [[59, 73, 'ORG'],
     [141, 184, 'ORG'],
     [205, 224, 'ORG'],
     [305, 

In [7]:
for text, annot in tqdm(TRAIN_DATA['annotations']):
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

db.to_disk("./training_data.spacy") # save the docbin object

100%|██████████| 2/2 [00:00<00:00, 397.98it/s]


VALIDATION DATA

In [9]:
f = open('validation.json')
VALIDATION_DATA = json.load(f)

In [10]:
VALIDATION_DATA

{'classes': ['ORG', 'DATE', 'NAMES', 'LOC', 'OCC'],
 'annotations': [['On September 10, 2023, Anita Sharma, a journalist with The Times of India, reported from New Delhi about a major conference organized by the United Nations Development Programme (UNDP) in partnership with Harvard University. The conference was attended by Dr. Michael Roberts, a professor of economics at London School of Economics, and Suresh Iyer, the Managing Director of Reliance Industries Limited. Earlier that week, on September 7, 2023, Prime Minister Raghav Verma of India met with CEO Emily Zhang of Tesla Inc. in Mumbai to discuss renewable energy investments. During the same month, Professor Kavita Nair from IIT Bombay presented her team’s research in San Francisco, California, at a summit hosted by the World Economic Forum, where several data scientists and policy analysts shared insights on sustainable technology.',
   {'entities': [[3, 21, 'DATE'],
     [23, 35, 'NAMES'],
     [39, 49, 'OCC'],
     [59, 73,

In [11]:
for text, annot in tqdm(VALIDATION_DATA['annotations']):
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

db.to_disk("./validation_data.spacy") # save the docbin object

100%|██████████| 1/1 [00:00<00:00, 350.72it/s]


In [12]:
#! python -m spacy init config config.cfg --lang en --pipeline ner --optimize accuracy -> When we use CPU, no transformer, optimizing for accuracy
# OR
# You can also use "https://spacy.io/usage/training#quickstart" site to make a base config and then fill it to make a config.cfg
# This program uses the former method with training done with CPU, without transformer and optimizing for accuracy
! python -m spacy init config config.cfg --lang en --pipeline ner --optimize accuracy

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: accuracy
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [13]:
# Download the required model
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [14]:
!python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./validation_data.spacy --gpu-id 0


[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     72.73    0.00    0.00    0.00    0.00
100     200        415.57   5323.15   50.00   91.67   34.38    0.50
200     400         16.28   2728.75   50.00   91.67   34.38    0.50
300     600         13.09   2578.60   71.19   77.78   65.62    0.71
400     800         11.15   2504.58   50.00   91.67   34.38    0.50
500    1000          9.61   2462.56   83.10   75.64   92.19    0.83
666    1200         21.85   3809.78   71.19   77.78   65.62    0.71
866    1400         35.57   4465.23   64.15   80.95   53.12    0.64
1066    1600         34.65   4471.97   74.19   76.67   71.88    0.74
1266    1800         44.35   4433.90   69.57   

In [16]:
nlp_ner = spacy.load("/content/model-best")

In [17]:
doc = nlp_ner('''This Service Agreement is made on June 12, 2025 between Beta Solutions LLC, with its registered office at 12 Alexanderplatz, Berlin, Germany, and Gamma Health Systems, headquartered at 88 Marine Drive, Mumbai, India. Laura Bennett, Chief Executive Officer of Beta Solutions, and Dr. Arjun Rao, Chief Financial Officer of Gamma Health Systems, executed the Agreement. The Agreement becomes effective on July 1, 2025. The total contract value is EUR 250,000, inclusive of applicable taxes. The Services shall be provided for 24 months from the effective date, subject to early termination: either party may terminate with 60 days’ prior written notice. The Agreement shall be governed by the laws of Germany. Contract Reference No: CON-BSG-20250612''') # input sample text

In [18]:
spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter