In [17]:
! pip install -U spacy -q

In [18]:
# Show the installed spaCy version, install location, platform and available pipelines
!python -m spacy info

[1m

spaCy version    3.8.7                         
Location         /usr/local/lib/python3.12/dist-packages/spacy
Platform         Linux-6.1.123+-x86_64-with-glibc2.35
Python version   3.12.11                       
Pipelines        en_core_web_lg (3.8.0), en_core_web_sm (3.8.0)



In [19]:
# On a local machine, fetch the data with a command similar to the one below
# (adjust the URL / output name so that the file ends up as training_data.json in the working directory)
# !wget https://raw.githubusercontent.com/amrrs/custom-ner-with-spacy/main/pvr_training_data.json

In [5]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
import json

nlp = spacy.blank("en") # blank English pipeline (no trained components); used below only to tokenize the texts
db = DocBin() # container that collects annotated Doc objects for serialization to a .spacy file

TRAIN DATA

In [21]:
# Read the annotated training examples; the context manager closes the file
# handle as soon as the JSON has been parsed.
with open('training_data.json', encoding='utf-8') as f:
    TRAIN_DATA = json.load(f)

In [22]:
# Inspect the loaded training data: the label set ('classes') and the annotated examples ('annotations')
TRAIN_DATA

{'classes': ['ORG',
  'GPE',
  'DATE',
  'MONEY',
  'DURATION',
  'NOTICE',
  'CONTRACT_REF',
  'ROLE',
  'MISC'],
 'annotations': [['This Agreement is entered into on the 12th day of June, 2024, between Beta Solutions Pvt. Ltd.,\na company incorporated under the Companies Act, 2013 and having its registered office at\nFlat No. 12B, MG Road, Mumbai, India,\n(hereinafter referred to as the "Service Provider"), and Omega Systems LLC, located at 320\nHoward Street, San Francisco, CA 94105,\n(hereinafter referred to as the "Client"). The Agreement becomes effective on July 1, 2024.\nThe total contract value is USD 150,000.00, inclusive of applicable taxes and fees. The contract\nshall remain in effect for a period of 12 months unless terminated earlier in accordance with the\nprovisions herein. The termination clause states that either party may terminate the agreement\nwith 30 days\' written notice under justifiable circumstances. This Agreement shall be governed by\nand construed in acco

In [23]:
# Convert the annotated training examples into spaCy's binary DocBin format.
# Start from a fresh DocBin so that re-running this cell never duplicates
# documents that were already added to a previously used container.
db = DocBin()

for text, annot in tqdm(TRAIN_DATA['annotations']):
    doc = nlp.make_doc(text)  # tokenize only; no pipeline components are run
    ents = []
    for start, end, label in annot["entities"]:
        # "contract" snaps the character offsets inward to token boundaries;
        # char_span returns None when no complete token lies inside the range.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

db.to_disk("./training_data.spacy") # save the docbin object

100%|██████████| 4/4 [00:00<00:00, 294.70it/s]


VALIDATION DATA

In [24]:
# Read the annotated validation examples; the context manager closes the file
# handle as soon as the JSON has been parsed.
with open('validation_data.json', encoding='utf-8') as f:
    VALIDATION_DATA = json.load(f)

In [25]:
# Inspect the loaded validation data: the label set ('classes') and the annotated examples ('annotations')
VALIDATION_DATA

{'classes': ['ORG',
  'GPE',
  'DATE',
  'MONEY',
  'DURATION',
  'NOTICE',
  'CONTRACT_REF',
  'ROLE',
  'MISC'],
 'annotations': [['This Agreement is entered into on the 20th day of August, 2024, between Gamma Tech Pvt. Ltd.,\na company incorporated under the Companies Act, 2013 and having its registered office at\nSuite 305, MG Road, Mumbai, India,\n(hereinafter referred to as the "Service Provider"), and Nova Innovations Inc., located at 455\nMarket Street, San Francisco, CA 94105,\n(hereinafter referred to as the "Client"). The Agreement becomes effective on September 1, 2024.\nThe total contract value is USD 125,000.00, inclusive of applicable taxes and fees. The contract\nshall remain in effect for a period of 12 months unless terminated earlier in accordance with the\nprovisions herein. The termination clause states that either party may terminate the agreement\nwith 30 days\' written notice under justifiable circumstances. This Agreement shall be governed by\nand construed in 

In [26]:
# Convert the annotated validation examples into spaCy's binary DocBin format.
# A fresh DocBin is required here: the `db` used so far already holds the
# training documents, and reusing it would write them into
# validation_data.spacy as well, leaking training data into the evaluation set.
db = DocBin()

for text, annot in tqdm(VALIDATION_DATA['annotations']):
    doc = nlp.make_doc(text)  # tokenize only; no pipeline components are run
    ents = []
    for start, end, label in annot["entities"]:
        # "contract" snaps the character offsets inward to token boundaries;
        # char_span returns None when no complete token lies inside the range.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

db.to_disk("./validation_data.spacy") # save the docbin object

100%|██████████| 1/1 [00:00<00:00, 95.98it/s]


In [28]:
# Option 1 (CPU, no transformer, optimized for accuracy) - generate the config directly:
#   ! python -m spacy init config config.cfg --lang en --pipeline ner --optimize accuracy
# Option 2 - use the quickstart widget at https://spacy.io/usage/training#quickstart to create a base config,
#   then fill in the remaining values to produce config.cfg
# This notebook uses option 2, with training on GPU, a transformer, and optimizing for accuracy

In [30]:
# For using transformers
!pip install spacy-transformers

Collecting spacy-transformers
  Downloading spacy_transformers-1.3.9-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting transformers<4.50.0,>=3.4.0 (from spacy-transformers)
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-alignments<1.0.0,>=0.7.2 (from spacy-transformers)
  Downloading spacy_alignments-0.9.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.6 kB)
Downloading spacy_transformers-1.3.9-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (795 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m795.8/795.8 kB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading spacy_alignments-0.9.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (313 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.4/313.4 kB[0m [3

In [31]:
# Build the full training config: auto-fill base_config.cfg with all remaining values and save the result as config.cfg
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [32]:
# Download the en_core_web_lg pipeline package
# NOTE(review): presumably referenced by the generated config - confirm it is actually needed for a transformer-based setup
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
# Train with config.cfg on the converted .spacy files using GPU 0; results are written to the current directory (--output ./)
!python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./validation_data.spacy --gpu-id 0


[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
2025-08-26 16:42:17.789317: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756226537.824082   25607 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756226537.835664   25607 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1756226537.859981   25607 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1756226537.860016   25607 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:

In [6]:
# Load the best checkpoint written by `spacy train --output ./`. The path is
# relative to the working directory (matching the training command) rather than
# the Colab-only absolute location /content.
nlp_ner = spacy.load("./model-best")

In [7]:
# Run the trained NER pipeline on a sample contract text
doc = nlp_ner('''This Agreement is entered into on the 5th day of July, 2024, between Alpha Technologies Pvt.
Ltd.,
a company incorporated under the Companies Act, 2013 and having its registered office at
#210, MG Road, Mumbai, India,
(hereinafter referred to as the "Service Provider"), and Delta Innovations Inc., located at 455
Market Street, San Francisco, CA 94105,
(hereinafter referred to as the "Client"). The Agreement becomes effective on August 1, 2024.
The total contract value is USD 125,000.00, inclusive of applicable taxes and fees. The contract
shall remain in effect for a period
of 12 months unless terminated earlier in accordance with the provisions herein. The
termination clause states that either party may
terminate the agreement with 30 days’ written notice under justifiable circumstances.
This Agreement shall be governed by and construed in accordance with the laws of the State of
California, United States.
The governing law clause ensures that any dispute arising out of or in connection with this
Agreement shall be subject to the exclusive
jurisdiction of the courts located in San Francisco County, California.
Contract Reference No: CON-ALD-20240705 ''') # input sample text

In [8]:
spacy.displacy.render(doc, style="ent", jupyter=True) # highlight the predicted entities inline in the notebook output