In [1]:
! pip install -U spacy -q

In [2]:
!python -m spacy info

[1m

spaCy version    3.8.7                         
Location         /usr/local/lib/python3.12/dist-packages/spacy
Platform         Linux-6.1.123+-x86_64-with-glibc2.35
Python version   3.12.11                       
Pipelines        en_core_web_sm (3.8.0)        



In [3]:
# FOR LOCAL MACHINE, USE A COMMAND SIMILAR TO THIS TO SPECIFY THE PATH FOR training_data.json
# !wget https://raw.githubusercontent.com/amrrs/custom-ner-with-spacy/main/pvr_training_data.json

In [4]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
import json

nlp = spacy.blank("en") # load a new spacy model
db = DocBin() # create a DocBin object

TRAIN DATA

In [5]:
f = open('training_data.json')
TRAIN_DATA = json.load(f)

In [6]:
TRAIN_DATA

{'classes': ['ORG',
  'GPE',
  'DATE',
  'MONEY',
  'DURATION',
  'NOTICE',
  'CONTRACT_REF',
  'ROLE',
  'MISC'],
 'annotations': [['This Agreement is entered into on the 12th day of June, 2024, between Beta Solutions Pvt. Ltd.,\na company incorporated under the Companies Act, 2013 and having its registered office at\nFlat No. 12B, MG Road, Mumbai, India,\n(hereinafter referred to as the "Service Provider"), and Omega Systems LLC, located at 320\nHoward Street, San Francisco, CA 94105,\n(hereinafter referred to as the "Client"). The Agreement becomes effective on July 1, 2024.\nThe total contract value is USD 150,000.00, inclusive of applicable taxes and fees. The contract\nshall remain in effect for a period of 12 months unless terminated earlier in accordance with the\nprovisions herein. The termination clause states that either party may terminate the agreement\nwith 30 days\' written notice under justifiable circumstances. This Agreement shall be governed by\nand construed in acco

In [7]:
for text, annot in tqdm(TRAIN_DATA['annotations']):
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

db.to_disk("./training_data.spacy") # save the docbin object

100%|██████████| 4/4 [00:00<00:00, 481.43it/s]


VALIDATION DATA

In [8]:
f = open('validation_data.json')
VALIDATION_DATA = json.load(f)

In [9]:
VALIDATION_DATA

{'classes': ['ORG',
  'GPE',
  'DATE',
  'MONEY',
  'DURATION',
  'NOTICE',
  'CONTRACT_REF',
  'ROLE',
  'MISC'],
 'annotations': [['This Agreement is entered into on the 20th day of August, 2024, between Gamma Tech Pvt. Ltd.,\na company incorporated under the Companies Act, 2013 and having its registered office at\nSuite 305, MG Road, Mumbai, India,\n(hereinafter referred to as the "Service Provider"), and Nova Innovations Inc., located at 455\nMarket Street, San Francisco, CA 94105,\n(hereinafter referred to as the "Client"). The Agreement becomes effective on September 1, 2024.\nThe total contract value is USD 125,000.00, inclusive of applicable taxes and fees. The contract\nshall remain in effect for a period of 12 months unless terminated earlier in accordance with the\nprovisions herein. The termination clause states that either party may terminate the agreement\nwith 30 days\' written notice under justifiable circumstances. This Agreement shall be governed by\nand construed in 

In [10]:
for text, annot in tqdm(VALIDATION_DATA['annotations']):
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

db.to_disk("./validation_data.spacy") # save the docbin object

100%|██████████| 1/1 [00:00<00:00, 334.42it/s]


In [11]:
#! python -m spacy init config config.cfg --lang en --pipeline ner --optimize accuracy -> When we use CPU, no transformer, optimizing for accuracy
# OR
# You can also use "https://spacy.io/usage/training#quickstart" site to make a base config and then fill it to make a config.cfg
# This program uses the former method with training done with CPU, without transformer and optimizing for accuracy
! python -m spacy init config config.cfg --lang en --pipeline ner --optimize accuracy

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: accuracy
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [12]:
# Download the required model
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [13]:
!python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./validation_data.spacy --gpu-id 0


[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    115.26    0.00    0.00    0.00    0.00
 66     200       1170.05   4364.58   71.43   65.79   78.12    0.71
133     400        115.56   1490.22   63.33   67.86   59.38    0.63
200     600        328.32   1762.70   58.82   78.95   46.88    0.59
266     800        176.09   1388.08   80.52   68.89   96.88    0.81
333    1000        436.09   1745.16   72.00   62.79   84.38    0.72
400    1200        384.03   1438.25   76.06   69.23   84.38    0.76
466    1400        448.30   1442.74   77.33   67.44   90.62    0.77
533    1600        505.68   1485.04   68.85   72.41   65.62    0.69
600    1800        550.39   1441.32   80.52   68

In [14]:
nlp_ner = spacy.load("/content/model-best")

In [15]:
doc = nlp_ner('''This Agreement is entered into on the 5th day of July, 2024, between Alpha Technologies Pvt.
Ltd.,
a company incorporated under the Companies Act, 2013 and having its registered office at
#210, MG Road, Mumbai, India,
(hereinafter referred to as the "Service Provider"), and Delta Innovations Inc., located at 455
Market Street, San Francisco, CA 94105,
(hereinafter referred to as the "Client"). The Agreement becomes effective on August 1, 2024.
The total contract value is USD 125,000.00, inclusive of applicable taxes and fees. The contract
shall remain in effect for a period
of 12 months unless terminated earlier in accordance with the provisions herein. The
termination clause states that either party may
terminate the agreement with 30 days’ written notice under justifiable circumstances.
This Agreement shall be governed by and construed in accordance with the laws of the State of
California, United States.
The governing law clause ensures that any dispute arising out of or in connection with this
Agreement shall be subject to the exclusive
jurisdiction of the courts located in San Francisco County, California.
Contract Reference No: CON-ALD-20240705 ''') # input sample text

In [16]:
spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter