In [1]:
! pip install -U spacy -q

In [2]:

# Print the installed spaCy version, platform details and available pipelines.
!python -m spacy info

[1m

spaCy version    3.8.7                         
Location         /usr/local/lib/python3.12/dist-packages/spacy
Platform         Linux-6.6.105+-x86_64-with-glibc2.35
Python version   3.12.12                       
Pipelines        en_core_web_sm (3.8.0)        



In [3]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

# Blank English pipeline (nothing pretrained is loaded); it is only used to
# build Doc objects from raw text.
nlp = spacy.blank("en")
# Container that accumulates the annotated Doc objects.
db = DocBin()

In [5]:
import json

# Read the annotated examples. The context manager closes the file handle even
# if JSON parsing fails, and the explicit encoding avoids depending on the
# platform default when the texts contain non-ASCII characters.
with open('/content/annotations.json', encoding='utf-8') as f:
    TRAIN_DATA = json.load(f)

In [6]:
# Inspect the loaded annotations: a dict with 'classes' (label names) and
# 'annotations' (a list of [text, {'entities': [[start, end, label], ...]}]).
TRAIN_DATA

{'classes': ['ORG', 'PERSON', 'DATE', 'GPE', 'MONEY'],
 'annotations': [['Infosys was founded in 1981 by Narayana Murthy and six others in Pune, India.',
   {'entities': [[0, 7, 'ORG'],
     [23, 27, 'DATE'],
     [31, 46, 'PERSON'],
     [65, 69, 'GPE'],
     [71, 76, 'GPE']]}],
  ['The company later moved its headquarters to Bengaluru.',
   {'entities': [[44, 53, 'GPE']]}],
  ['Microsoft Corporation was established by Bill Gates and Paul Allen in 1975 in Albuquerque, New Mexico.',
   {'entities': [[0, 21, 'ORG'],
     [41, 51, 'PERSON'],
     [56, 66, 'PERSON'],
     [70, 74, 'DATE'],
     [78, 89, 'GPE'],
     [91, 101, 'GPE']]}],
  ['Apple Inc., based in Cupertino, was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne.',
   {'entities': [[0, 10, 'ORG'],
     [21, 30, 'GPE'],
     [47, 57, 'PERSON'],
     [59, 72, 'PERSON'],
     [78, 90, 'PERSON']]}],
  ['The Indian government announced a $10 billion fund to promote semiconductor manufacturing in 2022.',
   {'entities': [[4, 1

In [7]:


# Convert every (text, {"entities": [...]}) pair into a Doc carrying its
# entity spans and collect the docs in `db`.
# NOTE(review): `db` is created in an earlier cell, so re-running this cell
# appends the same docs again; re-create the DocBin first when re-running.
for text, annot in tqdm(TRAIN_DATA['annotations']):
    doc = nlp.make_doc(text)  # tokenize only; no pipeline components are run
    ents = []
    for start, end, label in annot["entities"]:
        # "contract" snaps character offsets that fall inside a token inward
        # to the nearest token boundaries; no usable span yields None.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

# Serialize once, after every doc has been added. Writing inside the loop
# rewrote the whole file on each iteration without changing the final result.
db.to_disk("./training_data.spacy")


100%|██████████| 11/11 [00:00<00:00, 403.92it/s]


In [8]:
# Generate config.cfg for an English pipeline containing only an NER component,
# using spaCy's "efficiency" preset.
! python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [9]:
# Train the pipeline described by config.cfg and save the results to the
# current directory.
# NOTE(review): --paths.dev points at the same file as --paths.train, so the
# ENTS_F / ENTS_P / ENTS_R columns are measured on the training examples and
# say nothing about performance on unseen text. Use a held-out dev set.
! python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     61.86    0.00    0.00    0.00    0.00
100     200         79.97   1828.26  100.00  100.00  100.00    1.00
243     400          0.00      0.00  100.00  100.00  100.00    1.00
443     600          0.00      0.00  100.00  100.00  100.00    1.00
643     800          0.00      0.00  100.00  100.00  100.00    1.00
843    1000          0.00      0.00  100.00  100.00  100.00    1.00
1043    1200          0.00      0.00  100.00  100.00  100.00    1.00
1243    1400          0.00      0.00  100.00  100.00  100.00    1.00
1443    1600          0.00      0.00  100.00  100.00  100.00    1.00
1643    1800          0.00      0.00  100.00  10

In [10]:
# Load the best checkpoint produced by the training run. Training was started
# with `--output ./`, so the path is given relative to the same working
# directory instead of assuming that directory is /content.
nlp_ner = spacy.load("./model-best")

In [11]:
# Sentence used to try out the trained NER pipeline.
sample_sentence = "Market analysts at MorganEast Research, based in Singapore, forecast that the Indian digital lending sector will surpass ₹1.8 lakh crore in transaction volume by 2027, with Aurora capturing approximately 6.4% market share."
# Run the pipeline; the predicted entities are available on `doc.ents`.
doc = nlp_ner(sample_sentence)

In [12]:
# Highlight the predicted entities inline in the notebook output
# (style="ent" selects the entity visualizer; jupyter=True renders in place).
spacy.displacy.render(doc, style="ent", jupyter=True)