In [1]:
! pip install -U spacy -q

In [2]:
# Print the installed spaCy version, install location, platform, Python
# version and the pipeline packages that are currently available.
!python -m spacy info

[1m

spaCy version    3.8.7                         
Location         /usr/local/lib/python3.12/dist-packages/spacy
Platform         Linux-6.1.123+-x86_64-with-glibc2.35
Python version   3.12.11                       
Pipelines        en_core_web_sm (3.8.0)        



In [None]:
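# On a local machine, fetch your annotation file with a command similar to the one
# below and save it as training_data.json (the file name the loading cell opens).
# The URL is only an example and downloads a file named pvr_training_data.json.
# !wget https://raw.githubusercontent.com/amrrs/custom-ner-with-spacy/main/pvr_training_data.json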

In [3]:
import spacy
from spacy import displacy
from spacy.tokens import DocBin
from tqdm import tqdm

# Container that collects annotated Doc objects for serialization to disk.
db = DocBin()
# Blank English pipeline; used further down to tokenize the training texts.
nlp = spacy.blank("en")

In [9]:
import json

# Load the annotated examples. The context manager closes the file handle even
# if parsing fails, and the explicit encoding avoids locale-dependent decoding.
with open('training_data.json', encoding='utf-8') as f:
    TRAIN_DATA = json.load(f)

In [10]:
# Inspect the loaded data: a dict with the label set under 'classes' and
# [text, {'entities': [[start, end, label], ...]}] pairs under 'annotations'.
TRAIN_DATA

{'classes': ['ORG', 'PERSON', 'GPE', 'DATE', 'LOC', 'OCC'],
 'annotations': [['John Doe, a software engineer at Google, moved to New York in 2019. He previously worked at Microsoft in Seattle.',
   {'entities': [[0, 8, 'PERSON'],
     [12, 29, 'OCC'],
     [33, 39, 'ORG'],
     [50, 58, 'GPE'],
     [62, 67, 'DATE'],
     [92, 101, 'ORG'],
     [105, 112, 'GPE']]}],
  ['Mary Smith, who studied at Stanford University, joined Amazon as a data scientist in 2020. She now lives in San Francisco.',
   {'entities': [[0, 10, 'PERSON'],
     [27, 46, 'ORG'],
     [55, 61, 'ORG'],
     [67, 81, 'OCC'],
     [85, 90, 'DATE'],
     [108, 121, 'GPE']]}],
  ['Barack Obama was the 44th President of the United States and was born in Hawaii. He studied at Harvard University.',
   {'entities': [[0, 12, 'PERSON'],
     [26, 35, 'OCC'],
     [43, 56, 'GPE'],
     [73, 80, 'GPE'],
     [95, 113, 'ORG']]}],
  ['Elon Musk, the CEO of Tesla and SpaceX, was born in South Africa and later moved to the United St

In [12]:
# Start from an empty DocBin so that re-running this cell does not append the
# same documents a second time to a container created in an earlier cell.
db = DocBin()

for text, annot in tqdm(TRAIN_DATA['annotations']):
    doc = nlp.make_doc(text)  # tokenize the raw text into a Doc
    ents = []
    for start, end, label in annot["entities"]:
        # "contract" snaps misaligned character offsets inward to token
        # boundaries; None is returned when no whole token is covered.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            # Report which annotation was lost so the source data can be fixed.
            print(f"Skipping entity [{start}, {end}, {label!r}] in {text!r}: "
                  "offsets do not align with token boundaries")
        else:
            ents.append(span)
    # Assigning overlapping spans to doc.ents raises, so keep the longest
    # non-overlapping spans and report anything that had to be dropped.
    kept = spacy.util.filter_spans(ents)
    if len(kept) != len(ents):
        print(f"Dropped {len(ents) - len(kept)} overlapping entity span(s) in {text!r}")
    doc.ents = kept
    db.add(doc)

db.to_disk("./training_data.spacy") # save the docbin object

100%|██████████| 10/10 [00:00<00:00, 2247.39it/s]


In [13]:
! python -m spacy init config config.cfg --lang en --pipeline ner --optimize accuracy

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: accuracy
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [15]:
# Download the large English pipeline package (en_core_web_lg).
# NOTE(review): presumably required because the accuracy-optimized config
# references its word vectors — confirm against the generated config.cfg.
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [16]:
# Train the pipeline described by config.cfg and write the output to the
# current directory.
# NOTE: --paths.dev points at the same file as --paths.train, so the ENTS_*
# scores reported during training are measured on the training data itself,
# not on held-out examples.
! python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     54.50    0.00    0.00    0.00    0.00
 28     200         43.06   1438.93  100.00  100.00  100.00    1.00
 63     400          0.00      0.00  100.00  100.00  100.00    1.00
106     600          0.00      0.00  100.00  100.00  100.00    1.00
157     800          0.00      0.00  100.00  100.00  100.00    1.00
223    1000          0.00      0.00  100.00  100.00  100.00    1.00
298    1200          0.00      0.00  100.00  100.00  100.00    1.00
398    1400          0.00      0.00  100.00  100.00  100.00    1.00
498    1600          0.00      0.00  100.00  100.00  100.00    1.00
630    1800          0.00      0.00  100.00  100.00

In [17]:
# Load the best checkpoint written by `spacy train --output ./`. The path is
# relative to the working directory, matching the training command, instead of
# the Colab-only absolute location "/content/model-best".
nlp_ner = spacy.load("./model-best")

In [18]:
# Multi-paragraph sample text used to try out the trained pipeline.
sample_text = '''Elon Musk, the CEO of Tesla and SpaceX, was born in Pretoria, South Africa. He moved to the United States and studied at the University of Pennsylvania before starting his entrepreneurial journey. Musk is also the founder of Neuralink and The Boring Company. Currently, he lives in Texas and often travels to California for work. Sundar Pichai, the CEO of Google, was born in Chennai, India, and later moved to the United States to study at Stanford University. He has been instrumental in leading Google to become one of the biggest technology companies in the world.

Satya Nadella, the CEO of Microsoft, was born in Hyderabad, India, and later moved to the United States for higher education. After earning his degree from the University of Wisconsin and an MBA from the University of Chicago, he joined Microsoft in 1992. Today, he lives in Washington and continues to lead the company. Meanwhile, Angela Merkel, the former Chancellor of Germany, played a significant role in shaping European politics. She was born in Hamburg in 1954 and worked as a physicist before entering politics.

Tim Cook, an industrial engineer by training, became the CEO of Apple after Steve Jobs passed away in 2011. He is known for expanding Apple’s operations in California and worldwide. Cristiano Ronaldo, a famous football player from Portugal, recently signed with Al-Nassr Football Club in Saudi Arabia. Lionel Messi, another legendary footballer, plays for Inter Miami in the United States after leaving FC Barcelona. The Eiffel Tower in Paris and the Taj Mahal in Agra are iconic landmarks that attract millions of visitors each year. '''
# Run the trained pipeline over the sample to obtain an annotated Doc.
doc = nlp_ner(sample_text)

In [19]:
# Render the predicted entities inline. `displacy` is imported explicitly in
# the import cell rather than reached through the `spacy.displacy` attribute,
# which only exists as a side effect of other spaCy imports.
displacy.render(doc, style="ent", jupyter=True)