In [1]:
! pip install -U spacy -q

In [2]:
# Print the installed spaCy version, platform and available pipelines for reproducibility.
!python -m spacy info

[1m

spaCy version    3.8.7                         
Location         /usr/local/lib/python3.12/dist-packages/spacy
Platform         Linux-6.1.123+-x86_64-with-glibc2.35
Python version   3.12.11                       
Pipelines        en_core_web_sm (3.8.0)        



In [3]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

# spacy.blank("en") creates an empty English pipeline (no trained components are loaded);
# it is used later in the notebook via nlp.make_doc() to tokenize the raw training texts.
nlp = spacy.blank("en") # blank English pipeline
db = DocBin() # container that collects Doc objects for serialization to disk

In [4]:
import json
# Read the annotation export. The context manager guarantees the file handle is
# closed once parsing finishes (or fails), and the encoding is made explicit so
# the result does not depend on the platform's default locale.
with open('training_data.json', encoding='utf-8') as f:
    TRAIN_DATA = json.load(f)

In [6]:
# Inspect the loaded annotations (a dict with 'classes' and 'annotations' keys).
TRAIN_DATA

{'classes': ['ORG', 'DATE', 'PRODUCTS', 'LOCATION', 'AMT', 'PERSON'],
 'annotations': [['In Q1 2024, Microsoft reported revenue of 20.1 billion U.S. dollars. CEO Satya Nadella highlighted strong demand for Azure cloud services in North America.  \r',
   {'entities': [[3, 10, 'DATE'],
     [12, 21, 'ORG'],
     [42, 68, 'AMT'],
     [73, 86, 'PERSON'],
     [117, 137, 'PRODUCTS'],
     [141, 154, 'LOCATION']]}],
  ['Apple shipped 60 million iPhones in Asia during Q1 2024, earning 25.3 billion U.S. dollars. Tim Cook noted that innovation in the M3 chip boosted performance. \r',
   {'entities': [[0, 5, 'ORG'],
     [14, 32, 'PRODUCTS'],
     [36, 40, 'LOCATION'],
     [48, 55, 'DATE'],
     [65, 91, 'AMT'],
     [92, 100, 'PERSON']]}],
  ['Samsung recorded 72 million smartphone shipments in Europe in Q2 2023, generating 21.4 billion U.S. dollars. President TM Roh emphasized the success of the Galaxy S23 series. \r',
   {'entities': [[0, 7, 'ORG'],
     [17, 38, 'PRODUCTS'],
     [52, 58, 

In [5]:
# Convert the character-offset annotations into spaCy Doc objects and serialize
# them to the binary .spacy format consumed by `spacy train`.
# A fresh DocBin is created here so re-running this cell does not append the
# same documents a second time.
db = DocBin()
for text, annot in tqdm(TRAIN_DATA['annotations']):
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        # "contract" snaps the offsets inward to token boundaries; char_span
        # returns None when no whole token lies inside [start, end).
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            # Report exactly which annotation was dropped so it can be corrected.
            print(f"Skipping entity {label!r} at [{start}, {end}): {text[start:end]!r}")
        else:
            ents.append(span)
    # Assigning overlapping spans to doc.ents raises; filter_spans keeps the
    # longest non-overlapping spans so one bad annotation cannot abort the run.
    doc.ents = spacy.util.filter_spans(ents)
    db.add(doc)

db.to_disk("./training_data.spacy") # save the docbin object

100%|██████████| 7/7 [00:00<00:00, 960.86it/s]


In [12]:
! python -m spacy init config config.cfg --lang en --pipeline ner --optimize accuracy

[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: accuracy
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [14]:
# Download the large English pipeline package.
# NOTE(review): presumably required because the accuracy-optimized config references
# en_core_web_lg word vectors — confirm against the [paths] section of config.cfg.
! python -m spacy download en_core_web_lg


Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [15]:
# Train with config.cfg; the trained pipeline is saved under the --output directory
# (model-best is loaded from there later in the notebook).
# NOTE(review): --paths.dev points at the same file as --paths.train, so the reported
# ENTS_F / ENTS_P / ENTS_R scores are measured on the training examples themselves and
# do not indicate how well the model generalizes. TODO: supply a held-out dev set.
! python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     52.08    5.62    3.40   16.13    0.06
100     200         28.46   1693.53  100.00  100.00  100.00    1.00
278     400          0.00      0.00  100.00  100.00  100.00    1.00
478     600          0.00      0.00  100.00  100.00  100.00    1.00
678     800          0.00      0.00  100.00  100.00  100.00    1.00
878    1000          0.00      0.00  100.00  100.00  100.00    1.00
1078    1200          0.00      0.00  100.00  100.00  100.00    1.00
1278    1400          0.00      0.00  100.00  100.00  100.00    1.00
1478    1600          0.00      0.00  100.00  100.00  100.00    1.00
1678    1800          0.00      0.00  100.00  10

In [16]:
# Load the best checkpoint written by `spacy train --output ./`. The relative path
# matches that output directory and keeps the notebook runnable outside Colab's
# /content working directory.
nlp_ner = spacy.load("./model-best")

In [19]:
# Sample report text to run through the trained NER pipeline.
# Note: the first two sentences also appear in the training annotations.
sample_text = '''In Q1 2024, Microsoft reported revenue of 20.1 billion U.S. dollars. CEO Satya Nadella highlighted strong demand for Azure cloud services in North America.
Apple shipped 60 million iPhones in Asia during Q1 2024, earning 25.3 billion U.S. dollars. Tim Cook noted that innovation in the M3 chip boosted performance.
Samsung recorded 72 million smartphone shipments in Europe in Q2 2023, generating 21.4 billion U.S. dollars. TM Roh emphasized the success of the Galaxy S23 series.
Dell posted revenue of 12.4 billion U.S. dollars in Q2 2023 from laptop sales in North America. Michael Dell said enterprise demand remained solid.'''
doc = nlp_ner(sample_text)

In [20]:
# Render the entities predicted for `doc` as highlighted inline markup in the notebook output.
spacy.displacy.render(doc, style="ent", jupyter=True)