In [10]:
! pip install -U spacy -q

In [11]:
# Print the installed spaCy version, its install location, the platform and
# the Python version, plus any installed pipelines.
! python -m spacy info

[1m

spaCy version    3.8.7                         
Location         c:\Users\mvasu\NER-model\.venv\Lib\site-packages\spacy
Platform         Windows-11-10.0.26100-SP0     
Python version   3.13.5                        
Pipelines                                      



In [12]:
import spacy
from spacy import displacy
from spacy.tokens import DocBin
from tqdm import tqdm

# Create a blank English pipeline; in this notebook it is only used to
# tokenize the raw annotation text via nlp.make_doc().
nlp = spacy.blank("en")
# Create a DocBin, the container the annotated Doc objects are collected in
# before being serialized to disk for training.
db = DocBin()

In [13]:
import json
# Read the annotations inside a context manager so the file handle is always
# closed, and decode explicitly as UTF-8 (the standard JSON encoding) instead of
# relying on the platform's locale encoding.
with open('annotations.json', encoding='utf-8') as f:
    Train_Data = json.load(f)

In [14]:
# Inspect the loaded annotations: a dict with the label set under 'classes' and
# [text, {'entities': [[start, end, label], ...]}] pairs under 'annotations'.
# Note that the 'annotations' list also contains None placeholders.
Train_Data

{'classes': ['ORG', 'DATE', 'MONEY', 'INDEX', 'PER', 'PRODUCT', 'GPE'],
 'annotations': [['Microsoft Corporation announced its quarterly earnings on July 18, 2024, reporting a revenue of $56.2 billion.\r',
   {'entities': [[0, 21, 'ORG'], [58, 71, 'DATE'], [96, 109, 'MONEY']]}],
  None,
  ['On March 5, 2023, Amazon introduced a new subscription plan priced at $9.99 per month.\r',
   {'entities': [[3, 16, 'DATE'], [18, 24, 'ORG'], [70, 75, 'MONEY']]}],
  None,
  ['Goldman Sachs projected that the S&P 500 index would rise by 8% during the fiscal year 2025.\r',
   {'entities': [[0, 13, 'ORG'], [61, 63, 'PER'], [87, 91, 'DATE']]}],
  None,
  ['Google unveiled its new AI chip called Tensor X3 at the Cloud Next conference in San Francisco.\r',
   {'entities': [[0, 6, 'ORG'], [39, 48, 'PRODUCT'], [81, 94, 'GPE']]}],
  None,
  ['On December 12, 2022, Netflix signed a multi-year content deal worth $500 million with a Hollywood studio.\r',
   {'entities': [[3, 20, 'DATE'],
     [22, 29, 'ORG'],


In [15]:


# Convert the raw annotations into spaCy Doc objects and serialize them.
#
# Start from a fresh DocBin so that re-running this cell does not append the
# same documents a second time to the `db` created in an earlier cell.
db = DocBin()

for item in tqdm(Train_Data['annotations']):
    if item is None:
        # Skip None entries
        continue
    if not (isinstance(item, (list, tuple)) and len(item) == 2):
        # Optionally log items with the wrong shape
        print(f"Skipping malformed entry: {item}")
        continue

    text, annot = item
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot.get('entities', []):
        # "contract" snaps misaligned character offsets inward to token
        # boundaries; char_span returns None when no token fits in the range.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    # doc.ents rejects overlapping or duplicate spans with a ValueError, so
    # reduce the candidates to a non-overlapping set first (longest span wins).
    doc.ents = spacy.util.filter_spans(ents)
    db.add(doc)
db.to_disk("./training_data.spacy") # save the docbin object


100%|██████████| 19/19 [00:00<00:00, 2820.85it/s]


In [16]:
# Generate a complete training config (config.cfg) for an English pipeline that
# contains only an NER component, optimized for efficiency.
! python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [17]:
# Train the pipeline described by config.cfg and write the results to the
# current directory.
# NOTE(review): --paths.train and --paths.dev point at the same file, so the
# ENTS_F / ENTS_P / ENTS_R columns measure fit on the training data, not
# held-out performance. Use a separate dev set before trusting these scores.
! python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy

^C


[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     52.00    0.00    0.00    0.00    0.00
100     200         37.00   1738.85  100.00  100.00  100.00    1.00
200     400          0.00      0.00  100.00  100.00  100.00    1.00
367     600          0.00      0.00  100.00  100.00  100.00    1.00
567     800          0.00      0.00  100.00  100.00  100.00    1.00
767    1000          0.00      0.00  100.00  100.00  100.00    1.00
967    1200          0.00      0.00  100.00  100.00  100.00    1.00
1167    1400          0.00      0.00  100.00  100.00  100.00    1.00
1367    1600          0.00      0.00  100.00  100.00  100.00    1.00
1567    1800          0.00      0.00  100.00  100

In [18]:
pip install --upgrade ipython jupyter


Collecting ipython
  Using cached ipython-9.4.0-py3-none-any.whl.metadata (4.4 kB)
Collecting jupyter
  Downloading jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting notebook (from jupyter)
  Downloading notebook-7.4.5-py3-none-any.whl.metadata (10 kB)
Collecting jupyter-console (from jupyter)
  Downloading jupyter_console-6.6.3-py3-none-any.whl.metadata (5.8 kB)
Collecting nbconvert (from jupyter)
  Using cached nbconvert-7.16.6-py3-none-any.whl.metadata (8.5 kB)
Collecting ipywidgets (from jupyter)
  Downloading ipywidgets-8.1.7-py3-none-any.whl.metadata (2.4 kB)
Collecting jupyterlab (from jupyter)
  Downloading jupyterlab-4.4.6-py3-none-any.whl.metadata (16 kB)
Collecting widgetsnbextension~=4.0.14 (from ipywidgets->jupyter)
  Downloading widgetsnbextension-4.0.14-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab_widgets~=3.0.15 (from ipywidgets->jupyter)
  Downloading jupyterlab_widgets-3.0.15-py3-none-any.whl.metadata (20 kB)
Collecting async-lru>=1.0.0 (from

In [20]:
# Load the trained pipeline from the "model-best" directory, relative to the
# current working directory (presumably written by the `spacy train --output ./`
# run above; confirm it exists before running this cell).
nlp_ner = spacy.load("model-best")

In [21]:
# Run the trained NER pipeline over a block of sample sentences.
# NOTE(review): several of these sentences (Goldman Sachs, Google, Netflix)
# also appear in the training annotations, so this is a sanity check rather
# than an evaluation on unseen text.
doc = nlp_ner('''Goldman Sachs projected that the S&P 500 index would rise by 8% during the fiscal year 2025.

Google unveiled its new AI chip called Tensor X3 at the Cloud Next conference in San Francisco.

On December 12, 2022, Netflix signed a multi-year content deal worth $500 million with a Hollywood studio.

Apple Inc. reported a net income of $22.9 billion for the quarter ending September 2024.

Tesla Inc., trading under ticker TSLA, announced plans to build a new Gigafactory in Mexico by mid-2025.

JPMorgan Chase revised its forecast for the euro-dollar exchange rate for Q1 2025.

On August 2, 2023, Meta Platforms launched its Threads application to rival Twitter.

Intel Corporation invested $20 billion in a semiconductor plant located in Ohio.''')

In [22]:
# Render the predicted entities inline as highlighted HTML. Uses the explicitly
# imported `displacy` module rather than relying on `spacy.displacy` happening
# to be loaded as a side effect of `import spacy`.
displacy.render(doc, style="ent", jupyter=True) # display in Jupyter