In [1]:
! pip install -U spacy -q

In [2]:
# Print the installed spaCy version, install location, platform, Python
# version and the pipelines available in this environment.
!python -m spacy info

[1m

spaCy version    3.8.7                         
Location         /usr/local/lib/python3.12/dist-packages/spacy
Platform         Linux-6.6.105+-x86_64-with-glibc2.35
Python version   3.12.12                       
Pipelines        en_core_web_sm (3.8.0)        



In [3]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

# Blank English pipeline; used in this notebook through make_doc() to
# tokenize the raw training texts.
nlp = spacy.blank("en")
# Container that collects the annotated Doc objects before they are
# serialized to disk for training.
db = DocBin()

In [4]:
import json

# Read the annotated examples. The context manager guarantees the file handle
# is closed even if parsing fails, and the explicit encoding keeps non-ASCII
# text from depending on the platform's default.
with open('/content/finance_ner_dataset.json', encoding='utf-8') as f:
    TRAIN_DATA = json.load(f)

In [5]:
# Inspect the loaded records: each one has a 'text' string and a list of
# 'entities', given as character offsets ('start', 'end') plus a 'label'.
TRAIN_DATA

[{'text': 'Goldman Sachs reported net income of $2.5 billion for Q3 2025.',
  'entities': [{'start': 0, 'end': 13, 'label': 'ORG'},
   {'start': 37, 'end': 49, 'label': 'MONEY'},
   {'start': 54, 'end': 61, 'label': 'DATE'}]},
 {'text': 'On February 14, 2024, Apple Inc. announced a 5% increase in quarterly revenue.',
  'entities': [{'start': 3, 'end': 20, 'label': 'DATE'},
   {'start': 22, 'end': 32, 'label': 'ORG'},
   {'start': 45, 'end': 47, 'label': 'PERCENT'}]},
 {'text': 'The Federal Reserve raised interest rates by 0.25% on March 22, 2023.',
  'entities': [{'start': 0, 'end': 19, 'label': 'ORG'},
   {'start': 45, 'end': 50, 'label': 'PERCENT'},
   {'start': 54, 'end': 68, 'label': 'DATE'}]},
 {'text': "Amazon's stock ticker AMZN rose by 3.4% after the earnings call on July 30, 2025.",
  'entities': [{'start': 0, 'end': 6, 'label': 'ORG'},
   {'start': 22, 'end': 26, 'label': 'TICKER'},
   {'start': 35, 'end': 39, 'label': 'PERCENT'},
   {'start': 67, 'end': 80, 'label': 'DATE'}]

In [6]:
# Convert the JSON annotations into spaCy Doc objects and serialize them.
#
# The DocBin is (re)created here so the cell is idempotent: running it twice
# no longer appends every document a second time to the saved file.
db = DocBin()

for data_item in tqdm(TRAIN_DATA):
    text = data_item['text']
    doc = nlp.make_doc(text)  # tokenize only; no pipeline components are run
    ents = []
    for item in data_item['entities']:
        start, end, label = item['start'], item['end'], item['label']
        # "contract" snaps offsets that fall inside a token inward to the
        # nearest token boundaries; char_span returns None if nothing is left.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print(f"Skipping entity: '{text[start:end]}' with label '{label}' at offsets [{start}, {end}] in text: '{text}'")
        else:
            ents.append(span)

    # doc.ents rejects overlapping spans with a ValueError, so resolve
    # overlaps first (longest span wins) and report anything discarded.
    kept = spacy.util.filter_spans(ents)
    if len(kept) != len(ents):
        print(f"Dropped {len(ents) - len(kept)} overlapping entity span(s) in text: '{text}'")
    doc.ents = kept
    db.add(doc)

db.to_disk("./training_data.spacy")

100%|██████████| 12/12 [00:00<00:00, 1305.48it/s]


In [7]:
! python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [8]:
# Train the NER pipeline from config.cfg; the results are written under the
# --output directory (here the current working directory).
# NOTE(review): --paths.dev points at the same file as --paths.train, so the
# ENTS_F / ENTS_P / ENTS_R scores printed during training are measured on the
# training examples themselves and say nothing about generalization.
# TODO: hold out a separate dev set.
! python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     55.61    0.00    0.00    0.00    0.00
100     200         56.98   2306.43  100.00  100.00  100.00    1.00
200     400          0.00      0.00  100.00  100.00  100.00    1.00
386     600          0.00      0.00  100.00  100.00  100.00    1.00
586     800          0.00      0.00  100.00  100.00  100.00    1.00
786    1000          0.00      0.00  100.00  100.00  100.00    1.00
986    1200          0.00      0.00  100.00  100.00  100.00    1.00
1186    1400          0.00      0.00  100.00  100.00  100.00    1.00
1386    1600          0.00      0.00  100.00  100.00  100.00    1.00
1586    1800          0.00      0.00  100.00  100

In [9]:
# Load the best checkpoint from the same relative location the training cell
# wrote it to (--output ./), rather than a hard-coded absolute path that is
# only valid when the working directory happens to be /content.
nlp_ner = spacy.load("./model-best")

In [10]:
# Unseen sample sentence used to try out the trained NER pipeline.
sample_text = (
    "Market analysts at MorganEast Research, based in Singapore, forecast that "
    "the Indian digital lending sector will surpass ₹1.8 lakh crore in "
    "transaction volume by 2027, with Aurora capturing approximately 6.4% "
    "market share."
)
doc = nlp_ner(sample_text)

In [11]:
# Visualize the predicted entities of `doc` with displaCy's "ent" style,
# rendered inline in the notebook.
spacy.displacy.render(doc, style="ent", jupyter=True)