<a href="https://colab.research.google.com/github/amalsalilan/B3-Developing-Named-Entity-Recognition-NER-Models-for-Financial-Data-Extraction-/blob/Naveen/financial_ner_training_v0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
! pip install -U spacy -q

In [3]:

# Print spaCy's installation details (version, install location, platform,
# Python version and installed pipelines) as a sanity check of the environment.
!python -m spacy info

[1m

spaCy version    3.8.7                         
Location         /usr/local/lib/python3.12/dist-packages/spacy
Platform         Linux-6.6.105+-x86_64-with-glibc2.35
Python version   3.12.12                       
Pipelines        en_core_web_sm (3.8.0)        



In [4]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

nlp = spacy.blank("en") # blank English pipeline (no trained components); its tokenizer is used via nlp.make_doc
db = DocBin() # DocBin container that collects annotated Doc objects for serialization to disk

In [6]:
import json

# Read the exported annotation file. The context manager closes the file handle
# as soon as parsing is done, and UTF-8 is requested explicitly because the
# annotated text contains non-ASCII characters (e.g. emoji).
with open('/content/Financial Insignts annotations.json', encoding='utf-8') as f:
    TRAIN_DATA = json.load(f)

In [7]:
# Inspect the loaded annotation structure (top-level keys: 'classes' and 'annotations').
TRAIN_DATA

{'classes': ['TITLE',
  'ORG',
  'PERSON',
  'DATE',
  'MONEY',
  'PERCENT',
  'GPE',
  'ACCOUNT',
  'TICKER',
  'PRODUCT',
  'EVENT',
  'EMAIL',
  'SHARES/STOCKS'],
 'annotations': [['🧾 Sample 1 – Corporate Earnings Report\r',
   {'entities': [[14, 39, 'TITLE']]}],
  None,
  ['On March 20, 2025, AstraNova Holdings Ltd announced its quarterly earnings from its Mumbai headquarters. According to CEO Kavita Menon, the company recorded a 12.4% increase in revenue, bringing the total to $78.6 million for the first quarter. Net profit stood at $9.2 million, compared to $8.1 million in the same period last year. The strong performance was attributed to its financial technology division, AstraPay, which secured new contracts with Delta Bank and Union Finance Corp in Singapore.\r',
   {'entities': [[3, 17, 'DATE'],
     [19, 41, 'ORG'],
     [84, 90, 'GPE'],
     [122, 134, 'PERSON'],
     [159, 164, 'PERCENT'],
     [208, 221, 'MONEY'],
     [265, 277, 'MONEY'],
     [291, 303, 'MONEY'],
     

In [8]:
# Convert the character-offset annotations into spaCy Doc objects and serialize
# them to ./training_data.spacy for `spacy train`.
#
# A fresh DocBin is created in this cell so that re-running it does not append
# the same documents a second time to a container left over from a previous run.
db = DocBin()
skipped_items = 0
skipped_entities = 0

for item in tqdm(TRAIN_DATA['annotations']):
    # The annotation list contains None placeholders between samples; ignore them.
    if item is None:
        print("Skipping None item")
        skipped_items += 1
        continue
    text, annot = item
    doc = nlp.make_doc(text)  # tokenize only; no components are run
    ents = []
    for start, end, label in annot["entities"]:
        # char_span returns None when the character offsets cannot be mapped
        # onto tokens; "contract" shrinks the span to the tokens fully inside it.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            # Report which annotation was dropped so it can be fixed at the source.
            print(f"Skipping entity {label!r} [{start}:{end}] -> {text[start:end]!r}")
            skipped_entities += 1
        else:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

db.to_disk("./training_data.spacy")
print(
    f"Saved {len(db)} docs to ./training_data.spacy "
    f"(skipped {skipped_items} empty items, {skipped_entities} unalignable entities)"
)

100%|██████████| 20/20 [00:00<00:00, 1139.79it/s]

Skipping None item





In [9]:
# Generate a fully populated training config (config.cfg) for an English pipeline
# whose only requested component is NER, using the "efficiency" preset.
! python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [10]:
# Train the pipeline described by config.cfg and save the output to the current directory.
# NOTE(review): --paths.dev points at the same file as --paths.train, so the
# ENTS_F / ENTS_P / ENTS_R scores in the log are measured on the training examples
# themselves and do not indicate how well the model generalizes. Consider
# holding out a separate dev set.
! python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     64.02    0.00    0.00    0.00    0.00
 29     200       3322.06   3716.92  100.00  100.00  100.00    1.00
 62     400          2.21      3.07  100.00  100.00  100.00    1.00
 97     600          0.00      0.00  100.00  100.00  100.00    1.00
145     800          0.02      0.01  100.00  100.00  100.00    1.00
212    1000          0.00      0.00  100.00  100.00  100.00    1.00
286    1200          0.00      0.00  100.00  100.00  100.00    1.00
386    1400          0.00      0.00  100.00  100.00  100.00    1.00
486    1600          0.00      0.00  100.00  100.00  100.00    1.00
647    1800          0.00      0.00  100.00  100.00

In [11]:
# Load the best checkpoint produced by training. The path is relative to the
# working directory so that it matches the `--output ./` location used by the
# training command instead of assuming a specific absolute directory.
nlp_ner = spacy.load("./model-best")

In [12]:
# Run the trained NER pipeline on a sample sentence.
sample_text = "Market analysts at MorganEast Research, based in Singapore, forecast that the Indian digital lending sector will surpass ₹1.8 lakh crore in transaction volume by 2027, with Aurora capturing approximately 6.4% market share."
doc = nlp_ner(sample_text)

In [13]:
# Visualize the entities predicted for `doc` with displaCy's "ent" style,
# rendered inline in the notebook output.
spacy.displacy.render(doc, style="ent", jupyter=True)