<a href="https://colab.research.google.com/github/amalsalilan/B3-Developing-Named-Entity-Recognition-NER-Models-for-Financial-Data-Extraction-/blob/rishwanth/ner_model_data_extraction_rishi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
! pip install -U spacy -q

In [15]:
# Print the installed spaCy version, install location, platform, Python
# version and the pipelines available in this environment.
!python -m spacy info

[1m

spaCy version    3.8.7                         
Location         /usr/local/lib/python3.12/dist-packages/spacy
Platform         Linux-6.6.105+-x86_64-with-glibc2.35
Python version   3.12.12                       
Pipelines        en_core_web_sm (3.8.0)        



In [16]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

# `nlp` is used later in this notebook to tokenize the annotated texts
# (nlp.make_doc); `db` collects the resulting Doc objects for saving to disk.
nlp = spacy.blank("en") # blank English pipeline (no trained components)
db = DocBin() # container that Doc objects are added to and then serialized

In [17]:
import json

# Read the annotation file into TRAIN_DATA.
# - The context manager closes the file handle as soon as parsing is done.
# - The encoding is explicit because the texts contain non-ASCII characters
#   (e.g. the rupee sign), which would otherwise be decoded with the
#   platform's default encoding.
with open('annotations_finance_india_v2.json', encoding='utf-8') as f:
    TRAIN_DATA = json.load(f)

In [18]:
# Display the loaded annotations. As the output below shows, this is a dict
# with a 'classes' list of label names and an 'annotations' list of
# [text, {'entities': [[start, end, label], ...]}] pairs.
TRAIN_DATA

{'classes': ['PERSON',
  'ORGANISATION',
  'DATE',
  'LOCATION',
  'MONEY',
  'DESIGNATION'],
 'annotations': [['Company: AsterFin Capital Ltd.',
   {'entities': [[9, 30, 'ORGANISATION']]}],
  ['Financial Year: FY 2025-26', {'entities': [[16, 26, 'DATE']]}],
  ['Report Date: 05 April 2025', {'entities': [[13, 26, 'DATE']]}],
  ['Location: Mumbai, Maharashtra, India',
   {'entities': [[10, 36, 'LOCATION']]}],
  ['AsterFin Capital Ltd. focuses on retail lending, SME credit, and wealth management across India.',
   {'entities': [[0, 21, 'ORGANISATION'], [90, 95, 'LOCATION']]}],
  ['In Q4 FY 2024-25, the company opened branches in Kochi, Coimbatore, and Visakhapatnam.',
   {'entities': [[3, 16, 'DATE'],
     [49, 54, 'LOCATION'],
     [56, 66, 'LOCATION'],
     [72, 85, 'LOCATION']]}],
  ['Total Revenue: ₹1,248,300,000 (₹124.83 Cr)',
   {'entities': [[15, 29, 'MONEY']]}],
  ['Interest Income: ₹820,000,000', {'entities': [[17, 29, 'MONEY']]}],
  ['Fee & Commission: ₹220,500,000', {'entities

In [19]:
from tqdm import tqdm

# Convert the character-offset annotations in TRAIN_DATA into spaCy Doc
# objects and serialize them to ./training_data.spacy.
#
# The DocBin is recreated here so the cell is idempotent: appending to a
# DocBin created in an earlier cell would duplicate every document each time
# this cell is re-run.
db = DocBin()

for item in tqdm(TRAIN_DATA['annotations']):
    # Every annotation must be a (text, annotation-dict) pair; anything else
    # (including None) is reported and skipped.
    if not isinstance(item, (list, tuple)) or len(item) != 2:
        print("Skipping invalid item:", item)
        continue

    text, annot = item
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot.get("entities", []):
        # alignment_mode="contract" snaps the character offsets inward to the
        # nearest token boundaries; char_span returns None when no token fits.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity", (start, end, label), "in", repr(text))
        else:
            ents.append(span)

    # Assigning overlapping spans to doc.ents raises an error, so keep only a
    # non-overlapping subset and report when something had to be dropped.
    kept = spacy.util.filter_spans(ents)
    if len(kept) != len(ents):
        print("Dropped", len(ents) - len(kept), "overlapping entities in", repr(text))
    doc.ents = kept
    db.add(doc)

db.to_disk("./training_data.spacy")

100%|██████████| 42/42 [00:00<00:00, 1858.04it/s]


In [20]:
! python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency


[38;5;1m✘ The provided output file already exists. To force overwriting the
config file, set the --force or -F flag.[0m



In [21]:
# Train the pipeline described in config.cfg, saving results to the current
# directory.
# NOTE(review): --paths.dev points at the same file as --paths.train, so the
# ENTS_F / ENTS_P / ENTS_R columns are computed on the training data itself
# and do not measure generalization — consider a separate held-out dev set.
! python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     59.65    0.00    0.00    0.00    0.00
 50     200         58.37   2121.48  100.00  100.00  100.00    1.00
113     400          0.00      0.00  100.00  100.00  100.00    1.00
180     600          0.00      0.00  100.00  100.00  100.00    1.00
279     800          0.00      0.00  100.00  100.00  100.00    1.00
379    1000          0.00      0.00  100.00  100.00  100.00    1.00
483    1200          0.00      0.00  100.00  100.00  100.00    1.00
683    1400          0.00      0.00  100.00  100.00  100.00    1.00
883    1600          0.00      0.00  100.00  100.00  100.00    1.00
1083    1800          0.00      0.00  100.00  100.0

In [22]:
# Load the best checkpoint produced by training. Training writes to `./`, so
# the model is loaded relative to the working directory rather than through a
# hard-coded absolute path that only exists on Colab.
nlp_ner = spacy.load("./model-best")

In [23]:
# Run the trained NER pipeline on a sample sentence. The adjacent string
# literals are concatenated by Python into a single input text.
doc = nlp_ner(
    "Market analysts at MorganEast Research, based in Singapore, "
    "forecast that the Indian digital lending sector will surpass "
    "₹1.8 lakh crore in transaction volume by 2027, "
    "with Aurora capturing approximately 6.4% market share."
)

In [24]:
# Import displacy explicitly so the call does not depend on the submodule
# already being available as an attribute of the `spacy` package.
from spacy import displacy

# Highlight the predicted entities inline in the notebook output.
displacy.render(doc, style="ent", jupyter=True)