<a href="https://colab.research.google.com/github/amalsalilan/B3-Developing-Named-Entity-Recognition-NER-Models-for-Financial-Data-Extraction-/blob/Naveen/financial_ner_training_docling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install -U spacy -q

In [2]:
# Print spaCy's environment report (version, install location, platform,
# installed pipelines) so the run is documented alongside its outputs.
!python -m spacy info

[1m

spaCy version    3.8.7                         
Location         /usr/local/lib/python3.12/dist-packages/spacy
Platform         Linux-6.6.105+-x86_64-with-glibc2.35
Python version   3.12.12                       
Pipelines        en_core_web_sm (3.8.0)        



In [3]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

# Blank English pipeline; used further down via nlp.make_doc() to tokenize the
# annotated texts when building the training corpus.
nlp = spacy.blank("en") # load a new spacy model
# Container that collects the annotated docs before they are written to disk.
db = DocBin() # create a DocBin object

In [5]:
import json

# Load both annotation exports.
# The encoding is given explicitly so the files decode the same way on every
# platform instead of depending on the locale default (the annotated texts
# contain non-ASCII characters such as currency symbols).
with open('/content/Financial Insignts annotations.json', encoding='utf-8') as f1:
    TRAIN_DATA1 = json.load(f1)

with open('/content/annotations.json', encoding='utf-8') as f2:
    TRAIN_DATA2 = json.load(f2)

# Combine the annotation data.
# Assumes each file is a dict with an 'annotations' list and an optional
# 'classes' list — adjust if the actual JSON structure differs.
if 'annotations' in TRAIN_DATA1 and 'annotations' in TRAIN_DATA2:
    # Union of the label sets, sorted so the order is reproducible between runs
    # (iteration order of a set of strings is not stable across interpreter
    # sessions). key=str keeps this safe if labels are not all strings.
    merged_classes = sorted(
        set(TRAIN_DATA1.get('classes', [])) | set(TRAIN_DATA2.get('classes', [])),
        key=str,
    )
    TRAIN_DATA = {
        'classes': merged_classes,
        'annotations': TRAIN_DATA1['annotations'] + TRAIN_DATA2['annotations']
    }
elif 'annotations' in TRAIN_DATA1:
    # Only the first file has usable annotations.
    TRAIN_DATA = TRAIN_DATA1
elif 'annotations' in TRAIN_DATA2:
    # Only the second file has usable annotations.
    TRAIN_DATA = TRAIN_DATA2
else:
    print("Warning: Could not find 'annotations' key in either JSON file. TRAIN_DATA is empty.")
    TRAIN_DATA = {'classes': [], 'annotations': []}

In [15]:
from google.colab import files

def upload_document():
  """Prompt for a file upload and return the name of the first uploaded file.

  Returns:
    The first key of the mapping returned by ``files.upload()``, or None when
    nothing was uploaded. Any additional uploaded files are ignored.
  """
  received = files.upload()
  names = list(received.keys())
  if not names:
    return None
  first_name = names[0]
  print(f'User uploaded file "{first_name}"')
  return first_name

In [16]:
from docling.document_converter import DocumentConverter
import os

def extract_text_from_document(filename):
  """Convert an uploaded document to Markdown text with docling.

  Args:
    filename: Name of a file located under ``/content/``.

  Returns:
    The document exported as Markdown, or None if conversion raised (the
    error is printed rather than propagated).
  """
  converter = DocumentConverter()
  try:
    conversion = converter.convert(f"/content/{filename}")
    markdown = conversion.document.export_to_markdown()
  except Exception as e:
    print(f"Error converting document: {e}")
    markdown = None
  return markdown

In [17]:
import re

def clean_text(text):
  """Flatten extracted text onto a single line.

  Blank-line gaps are replaced by a space first, then every remaining run of
  whitespace is collapsed to one space and the ends are trimmed.

  Args:
    text: Raw text, or None.

  Returns:
    The normalized string, or None when ``text`` is None.
  """
  if text is not None:
    without_blank_lines = re.sub(r'\n\s*\n', ' ', text)
    return re.sub(r'\s+', ' ', without_blank_lines).strip()
  return None

In [23]:
import spacy

# Custom pipeline: expected at /content/model-best. A load failure is reported
# and leaves nlp_custom as None so callers can fall back.
try:
    nlp_custom = spacy.load("/content/model-best")
except Exception as load_error:
    print(f"Error loading custom model: {load_error}")
    nlp_custom = None
else:
    print("Custom model loaded successfully.")

# Fallback pipeline (en_core_web_sm): reuse the one already bound in the
# kernel's globals, if any, instead of loading it again.
try:
    fallback_ready = 'nlp_fallback' in globals() and isinstance(nlp_fallback, spacy.Language)
    if not fallback_ready:
        nlp_fallback = spacy.load("en_core_web_sm")
        print("Fallback model loaded successfully.")
except Exception as load_error:
    print(f"Error loading fallback model: {load_error}")
    nlp_fallback = None

def extract_entities(text, custom_model, fallback_model):
  """Extract entities from ``text`` with a custom model and a fallback model.

  Every entity found by ``custom_model`` is kept. An entity from
  ``fallback_model`` is added only when its character range does not overlap
  any custom entity, so the custom model wins on partial overlaps as well as
  on identical spans.

  Args:
    text: The text to analyse, or None.
    custom_model: Callable returning an object with ``.ents`` (each entity
      exposing ``text``, ``start_char``, ``end_char``, ``label_``), or None.
    fallback_model: Same contract as ``custom_model``, or None.

  Returns:
    None when ``text`` is None; otherwise a list of dicts with the keys
    "text", "start", "end" and "label", ordered by position in the text.
  """
  if text is None:
    return None

  merged = []
  if custom_model is not None:
    for ent in custom_model(text).ents:
      merged.append({"text": ent.text, "start": ent.start_char, "end": ent.end_char, "label": ent.label_})

  if fallback_model is not None:
    # Only custom spans block a fallback entity, so snapshot them before the
    # fallback entities are appended to the same list.
    custom_ranges = [(item["start"], item["end"]) for item in merged]
    for ent in fallback_model(text).ents:
      overlaps_custom = any(
          ent.start_char < taken_end and ent.end_char > taken_start
          for taken_start, taken_end in custom_ranges
      )
      if not overlaps_custom:
        merged.append({"text": ent.text, "start": ent.start_char, "end": ent.end_char, "label": ent.label_})

  # Document order keeps the tabular view readable and gives the renderer
  # non-decreasing offsets.
  merged.sort(key=lambda item: (item["start"], item["end"]))
  return merged

Error loading custom model: [E050] Can't find model '/content/model-best'. It doesn't seem to be a Python package or a valid path to a data directory.


In [21]:
import spacy

def visualize_entities(text, entities):
    """Highlight ``entities`` inside ``text`` using displaCy's manual mode.

    Args:
        text: The string the entity offsets refer to.
        entities: List of dicts carrying "start", "end" and "label".

    Returns:
        None. When either argument is None a notice is printed and nothing is
        rendered.
    """
    if text is not None and entities is not None:
        # Manual rendering takes plain dicts: the text, its entity offsets and
        # an optional title shown above the rendering.
        payload = {
            "text": text,
            "ents": entities,
            "title": "Entity Visualization",
        }
        spacy.displacy.render([payload], style="ent", jupyter=True, manual=True)
    else:
        print("No text or entities to visualize.")

In [34]:
# End-to-end workflow: upload -> extract -> clean -> tag -> visualize
def process_document():
  """Run the whole pipeline on one uploaded document.

  Each stage prints a progress line. The first stage that yields nothing
  (no upload, no text, empty cleaned text, no entities) prints why and stops.

  Returns:
    The list of extracted entities, or None if any stage produced nothing.
  """
  print("Please upload your document.")
  uploaded_filename = upload_document()
  if not uploaded_filename:
    print("No document uploaded.")
    return None

  print(f"Processing document: {uploaded_filename}")
  raw_text = extract_text_from_document(uploaded_filename)
  if not raw_text:
    print("Text extraction failed.")
    return None

  print("Text extracted successfully. Cleaning text...")
  tidy_text = clean_text(raw_text)
  if not tidy_text:
    print("Text cleaning failed.")
    return None

  print("Text cleaned. Extracting entities...")
  # nlp_custom and nlp_fallback are read from the notebook's global namespace
  found = extract_entities(tidy_text, nlp_custom, nlp_fallback)
  if not found:
    print("No entities found in the document.")
    return None

  print("Entities extracted. Visualizing entities...")
  visualize_entities(tidy_text, found)
  return found

# You can call this function to start the process
# process_document()

In [35]:
import pandas as pd

# Run the full workflow once and show the result as a table
entities = process_document()

if not entities:
    print("No entities were extracted or an error occurred during processing.")
else:
    # One row per entity dict returned by process_document()
    entities_df = pd.DataFrame(entities)
    display(entities_df)

Please upload your document.


Saving 🧾 Financial Document 2.docx to 🧾 Financial Document 2 (3).docx
User uploaded file "🧾 Financial Document 2 (3).docx"
Processing document: 🧾 Financial Document 2 (3).docx
Text extracted successfully. Cleaning text...
Text cleaned. Extracting entities...
Entities extracted. Visualizing entities...


Unnamed: 0,text,start,end,label
0,"February 3, 2025",77,93,DATE
1,1,96,97,CARDINAL
2,Xenon Aerospace Ltd.,99,119,ORG
3,XENA,121,125,ORG
4,quarterly,149,158,DATE
5,$5.9 billion,174,186,MONEY
6,$780 million,281,293,MONEY
7,$620 million,308,320,MONEY
8,last year,323,332,DATE
9,1.95,358,362,MONEY


In [13]:
def visualize_entities(text, entities):
    """Highlight ``entities`` inside ``text`` using displaCy's manual mode.

    Args:
        text: The string the entity offsets refer to.
        entities: Iterable of dicts, each with a "label" and character offsets
            under "start"/"end" (the legacy "start_char"/"end_char" spelling
            is accepted too).

    Returns:
        None. When either argument is None a notice is printed and nothing is
        rendered.
    """
    if text is None or entities is None:
        print("No text or entities to visualize.")
        return

    # The renderer is given plain dicts, which requires manual=True, and in
    # that mode the offsets are read from "start"/"end". Normalize the keys so
    # either spelling of the offsets works, and order the spans by position.
    ents = sorted(
        (
            {
                "start": ent["start"] if "start" in ent else ent["start_char"],
                "end": ent["end"] if "end" in ent else ent["end_char"],
                "label": ent["label"],
            }
            for ent in entities
        ),
        key=lambda ent: (ent["start"], ent["end"]),
    )
    data = {
        "text": text,
        "ents": ents,
    }
    spacy.displacy.render(data, style="ent", jupyter=True, manual=True)

In [9]:
# NOTE(review): this cell repeats model-loading code that also appears earlier
# in the notebook. It has no `import spacy` of its own and relies on the name
# already being bound by a previous cell.
# Load the custom trained spaCy model
try:
    nlp_custom = spacy.load("/content/model-best")
    print("Custom model loaded successfully.")
except Exception as e:
    # Any load failure is reported and leaves nlp_custom as None.
    print(f"Error loading custom model: {e}")
    nlp_custom = None

# Load a fallback spaCy model (e.g., en_core_web_sm)
try:
    # Check if the fallback model is already loaded to avoid reloading
    # (nothing is printed when an existing spacy.Language instance is reused).
    if 'nlp_fallback' not in globals() or not isinstance(nlp_fallback, spacy.Language):
        nlp_fallback = spacy.load("en_core_web_sm")
        print("Fallback model loaded successfully.")
except Exception as e:
    # Any failure is reported and leaves nlp_fallback as None.
    print(f"Error loading fallback model: {e}")
    nlp_fallback = None

def extract_entities(text, custom_model, fallback_model):
  """Extract entities from ``text`` with a custom model and a fallback model.

  Every entity found by ``custom_model`` is kept. An entity from
  ``fallback_model`` is added only when its character range does not overlap
  any custom entity, so the custom model wins on partial overlaps as well as
  on identical spans.

  Args:
    text: The text to analyse, or None.
    custom_model: Callable returning an object with ``.ents`` (each entity
      exposing ``text``, ``start_char``, ``end_char``, ``label_``), or None.
    fallback_model: Same contract as ``custom_model``, or None.

  Returns:
    None when ``text`` is None; otherwise a list of dicts ordered by position
    in the text. Each dict has "text", "label" and the character offsets under
    "start"/"end" (the keys displaCy's manual mode reads). The offsets are
    repeated under "start_char"/"end_char" so code written against the earlier
    key names keeps working.
  """
  if text is None:
    return None

  def _as_record(ent):
    # One plain-dict record per entity, carrying both spellings of the offsets.
    return {
        "text": ent.text,
        "start": ent.start_char,
        "end": ent.end_char,
        "start_char": ent.start_char,
        "end_char": ent.end_char,
        "label": ent.label_,
    }

  merged = []
  if custom_model is not None:
    merged.extend(_as_record(ent) for ent in custom_model(text).ents)

  if fallback_model is not None:
    # Only custom spans block a fallback entity, so snapshot them before the
    # fallback entities are appended to the same list.
    custom_ranges = [(item["start"], item["end"]) for item in merged]
    for ent in fallback_model(text).ents:
      overlaps_custom = any(
          ent.start_char < taken_end and ent.end_char > taken_start
          for taken_start, taken_end in custom_ranges
      )
      if not overlaps_custom:
        merged.append(_as_record(ent))

  # Document order keeps the tabular view readable and gives the renderer
  # non-decreasing offsets.
  merged.sort(key=lambda item: (item["start"], item["end"]))
  return merged

Error loading custom model: [E050] Can't find model '/content/model-best'. It doesn't seem to be a Python package or a valid path to a data directory.
Fallback model loaded successfully.


In [8]:
import re

def clean_text(text):
  """Normalize whitespace in extracted text.

  NOTE(review): a function with this name is also defined earlier in the
  notebook; whichever cell runs last is the one in effect.

  Args:
    text: Raw text, or None.

  Returns:
    None when ``text`` is None; otherwise the text with blank-line gaps and
    all other whitespace runs reduced to single spaces, trimmed at both ends.
  """
  if text is None:
    return None
  result = text
  # First pass replaces blank-line gaps, second pass collapses everything else
  for pattern in (r'\n\s*\n', r'\s+'):
    result = re.sub(pattern, ' ', result)
  return result.strip()

In [7]:
from docling.document_converter import DocumentConverter
import os

def extract_text_from_document(filename):
  """Convert an uploaded document to Markdown text with docling.

  NOTE(review): a function with this name is also defined earlier in the
  notebook; whichever cell runs last is the one in effect.

  Args:
    filename: Name of a file located under ``/content/``.

  Returns:
    The Markdown export of the converted document, or None if conversion
    raised (the error is printed, not propagated).
  """
  converter = DocumentConverter()
  try:
    source_path = f"/content/{filename}"
    return converter.convert(source_path).document.export_to_markdown()
  except Exception as e:
    print(f"Error converting document: {e}")
    return None

In [6]:
from google.colab import files

def upload_document():
  """Prompt for a file upload and return the name of the first uploaded file.

  NOTE(review): a function with this name is also defined earlier in the
  notebook; whichever cell runs last is the one in effect.

  Returns:
    The first key of the mapping returned by ``files.upload()``, or None when
    nothing was uploaded.
  """
  uploaded = files.upload()
  try:
    first = next(iter(uploaded.keys()))
  except StopIteration:
    # Empty mapping: the user uploaded nothing
    return None
  print(f'User uploaded file "{first}"')
  return first

In [None]:
# Echo the combined annotation structure for inspection.
# NOTE(review): this displays the entire object; a slice or a count would keep
# the saved notebook smaller.
TRAIN_DATA

In [30]:
# Start from an empty DocBin in this cell so that re-running it does not append
# the same documents a second time to a container created in an earlier cell.
db = DocBin()

for item in tqdm(TRAIN_DATA['annotations']):
    if item is None:
        # The annotation list can contain empty placeholders; skip them.
        print("Skipping None item")
        continue
    text, annot = item
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        # "contract" snaps misaligned offsets inward to token boundaries;
        # char_span returns None when no valid span can be formed.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    # Assigning overlapping spans to doc.ents raises an error, so drop overlaps
    # first; filter_spans prefers the longest span among overlapping ones.
    doc.ents = spacy.util.filter_spans(ents)
    db.add(doc)

# Serialize the corpus in spaCy's binary training format.
db.to_disk("./training_data.spacy")

100%|██████████| 54/54 [00:00<00:00, 1335.22it/s]

Skipping None item





In [28]:
# Generate config.cfg for an English pipeline with a single `ner` component,
# using the "efficiency" preset.
! python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [31]:
# Train with config.cfg and write the outputs to the current directory.
# NOTE(review): --paths.dev points at the same file as --paths.train, so the
# ENTS_F / ENTS_P / ENTS_R columns are measured on the training data itself and
# say nothing about performance on unseen documents.
! python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     53.72    0.00    0.00    0.00    0.00
  9     200       1261.38   3927.49   78.28   79.00   77.58    0.78
 21     400        159.84    581.71   98.88   98.66   99.10    0.99
 36     600        111.50    118.72   99.78  100.00   99.55    1.00
 55     800         71.87     52.40  100.00  100.00  100.00    1.00
 80    1000         40.04     16.01  100.00  100.00  100.00    1.00
110    1200         42.33     19.55  100.00  100.00  100.00    1.00
148    1400        457.83     45.48  100.00  100.00  100.00    1.00
194    1600        502.80    133.83  100.00  100.00  100.00    1.00
251    1800        179.54     27.41  100.00  100.00

In [36]:
# Load the pipeline stored at /content/model-best.
# presumably this is the best checkpoint written by the training run above when
# the working directory is /content — verify.
nlp_ner = spacy.load("/content/model-best")

In [37]:
# Run the loaded pipeline on a short sample sentence.
doc = nlp_ner("Market analysts at MorganEast Research, based in Singapore, forecast that the Indian digital lending sector will surpass ₹1.8 lakh crore in transaction volume by 2027, with Aurora capturing approximately 6.4% market share.")

In [38]:
# Run the loaded pipeline on a longer, multi-sentence sample.
doc1 = nlp_ner("During the annual financial audit of Riverstone Textiles Ltd conducted on August 12, 2025, auditors discovered discrepancies in the petty cash account managed by Accountant Deepak Sharma. An amount of ₹48,500 was unaccounted for between May and June 2025, allegedly due to delayed supplier reimbursements. The board requested an internal review by Omkar Consultants, who later confirmed the missing funds were reimbursed on September 3, 2025. The company’s total revenue for FY2025 was reported at ₹78.4 crore, reflecting a 5% year-over-year increase.")

In [39]:
# Highlight the entities predicted for the short sample (`doc`) inline.
spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter

In [40]:
# Highlight the entities predicted for the longer sample (`doc1`) inline.
spacy.displacy.render(doc1, style="ent", jupyter=True) # display in Jupyter