### **Training of NER Model**

In [1]:
! pip install -U spacy -q

In [2]:
# Print the installed spaCy version, platform, Python version and available pipelines.
!python -m spacy info

[1m

spaCy version    3.8.7                         
Location         /usr/local/lib/python3.12/dist-packages/spacy
Platform         Linux-6.6.105+-x86_64-with-glibc2.35
Python version   3.12.12                       
Pipelines        en_core_web_sm (3.8.0)        



In [4]:
import json

# Load the annotation export. The context manager closes the file handle as soon
# as parsing finishes, and the explicit encoding keeps non-ASCII characters in the
# annotated text from depending on the platform's default locale.
with open('/content/annotations.json', encoding='utf-8') as f:
    TRAIN_DATA = json.load(f)

In [5]:
# Display the loaded annotations: a dict with a 'classes' list and an 'annotations'
# list of [text, {'entities': [[start, end, label], ...]}] pairs (see the output below).
TRAIN_DATA

{'classes': ['ORG',
  'PEOPLE',
  'MONEY',
  'STOCK',
  'PERCENTAGE',
  'DATE',
  'COUNTRY',
  'FINANCIAL ENTITIES',
  'CONTINENT',
  'NATURAL NUMBERS',
  'EVENT',
  'PRODUCT',
  'TIME'],
 'annotations': [['Bitcoin’s price surged by 18% in the past week, breaking the $45,000 mark after news that Tesla Inc. (TSLA) has begun accepting cryptocurrency for vehicle purchases. The announcement sent shockwaves through the market, with other major companies like Square Inc. (SQ) and PayPal (PYPL) also signaling interest in integrating Bitcoin payments. Analysts predict that Bitcoin could rise another 10% by the end of Q4 2025 if institutional adoption continues to increase.\r',
   {'entities': [[0, 9, 'FINANCIAL ENTITIES'],
     [26, 29, 'PERCENTAGE'],
     [61, 68, 'MONEY'],
     [90, 107, 'ORG'],
     [128, 142, 'FINANCIAL ENTITIES'],
     [251, 267, 'ORG'],
     [272, 284, 'ORG'],
     [325, 332, 'FINANCIAL ENTITIES'],
     [392, 395, 'PERCENTAGE'],
     [410, 417, 'DATE']]}],
  ['\r', {'ent

In [6]:
# Convert every annotated example into a spaCy Doc and collect the Docs in `db`.
# `tqdm`, `nlp` and `db` are not defined in this cell and must already exist in the
# kernel (presumably a progress-bar helper, a spaCy pipeline and a DocBin — TODO confirm).
for item in tqdm(TRAIN_DATA['annotations']):
    # Skip rows that are missing or are not a (text, annotation) pair.
    if item is None or len(item) != 2:
        continue
    text, annot = item
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        # char_span yields None when the character offsets cannot be mapped onto
        # a token span; such annotations are reported and left out.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

# Serialize once, after all examples were added, instead of rewriting the whole
# file on every loop iteration.
db.to_disk("./training_data.spacy")

100%|██████████| 59/59 [00:00<00:00, 377.48it/s]


In [7]:
# Generate config.cfg for an English, NER-only pipeline optimized for efficiency;
# --force lets the command replace a config.cfg left over from a previous run.
!python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency --force

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [8]:
# Train the pipeline from config.cfg and save the results into the current directory.
# NOTE(review): --paths.dev points at the same file as --paths.train, so the
# ENTS_F / ENTS_P / ENTS_R scores printed during training are measured on the
# training examples themselves, not on held-out data.
# NOTE(review): a later cell loads "/content/model-best", which assumes the working
# directory is /content — confirm.
!python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     49.18    1.25    1.31    1.19    0.01
  6     200       2293.72   3501.26   69.94   72.15   67.86    0.70
 13     400        993.41    851.09   96.02   96.40   95.63    0.96
 23     600        467.52    339.19   99.40   99.60   99.21    0.99
 37     800        140.69     99.68   99.40   99.60   99.21    0.99
 53    1000         81.72     61.42   99.60   99.60   99.60    1.00
 74    1200         44.06     56.04   99.60   99.60   99.60    1.00
100    1400        148.20     97.41  100.00  100.00  100.00    1.00
133    1600         87.59     59.69  100.00  100.00  100.00    1.00
173    1800         31.30      9.34  100.00  100.00

In [9]:
# Load the trained pipeline from the "model-best" directory.
# NOTE(review): no cell shown before this one imports `spacy`, and `nlp_ner` is not
# referenced by the later cells shown here — confirm both against the full notebook.
nlp_ner = spacy.load("/content/model-best")

### **Docling Installation**

In [10]:
!pip install docling

Collecting docling
  Downloading docling-2.58.0-py3-none-any.whl.metadata (11 kB)
Collecting docling-core<3.0.0,>=2.48.2 (from docling-core[chunking]<3.0.0,>=2.48.2->docling)
  Downloading docling_core-2.49.0-py3-none-any.whl.metadata (6.7 kB)
Collecting docling-parse<5.0.0,>=4.7.0 (from docling)
  Downloading docling_parse-4.7.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (10 kB)
Collecting docling-ibm-models<4,>=3.9.1 (from docling)
  Downloading docling_ibm_models-3.10.2-py3-none-any.whl.metadata (7.3 kB)
Collecting filetype<2.0.0,>=1.2.0 (from docling)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting pypdfium2!=4.30.1,<5.0.0,>=4.30.0 (from docling)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting rapidocr<4.0.0,>=3.3 (from docling)
  Downloading

### **Build the Document-to-Entity Visualization Pipeline**

In [5]:
import os
import spacy
from docling.document_converter import DocumentConverter
from spacy import displacy
from IPython.display import HTML, display

# Directory of the trained pipeline that run_ner() passes to spacy.load().
MODEL_PATH = r"/content/model-best"
# Input document that main() hands to extract_text().
SAMPLE_FILE = r"/content/NER_Synthetic_data.txt"
# File that visualize_entities() writes the rendered HTML page to.
OUTPUT_HTML = "entities_output.html"

def extract_text(source_path):
    """Return the textual content of the document at ``source_path``.

    Files whose extension is ``.txt`` (compared case-insensitively) are read
    directly as UTF-8. Every other file is converted with Docling's
    ``DocumentConverter`` and exported to text.

    Args:
        source_path: Path of the document to read.

    Returns:
        The document's text as a string.
    """
    _, suffix = os.path.splitext(source_path)

    # Anything that is not plain text goes through Docling.
    if suffix.lower() != ".txt":
        print(f"Extracting text using Docling from: {source_path}")
        conversion = DocumentConverter().convert(source_path)
        return conversion.document.export_to_text()

    print("Reading plain text file...")
    with open(source_path, "r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents

def run_ner(text):
    """Run the trained NER pipeline over ``text``.

    The pipeline is loaded from ``MODEL_PATH`` on every call.

    Args:
        text: The raw text to analyse.

    Returns:
        The object produced by calling the loaded pipeline on ``text``.
    """
    print("Loading NER model...")
    pipeline = spacy.load(MODEL_PATH)
    return pipeline(text)

def visualize_entities(doc):
    """Render the entities of ``doc`` as HTML, save the page and show it inline.

    The markup is written to ``OUTPUT_HTML`` (UTF-8) and then displayed in the
    notebook through ``IPython.display``.

    Args:
        doc: The processed document whose entities should be highlighted.
    """
    print("Generating entity visualization...")
    # jupyter=False makes displaCy hand back the markup as a string, which is
    # needed here because the same markup is both saved and displayed.
    markup = displacy.render(doc, style="ent", page=True, jupyter=False)
    with open(OUTPUT_HTML, mode="w", encoding="utf-8") as out_file:
        out_file.write(markup)
    display(HTML(markup))
    print(f"Visualization saved to: {OUTPUT_HTML}")

def main():
    """Run the whole pipeline: extract text, tag entities, visualize them.

    Stops early with a message when the extracted text is empty or only
    whitespace. The visualization step runs whether or not any entities were
    detected.
    """
    text = extract_text(SAMPLE_FILE)
    if text.strip() == "":
        print("No text extracted from the document.")
        return

    doc = run_ner(text)
    entity_count = len(doc.ents)
    if entity_count:
        print(f"Detected {entity_count} entities.")
    else:
        print("No entities detected in the document.")

    visualize_entities(doc)
    print("Pipeline completed successfully.")

# Entry point: run the pipeline when this code executes as the main module
# (the output below shows that this is the case when the cell is run).
if __name__ == "__main__":
    main()


Reading plain text file...
Loading NER model...
Detected 252 entities.
Generating entity visualization...


Visualization saved to: entities_output.html
Pipeline completed successfully.
