In [5]:
# Standard library
import random
import re

# Third-party
import spacy
import en_core_web_md
from spacy.language import Language
from spacy.training import Example
from spacy.util import minibatch, compounding

# Load the medium English pipeline a single time, via its installed package.
# (Loading it twice only costs time and memory; the first result was
# overwritten immediately.)
nlp = en_core_web_md.load()

In [15]:
# Smoke test: run the loaded pipeline over a short sample string and show the
# coarse part-of-speech tag assigned to every token.
doc = nlp("Hello World, Apple INC, - - 1234")
tagged_tokens = [(token.text, token.pos_) for token in doc]
print(tagged_tokens)

[('Hello', 'INTJ'), ('World', 'PROPN'), (',', 'PUNCT'), ('Apple', 'PROPN'), ('INC', 'PROPN'), (',', 'PUNCT'), ('-', 'PUNCT'), ('-', 'PUNCT'), ('1234', 'NUM')]


In [6]:
import spacy
from spacy.training import Example
from spacy.util import minibatch, compounding
import random
from spacy.language import Language

# Start from the pretrained English pipeline, then swap its entity recognizer
# for a freshly created one that will be trained below.
nlp = spacy.load("en_core_web_md")
if "ner" in nlp.pipe_names:
    nlp.remove_pipe("ner")
ner = nlp.add_pipe("ner")

# Register every label the new recognizer may predict.
# NOTE(review): the pretrained "ner" is removed above and TRAIN_DATA only
# contains "TransactionType" spans, so the standard labels are registered
# without any training examples -- confirm this is intended.
custom_labels = ["TransactionType"]
standard_labels = ["PERSON", "ORG", "GPE", "MONEY", "CARDINAL"]
for label in [*custom_labels, *standard_labels]:
    ner.add_label(label)


def _transaction_example(text, start=0):
    """Build one training pair whose entity runs from ``start`` to the end of ``text``.

    Returns a ``(text, annotations)`` tuple in the dict format accepted by
    ``Example.from_dict``, with a single "TransactionType" span.
    """
    return (text, {"entities": [(start, len(text), "TransactionType")]})


# Training data: one short transaction-type code per example. In "BHIM UPI"
# only the trailing "UPI" is annotated.
TRAIN_DATA = [
    _transaction_example("NEFT"),
    _transaction_example("RTGS"),
    _transaction_example("ACH/JAL"),
    _transaction_example("NTS"),
    _transaction_example("IMPS"),
    _transaction_example("UPI"),
    _transaction_example("NACH"),
    _transaction_example("FT"),
    _transaction_example("BHIM UPI", start=5),
    _transaction_example("DD"),
    _transaction_example("ECS"),
    _transaction_example("AEPS"),
    _transaction_example("NEFTIN"),
    _transaction_example("RTGSIN"),
    _transaction_example("SWIFT"),
    _transaction_example("POS"),
]

@Language.component("custom_filter")
def custom_filter(doc):
    """Drop implausible entity predictions from ``doc.ents``.

    An entity is kept when either of the following holds:

    * its label is one of the standard labels (ORG, PERSON, GPE, MONEY,
      CARDINAL), or
    * its text is upper-case and between 2 and 7 characters long.

    The upper bound is 7 rather than 6 so that the trained transaction type
    "ACH/JAL" (7 characters) is not discarded by this filter.

    Args:
        doc: The spaCy ``Doc`` produced by the preceding pipeline components.

    Returns:
        The same ``Doc`` with ``doc.ents`` replaced by the retained entities.
    """
    passthrough_labels = {"ORG", "PERSON", "GPE", "MONEY", "CARDINAL"}
    min_len, max_len = 2, 7

    kept = []
    for ent in doc.ents:
        looks_like_code = ent.text.isupper() and min_len <= len(ent.text) <= max_len
        if looks_like_code or ent.label_ in passthrough_labels:
            kept.append(ent)
    doc.ents = kept
    return doc

# Run the filter directly after the recognizer so it post-processes its output.
nlp.add_pipe("custom_filter", after="ner")

# Training configuration.
n_iter = 100
batch_size = 4      # starting size of the compounding batch schedule
dropout = 0.5
seed = 42

# Seed the random number generators so shuffling and weight initialisation are
# repeatable across "Restart & Run All".
spacy.util.fix_random_seed(seed)

# Train only the new recognizer; every other component is disabled for the
# duration of the block and restored afterwards.
with nlp.select_pipes(enable="ner"):
    # `initialize` is the spaCy v3 replacement for the deprecated
    # `begin_training`; it returns the optimizer used for the updates below.
    optimizer = nlp.initialize()
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        batches = minibatch(
            TRAIN_DATA, size=compounding(float(batch_size), 32.0, 1.001)
        )

        for batch in batches:
            examples = [
                Example.from_dict(nlp.make_doc(text), annotations)
                for text, annotations in batch
            ]
            nlp.update(examples, sgd=optimizer, drop=dropout, losses=losses)
        # Report the accumulated loss every 25th iteration.
        if itn % 25 == 0:
            print(f"Iteration {itn}, Losses: {losses}")

# Save the model
nlp.to_disk("./improved_ner_model")

Iteration 0, Losses: {'ner': np.float32(17.992395)}
Iteration 25, Losses: {'ner': np.float32(3.0380247)}
Iteration 50, Losses: {'ner': np.float32(0.002953638)}
Iteration 75, Losses: {'ner': np.float32(1.3937567e-05)}


In [7]:
# Reload the pipeline from the directory written by the training cell.
# NOTE: that pipeline includes the "custom_filter" component, so the cell that
# registers it needs to have run in this kernel before loading.
nlp = spacy.load("./improved_ner_model")

# Sample bank-narration strings used to eyeball the recognizer's predictions.
test_texts = [
    "RTGS transfer from SUDHAKARA INFRATECH PRIVATE LIMITED",
    "NEFT payment to INFRATECH",
    "UPI transaction with PRIVATE",
    "NEFT Cr-SBIN-SBIN0020828-SUDHAKARA INFRATECHPVT LTD",
]

for text in test_texts:
    doc = nlp(text)
    found_entities = [(ent.text, ent.label_) for ent in doc.ents]
    print("\nText:", text)
    print("Entities:", found_entities)


Text: RTGS transfer from SUDHAKARA INFRATECH PRIVATE LIMITED
Entities: [('RTGS', 'TransactionType')]

Text: NEFT payment to INFRATECH
Entities: [('NEFT', 'TransactionType')]

Text: UPI transaction with PRIVATE
Entities: [('UPI', 'TransactionType')]

Text: NEFT Cr-SBIN-SBIN0020828-SUDHAKARA INFRATECHPVT LTD
Entities: [('NEFT', 'TransactionType'), ('SBIN', 'TransactionType'), ('LTD', 'TransactionType')]
