In [1]:
# pip install transformers

from transformers import pipeline
import pandas as pd
import re

In [2]:
# sample lookup data with financial corpora
data = {
    'abbreviation': ['Inc.', 'Co.', 'Ltd.', '&'],
    'expansion': ['Incorporated', 'Company', 'Limited', 'and']
}
df = pd.DataFrame(data)
df

Unnamed: 0,abbreviation,expansion
0,Inc.,Incorporated
1,Co.,Company
2,Ltd.,Limited
3,&,and


In [4]:
#s Sample financial text containing several organization names and other financial entities.
financial_text = (
    "Apple Inc. reported a 12% increase in revenue in Q1 2024, beating analyst expectations. "
    "JPMorgan Chase & Co. experienced a decline in net income by 15% due to rising expenses. "
    "The European Central Bank (ECB) announced a new monetary policy from 1 Jan 2024 to stabilize the euro. "
    "Goldman Sachs Ltd. and Morgan Stanley Inc. increase for compliance issues."
)


In [5]:
# Initialise ner pipeline with a huggingface's pre-trained model.
# Remember to use HF_TOKEN to log in
ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")


config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


In [6]:
# Run the NER pipeline on the financial text.
entities = ner_pipeline(financial_text)

In [7]:
for ent in entities:
    print(ent)

{'entity_group': 'ORG', 'score': 0.99953616, 'word': 'Apple Inc', 'start': 0, 'end': 9}
{'entity_group': 'ORG', 'score': 0.99930257, 'word': 'JPMorgan Chase & Co', 'start': 88, 'end': 107}
{'entity_group': 'ORG', 'score': 0.99946886, 'word': 'European Central Bank', 'start': 180, 'end': 201}
{'entity_group': 'ORG', 'score': 0.9991114, 'word': 'ECB', 'start': 203, 'end': 206}
{'entity_group': 'ORG', 'score': 0.99943584, 'word': 'Goldman Sachs Ltd', 'start': 279, 'end': 296}
{'entity_group': 'ORG', 'score': 0.99944955, 'word': 'Morgan Stanley Inc', 'start': 302, 'end': 320}


Standardize Organization Names Using the Data Labeling Table

In [8]:
# Standardiz the organization name by replacing common abbreviations
def standardize_entity(entity_str, labeling_df):

    std_entity = entity_str # initialise with input string

    # Iterate over each row in the labeling table and apply replacements.
    for _, row in labeling_df.iterrows():
        # Use regex to safely replace abbreviations (handle special characters)
        abbr = re.escape(row['abbreviation'])
        expansion = row['expansion']
        std_entity = re.sub(abbr, expansion, std_entity)
    #  normalize whitespace and convert to lower case for uniformity
    std_entity = std_entity.strip()
    std_entity = std_entity.lower()
    return std_entity



In [9]:
# Apply

for ent in entities:
    # Process only entities labeled ORG.
    if ent.get("entity_group") == "ORG":
        original = ent["word"]
        standardized = standardize_entity(original, df)
        print(f"Original: {original} -> Standardized: {standardized}")


Original: Apple Inc -> Standardized: apple inc
Original: JPMorgan Chase & Co -> Standardized: jpmorgan chase and co
Original: European Central Bank -> Standardized: european central bank
Original: ECB -> Standardized: ecb
Original: Goldman Sachs Ltd -> Standardized: goldman sachs ltd
Original: Morgan Stanley Inc -> Standardized: morgan stanley inc
