In [1]:
# pip install transformers

from transformers import pipeline
import pandas as pd
import re

In [None]:
# Sample lookup table: each abbreviation is paired, by position, with its expansion.
data = dict(
    abbreviation=['Inc.', 'Co.', 'Ltd.', '&'],
    expansion=['Incorporated', 'Company', 'Limited', 'and'],
)
df = pd.DataFrame(data)
df

In [4]:
# Sample financial text containing several organization names and other financial entities.
# The sentences are joined with a single space to form one passage.
financial_text = " ".join([
    "Apple Inc. reported a 12% increase in revenue in Q1 2024, beating analyst expectations.",
    "JPMorgan Chase & Co. experienced a decline in net income by 15% due to rising expenses.",
    "The European Central Bank (ECB) announced a new monetary policy from 1 Jan 2024 to stabilize the euro.",
    "Goldman Sachs Ltd. and Morgan Stanley Inc. increase for compliance issues.",
])


In [None]:
# Build the NER pipeline from a pre-trained Hugging Face model.
# Remember to log in with HF_TOKEN first.
ner_pipeline = pipeline(
    task="ner",
    model="dslim/bert-base-NER",
    aggregation_strategy="simple",
)


In [6]:
# Run the NER pipeline on the financial text.
# The result is iterated in later cells, where each item is read with
# "entity_group" and "word" keys — presumably a list of dicts produced by the
# aggregated token-classification pipeline; confirm against the transformers docs.
entities = ner_pipeline(financial_text)

In [None]:
# Print every detected entity record, one per line, for inspection.
for ent in entities:
    print(ent)

Standardize Organization Names Using the Data Labeling Table

In [8]:
# Standardize the organization name by replacing common abbreviations
def standardize_entity(entity_str, labeling_df):
    """Expand known abbreviations in an entity name and lower-case the result.

    Every row of ``labeling_df`` supplies an ``'abbreviation'`` to search for
    and the ``'expansion'`` that replaces it. Rows are applied in table order,
    each one operating on the output of the previous row. Matching is
    case-sensitive and literal (no regex syntax in the table is interpreted).

    An abbreviation that starts or ends with a word character only matches
    when that edge is not glued to another word character, so ``'Co.'``
    expands in ``"Chase & Co."`` but not inside ``"Costco."``. Abbreviations
    made purely of punctuation (e.g. ``'&'``) are replaced wherever they occur.

    Args:
        entity_str: Raw entity text, e.g. ``"Apple Inc."``.
        labeling_df: DataFrame with string columns ``'abbreviation'`` and
            ``'expansion'``.

    Returns:
        The expanded name with leading/trailing whitespace removed, in lower
        case.
    """
    std_entity = entity_str  # start from the unmodified input

    pairs = zip(labeling_df['abbreviation'], labeling_df['expansion'])
    for abbreviation, expansion in pairs:
        if not abbreviation:
            # An empty pattern matches between every character and would
            # scatter the expansion through the whole string.
            continue

        # Escape so characters such as '.' and '&' are matched literally.
        pattern = re.escape(abbreviation)
        # Guard the edges that are word characters so the abbreviation is not
        # matched inside a longer word.
        if re.match(r'\w', abbreviation):
            pattern = r'(?<!\w)' + pattern
        if re.search(r'\w\Z', abbreviation):
            pattern = pattern + r'(?!\w)'

        # A callable replacement inserts the expansion verbatim; a plain string
        # would have backslashes interpreted as escapes / group references.
        std_entity = re.sub(pattern, lambda _match: expansion, std_entity)

    # Trim surrounding whitespace and convert to lower case for uniformity.
    return std_entity.strip().lower()



In [None]:
# Standardize and report each organization entity returned by the NER pipeline.
for ent in entities:
    # Anything that is not labeled ORG is ignored.
    if ent.get("entity_group") != "ORG":
        continue
    original = ent["word"]
    standardized = standardize_entity(original, df)
    print(f"Original: {original} -> Standardized: {standardized}")
