In [34]:
import pandas as pd
import spacy
from fuzzywuzzy import fuzz, process
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
nlp = spacy.load("en_core_web_sm")

# Load your DataFrame
df = pd.read_csv("articles_coref.csv")


In [31]:
entity_mapping = { #rule based entity mapping for abbreviations
    "AAP": "Aam Aadmi Party",
    "BJP": "Bharatiya Janata Party",
    "INC": "Indian National Congress",
    "Congress": "Indian National Congress",
    "EC": "Election Commission",
    "ECI": "Election Commission of India"
}

In [32]:
def normalize_entities(entities): #normalises entities with certain predifined rules
   
    normalized = []
    for entity in entities:
        entity = entity_mapping.get(entity, entity)  # Replace abbreviations 
        entity = entity.replace(" ji", "").strip()  # Remove 'ji' 
        normalized.append(entity)
    return list(set(normalized))  # Remove duplicates

def extract_entities(text): #extracts entities from Spacy

    doc = nlp(text)
    return list(set(ent.text for ent in doc.ents if ent.label_ in ["ORG", "PERSON", "GPE", "LOC", "FAC", "EVENT"])) # exludes certain numeric labels like DATE, TIME, MONEY, which do not need to be resolved

In [33]:
# Extract the entities 
df["entities"] = df["articles"].apply(extract_entities)

# Normalize them 
df["normalized_entities"] = df["entities"].apply(normalize_entities)

# Get all unique entities 
all_entities = list(set(entity for entities in df["normalized_entities"] for entity in entities))

In [27]:
print(all_entities)

['ADR', 'Bengal', 'Shinde', 'Suleman', 'Nepali', 'minister.jpg', 'Amarjeet Prasad', 'the Bharatiya Janata Party', 'Bangladesh', 'Brahm Prakash', 'Jitender Mahajan', 'Madhur Verma', 'Mangolpuri', 'Monsoon', 'Council of Ministers', 'LJP', 'JLN Marg', 'Sahib Singh Verma', 'Jee', 'Kumar Singh', 'Jalyukt Shivar Abhiyaan', "the International Women's Day", 'the Janata Dal (United', 'Chugh', 'Karawal Nagar Assembly', 'Babarpur', 'Digital India', 'Jairam Ramesh', 'Observer- Kunal Silku', 'R.K Puram', 'Rajesh Gupta', 'the Council’s LoPs', 'Sachdeva', 'the Lok Sabha', 'Haryanvi', 'Kiradi', 'Baijayant Jay Panda', 'Kumar Kochar', 'Ramlila Maidan', 'Lodhi Road', 'Alka Lamba', 'Atal Canteens', 'DMK', 'Connaught Place', 'Mohalla Clinics', 'Dinesh Lal Yadav', 'Okhla', 'East Delhi', 'the Public Welfare Department', 'FST', 'NDA', 'Kalkaji AAP', 'Prema Devi', 'Yamuna Ghat', 'Pankaj Singh', 'the Lodhi Road', 'GK', 'Vasant Kunj', 'Tilak Ram Gupta', 'Mundka', 'Shifa Ur Rehman', 'Email', 'Ashok Agarwal', 'X.\

Used both fuzzy matching and cosine similarity to resolve entities

In [38]:
def fuzzy_match(entities, threshold=80):
  
    matched_entities = {}
    for entity in entities:
        best_match, score = process.extractOne(entity, entities)
        if score >= threshold:
            matched_entities[entity] = best_match
        else:
            matched_entities[entity] = entity
    return matched_entities



In [39]:
fuzz_resolved = fuzzy_match(all_entities)

In [None]:
vectorizer = TfidfVectorizer().fit_transform(list(fuzz_resolved.values()))
similarity_matrix = cosine_similarity(vectorizer)

# using cosine similarity 
threshold = 0.75
entity_clusters = defaultdict(list)
visited = set()

for i, entity in enumerate(fuzz_resolved.values()):
    if entity in visited:
        continue
    cluster = [entity]
    for j, other_entity in enumerate(fuzz_resolved.values()):
        if i != j and similarity_matrix[i, j] > threshold:
            cluster.append(other_entity)
            visited.add(other_entity)
    entity_clusters[entity] = list(set(cluster))
    visited.update(cluster)


In [47]:
entity_resolution_map = {}
for rep_entity, cluster in entity_clusters.items():
    for entity in cluster:
        entity_resolution_map[entity] = rep_entity


In [48]:
def resolve_entities(entities):
    return list(set(entity_resolution_map.get(ent, ent) for ent in entities))


In [49]:
df["resolved_entities"] = df["normalized_entities"].apply(resolve_entities)


In [52]:
print("Resolved Entity Mapping:")
for entity, resolved in entity_resolution_map.items():
    print(f"{entity} → {resolved}")

Resolved Entity Mapping:
ADR → ADR
Bengal Assembly → Bengal
Bengal → Bengal
Shinde → Shinde
Suleman → Suleman
Nepali → Nepali
minister.jpg → minister.jpg
Amarjeet Prasad → Amarjeet Prasad
the Bharatiya Janata Party’s → the Bharatiya Janata Party
the Bharatiya Janata Party → the Bharatiya Janata Party
Bharatiya Janata Party → the Bharatiya Janata Party
the Janata Party → the Bharatiya Janata Party
Bharatiya Janata Party’s → the Bharatiya Janata Party
the Bharatiya Janata Party ’s → the Bharatiya Janata Party
Bharatiya Janata Party’s Parvesh → the Bharatiya Janata Party
Bangladesh → Bangladesh
Brahm Prakash → Brahm Prakash
MLA Jitender Mahajan → Jitender Mahajan
Jitender Mahajan → Jitender Mahajan
Madhur Verma → Madhur Verma
Mangolpuri → Mangolpuri
Monsoon → Monsoon
the Council of Ministers → Council of Ministers
Council of Ministers → Council of Ministers
LJP → LJP
JLN Marg → JLN Marg
Sahib Singh Verma → Sahib Singh Verma
Singh Verma → Verma
Jee → Jee
Kumar → Kumar Singh
Kumar Singh → K

In [53]:
df.to_csv("resolved_entities.csv", index=False)