In [None]:
# 📘 Named Entity Recognition (NER) from News Articles
# Task 4 - NER with SpaCy

# ================================
# 📦 Setup
# ================================
!pip install spacy pandas

import pandas as pd
import spacy
from spacy.matcher import Matcher
from spacy import displacy

# Load SpaCy models
def _load_spacy_model(name):
    """Return the spaCy pipeline ``name``, downloading it first if needed.

    Args:
        name: Name of a spaCy pipeline package, e.g. ``"en_core_web_sm"``.

    Returns:
        The pipeline object returned by ``spacy.load``.
    """
    try:
        return spacy.load(name)
    except OSError:
        # spacy.load raises OSError when the pipeline package is not
        # installed. Catching only that (instead of a bare ``except``) lets
        # KeyboardInterrupt and genuine bugs propagate.
        # Download through spaCy's own API so the package is installed for
        # the interpreter that is running this code.
        from spacy.cli import download

        download(name)
        return spacy.load(name)


nlp_sm = _load_spacy_model("en_core_web_sm")    # Small, fast model
nlp_trf = _load_spacy_model("en_core_web_trf")  # Transformer-based model

# ================================
# 📊 Dataset (CoNLL 2003 or sample)
# ================================
# If the dataset is available, load it; otherwise, fall back to example text.

# Start from the fallback sentence; it is replaced below only when the dataset
# file is read successfully.
sample_text = "Apple CEO Tim Cook met with President Joe Biden in Washington."
try:
    # Keep the try body to the single call that can fail.
    df = pd.read_csv("data/ner_dataset.csv", encoding="latin1")
except FileNotFoundError:
    print("⚠️ Dataset not found. Using sample text instead.")
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
    # The file exists but could not be read or parsed: report the real cause
    # instead of claiming it is missing. Anything else (KeyboardInterrupt,
    # programming errors) is deliberately allowed to propagate.
    print(f"⚠️ Dataset could not be read ({exc}). Using sample text instead.")
else:
    print("✅ Dataset Loaded")
    print(df.head())
    sample_text = "Barack Obama was born in Hawaii and served as the 44th President of the United States."

# ================================
# 📝 Rule-Based NER
# ================================
print("\n--- Rule-Based NER ---\n")
doc = nlp_sm(sample_text)

matcher = Matcher(nlp_sm.vocab)

# Example: Match "President <Name>"
# The token "president" (any casing) followed by one or more title-cased tokens.
pattern = [{"LOWER": "president"}, {"IS_TITLE": True, "OP": "+"}]
# With the "+" operator the matcher would otherwise return every overlapping
# candidate (e.g. both "President Joe" and "President Joe Biden").
# greedy="LONGEST" keeps only the longest span among overlapping matches.
matcher.add("PRESIDENT_PATTERN", [pattern], greedy="LONGEST")

matches = matcher(doc)
for match_id, start, end in matches:
    # start/end are token offsets into doc; slice to recover the matched span.
    span = doc[start:end]
    print("Rule-Based Match:", span.text)

# ================================
# 🤖 Model-Based NER (SpaCy sm)
# ================================
# Run the small pipeline over the sample text and print one
# "<entity text> -> <label>" line per entity it found.
print("\n--- Model-Based NER (en_core_web_sm) ---\n")
doc_sm = nlp_sm(sample_text)
for ent in doc_sm.ents:
    print(f"{ent.text} -> {ent.label_}")

# ================================
# 🤖 Model-Based NER (SpaCy trf)
# ================================
# Same report as for the small model, produced by the transformer pipeline.
print("\n--- Model-Based NER (en_core_web_trf) ---\n")
doc_trf = nlp_trf(sample_text)
for ent in doc_trf.ents:
    line = " ".join((ent.text, "->", ent.label_))
    print(line)

# ================================
# 🎨 Visualization
# ================================
# Render the transformer pipeline's entities with displaCy's "ent" style;
# jupyter=True asks displaCy to display the result in the notebook output.
print("\n--- Visualization ---\n")
displacy.render(
    doc_trf,
    jupyter=True,
    style="ent",
)
