In [1]:
# Pre-Block: Downloads & Setup 
import nltk

nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("framenet_v17")
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('names')

%pip install geopandas
%pip install geopy
%pip install stanza
%pip install geodatasets
%pip install folium
%pip install xgboost
%pip install scikit-learn
%pip install spacy
%pip install fuzzywuzzy
%pip install contractions
%pip install rapidfuzz
%pip install pycountry



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alicja/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/alicja/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/alicja/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package framenet_v17 to
[nltk_data]     /Users/alicja/nltk_data...
[nltk_data]   Package framenet_v17 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/alicja/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/alicja/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/alicja/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already 

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Block 1: Imports 


import os, re, time, json, unicodedata
from pathlib import Path


import yaml
import fitz  # PyMuPDF
import requests

# Data / ML
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

# Geo / viz
import folium
from folium.plugins import AntPath
from shapely.geometry import Point
from geopy.distance import geodesic

from typing import List, Tuple, Dict, Set, Union
from tqdm import tqdm
from rapidfuzz import fuzz




# NLP / NLTK
import nltk
from nltk.corpus import stopwords, names, wordnet as wn, framenet as fn
from nltk import ne_chunk, pos_tag, word_tokenize

# Project helpers
from geoparser import nlp_helpers as nh
import importlib; importlib.reload(nh)
from geoparser import enrichment_helpers

# First pipeline: built through the project helper for Spanish, without Stanza.
# In the recorded run this printed ['sentencizer'], i.e. no tagger/NER.
nlp, stanza_pipeline = nh.init_nlp(lang="es", with_stanza=False)

print("spaCy pipeline:", nlp.pipe_names)

# stopwords for mixed ES/PT/EN corpora:
# NOTE: computed with the pipeline above, before `nlp` is rebound below.
stopset = nh.get_stopwords(nlp, langs=["es", "pt", "en"])



# Ensure a real English model is installed and loaded
# `nlp` is rebound here: the medium English model is tried first; on any
# exception the small model is tried. An OSError from that second load is
# converted into a RuntimeError carrying install instructions.
try:
    import spacy
    try:
        nlp = spacy.load("en_core_web_md")
    except Exception:
        nlp = spacy.load("en_core_web_sm")
except OSError as e:
    raise RuntimeError(
        "spaCy model not installed. Run: "
        "python -m spacy download en_core_web_md  (or en_core_web_sm)"
    ) from e

print("spaCy pipeline:", nlp.pipe_names)
# Fail fast when the loaded pipeline contains nothing but sentence splitters.
if set(nlp.pipe_names) <= {"sentencizer","senter"}:
    raise RuntimeError(
        "spaCy model lacks tagger/ner; install en_core_web_md or en_core_web_sm."
    )



spaCy pipeline: ['sentencizer']
spaCy pipeline: ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [3]:

# Block 2: Extract text from PDF using PyMuPDF; pages 28-148

def extract_text_from_pdf(pdf_path: str, start_page: int, end_page: int) -> str:
    """Return the text of a page range of a PDF, one page per newline-joined chunk.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF (``fitz``).
        start_page: Slice start applied to the document's pages.
        end_page: Slice stop applied to the document's pages (exclusive).

    Returns:
        The ``get_text()`` output of every selected page, joined with newlines.
    """
    # Use the document as a context manager so it (and its file handle) is
    # closed even if text extraction raises; previously it was left open.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text() for page in doc[start_page:end_page])



#Load configuration from YAML file
def load_config(config_path: str = "config.yaml") -> dict:
    """Load a YAML configuration file and return the parsed content.

    Args:
        config_path: Path of the YAML file; defaults to ``config.yaml`` in the
            current working directory.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file (callers in this
        notebook index the result like a dict).
    """
    # Decode explicitly as UTF-8 instead of the platform default encoding, so
    # non-ASCII values (e.g. accented place names) load the same everywhere.
    with open(config_path, "r", encoding="utf-8") as file:
        return yaml.safe_load(file)
    
# Read config.yaml (default path) and pull the PDF section out of it.
config = load_config()
pdf_conf = config["pdf"]
# Extract the raw text of the configured page range; the path and both page
# bounds come from the "pdf" section of the config.
raw_text = extract_text_from_pdf(pdf_conf["path"], pdf_conf["start_page"], pdf_conf["end_page"])




In [4]:
#Block 3: Text Preprocessing Functions

from geoparser.nlp_helpers import (
    init_nlp,
    get_stopwords,
    tag_named_entities,
    extract_text_from_pdf,
    load_config,
    normalize_punctuation,
    clean_light,
    preprocess_text,
    segment_sentences,
    clean_heavy,
    load_filters,
    normalize_diacritics,
    apply_ocr_replacements,
    is_caption_line,
    filter_raw_sentences
    
)

In [5]:
# Block 4: Clean and Save Tagged + NLP Versions 

# Re-read the configuration and keep the two sections used from here on.
config = load_config()
pdf_conf = config["pdf"]
gaz_conf = config["gazetteer"]

# Make sure the output directory exists before anything is written to it.
os.makedirs("outputs", exist_ok=True)

# Both output files are named after the stem of the input PDF.
base_name = Path(pdf_conf["path"]).stem
tagged_path = Path("outputs") / f"cleaned_{base_name}_geoparsing.txt"
heavy_path = Path("outputs") / f"cleaned_{base_name}_nlp.txt"

# Light cleaning feeds sentence segmentation; each sentence gets a 1-based
# "[SENT n]" tag for the geoparsing file.
light_cleaned = clean_light(raw_text)
sentences = segment_sentences(light_cleaned, nlp)
sentences_with_tags = []
for i, s in enumerate(sentences):
    sentences_with_tags.append(f"[SENT {i+1}] {s}")

# Heavy cleaning works on the same light-cleaned text with the helper's stopwords.
all_stops = get_stopwords(nlp)
heavy_cleaned = clean_heavy(light_cleaned, nlp, all_stops)

# Persist both text files (the heavy one may turn out to be unnecessary at this stage).
tagged_path.write_text("\n".join(sentences_with_tags), encoding="utf-8")
heavy_path.write_text(heavy_cleaned, encoding="utf-8")

print("Cleaned outputs saved:")
print(f"- Geoparsing (light): {tagged_path}")
print(f"- NLP prep (heavy): {heavy_path}")

# (index, stripped sentence) pairs for Block 7; note the index is 0-based here.
sentence_data = []
for i, sent in enumerate(sentences):
    sentence_data.append((i, sent.strip()))
print(f"sentence_data prepared with {len(sentence_data)} narrative sentences.")



Cleaned outputs saved:
- Geoparsing (light): outputs/cleaned_MotorcycleDiaries_geoparsing.txt
- NLP prep (heavy): outputs/cleaned_MotorcycleDiaries_nlp.txt
sentence_data prepared with 2007 narrative sentences.


In [6]:
# Build symbolic verb lexicon from WordNet (only once)


from nltk.corpus import wordnet  # WordNet corpus reader used by both builders below


def _wordnet_verb_lemmas(seed_words):
    """Return the lower-cased lemma names of every verb synset of each seed word.

    Underscores in multi-word lemma names are replaced by spaces.
    """
    return {
        lemma.name().lower().replace("_", " ")
        for seed in seed_words
        for synset in wordnet.synsets(seed, pos=wordnet.VERB)
        for lemma in synset.lemmas()
    }


def get_symbolic_verb_synonyms():
    """Expand a fixed list of symbolic verbs into a set of WordNet verb lemmas."""
    return _wordnet_verb_lemmas([
        "dream", "hope", "struggle", "escape", "resist", "believe", "follow",
        "ride", "rebel", "fight", "flee", "live", "return", "envision", "imagine",
        "create", "inspire", "transform", "change", "grow",
    ])


# Build the expanded symbolic verb set once so later cells can reuse it.
SYMBOLIC_VERBS = get_symbolic_verb_synonyms()
print(f"Loaded {len(SYMBOLIC_VERBS)} symbolic verb forms from WordNet.")


def get_movement_verbs():
    """Expand a fixed list of movement verbs into a set of WordNet verb lemmas."""
    return _wordnet_verb_lemmas([
        "go", "move", "travel", "walk", "drive", "ride", "arrive", "depart",
        "leave", "return", "cross", "fly", "sail", "swim", "follow", "hike",
    ])


MOVEMENT_VERBS = get_movement_verbs()
print(f"Loaded {len(MOVEMENT_VERBS)} movement verb forms from WordNet.")

Loaded 227 symbolic verb forms from WordNet.
Loaded 238 movement verb forms from WordNet.


In [7]:

# --- Block 6 preamble: make sure gazetteer helpers are importable ---
import sys, json, yaml
from pathlib import Path

# Ensure project root is on sys.path so 'geoparser/...' resolves
def _ensure_project_root_on_path() -> None:
    here = Path.cwd()
    for p in [here, *list(here.parents)][:5]:
        if (p / "geoparser" / "gazetteer_helpers.py").exists():
            if str(p) not in sys.path:
                sys.path.insert(0, str(p))
            return
_ensure_project_root_on_path()

# Try normal import
# Locate the gazetteer helper module: the package path first, then a
# same-directory module of the same name.
import importlib

_gaz_module = None
_gaz_import_error = None
for _mod_name in ("geoparser.gazetteer_helpers", "gazetteer_helpers"):
    try:
        _candidate = importlib.import_module(_mod_name)
    except ImportError as e:
        _gaz_import_error = e
        continue
    if hasattr(_candidate, "build_gazetteer"):
        _gaz_module = _candidate
        break

if _gaz_module is None:
    raise ImportError(
        "Couldn't import gazetteer helpers. "
        "Check that 'geoparser/gazetteer_helpers.py' exists AND 'geoparser/__init__.py' is present."
    ) from _gaz_import_error

build_gazetteer = _gaz_module.build_gazetteer

# Only build_gazetteer is mandatory. Previously both names were imported in a
# single statement, so a module lacking build_gazetteer_from_conf raised
# ImportError and the wrapper below could never be reached.
if hasattr(_gaz_module, "build_gazetteer_from_conf"):
    build_gazetteer_from_conf = _gaz_module.build_gazetteer_from_conf
else:
    def build_gazetteer_from_conf(gconf: dict):
        """Call build_gazetteer with values read from the gazetteer config dict.

        "username" and "countries" are required keys; the remaining options
        fall back to the defaults spelled out below.
        """
        return build_gazetteer(
            username=gconf["username"],
            countries=gconf["countries"],
            max_rows=gconf.get("max_rows", 1000),
            host=gconf.get("host", "api.geonames.org"),
            https=bool(gconf.get("https", False)),
            page_size=gconf.get("page_size", 1000),
            timeout=gconf.get("timeout", 20),
            retries=gconf.get("retries", 4),
            backoff_base=gconf.get("backoff_base", 1.5),
            sleep_between=gconf.get("sleep_between", 0.6),
        )

# Read the gazetteer section of the project configuration (decoded as UTF-8).
with open("config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)
gaz_conf = config["gazetteer"]

gazetteer_path = Path("outputs/gazetteer_cities.json")

# Rebuild when the config asks for it or no saved file exists; otherwise load.
if gaz_conf.get("force_rebuild", False) or not gazetteer_path.exists():
    print("Building gazetteer from GeoNames (config-driven)…")
    gazetteer = build_gazetteer_from_conf(gaz_conf)
    # The file is written with ensure_ascii=False, so it must be encoded
    # explicitly; the platform default may not be able to represent accented
    # names. Also make sure the target directory exists when run on its own.
    gazetteer_path.parent.mkdir(parents=True, exist_ok=True)
    gazetteer_path.write_text(
        json.dumps(gazetteer, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    print("Gazetteer built and saved with coordinates.")
else:
    print("Found existing gazetteer file, loading…")
    gazetteer = json.loads(gazetteer_path.read_text(encoding="utf-8"))


Building gazetteer from GeoNames (config-driven)…
Downloading cities from GeoNames...
AR: SSL error on HTTPS; falling back to HTTP…
AR: Loaded 1000 cities (kept most-populous per name).
CL: Loaded 1000 cities (kept most-populous per name).
PE: Loaded 1000 cities (kept most-populous per name).
CO: Loaded 1000 cities (kept most-populous per name).
VE: Loaded 1000 cities (kept most-populous per name).
BO: Loaded 1000 cities (kept most-populous per name).
EC: Loaded 1000 cities (kept most-populous per name).
PA: Loaded 1000 cities (kept most-populous per name).
CR: Loaded 1000 cities (kept most-populous per name).
GT: Loaded 1000 cities (kept most-populous per name).
MX: Loaded 1000 cities (kept most-populous per name).
CU: Loaded 1000 cities (kept most-populous per name).
BR: Loaded 1000 cities (kept most-populous per name).
GY: Loaded 906 cities (kept most-populous per name).
PY: Loaded 1000 cities (kept most-populous per name).
SR: Loaded 548 cities (kept most-populous per name).
UY: Lo

In [8]:
cfg = load_config("config.yaml")
filters = load_filters(cfg)

# OCR fixes are applied to the original-case text. sentence_data is either a
# non-empty list of (id, text) tuples or a plain sequence of strings.
if isinstance(sentence_data, list) and sentence_data and isinstance(sentence_data[0], tuple):
    sentence_data_fixed = [
        (sid, apply_ocr_replacements(txt, filters["ocr_replacements"]))
        for sid, txt in sentence_data
    ]
else:
    sentence_data_fixed = [
        apply_ocr_replacements(txt, filters["ocr_replacements"])
        for txt in sentence_data
    ]

# Footnotes, captions and section headers are dropped according to the config;
# they were an early source of noise.
sentence_data_prefiltered = filter_raw_sentences(sentence_data_fixed, filters)

print(f"Pre-filtered sentences: kept {len(sentence_data_prefiltered)} / {len(sentence_data)}")
SENTENCES_FOR_NER = sentence_data_prefiltered


Pre-filtered sentences: kept 1971 / 2007


In [9]:
# === Block 7: NER + Gazetteer with Metonymy Filtering (Country-Aware, Disambiguation-Scored, Final Tweaks) ===
from __future__ import annotations

# stdlib
from pathlib import Path
import re, json, unicodedata
from typing import List, Dict, Set, Tuple, Optional
from collections import defaultdict

# third-party
import pandas as pd
from tqdm import tqdm
from rapidfuzz import fuzz, process
import spacy

# project helpers
from geoparser.nlp_helpers import (
    init_nlp, load_config, get_stopwords
)
from geoparser.gazetteer_helpers import (
    build_gazetteer, gazetteer_names,
    build_gazetteer_patterns, match_gazetteer_precompiled,
    remove_overlapping_shorter
)

# ---------------------- NLP & Config ----------------------
# Reuse an `nlp` object left by an earlier cell; only when the name is not
# defined is an English pipeline created through the project helper.
try:
    nlp  # noqa: F821
except NameError:
    nlp, stanza_pipeline = init_nlp(lang="en", prefer=["en_core_web_md", "en_core_web_sm"], with_stanza=False)

# True only when a truthy `stanza_pipeline` global exists.
USE_STANZA = bool(globals().get("stanza_pipeline"))
print("spaCy pipeline:", getattr(nlp, "pipe_names", []))
# Refuse to continue with a pipeline that offers nothing beyond sentence splitting.
if set(getattr(nlp, "pipe_names", [])) <= {"sentencizer", "senter"}:
    raise RuntimeError("spaCy model lacks tagger/ner. Load 'en_core_web_md' or 'en_core_web_sm' before Block 7.")

# Same pattern for the config: keep an existing `cfg`, otherwise load it from disk.
try:
    cfg  # noqa: F821
except NameError:
    cfg = load_config("config.yaml")

stop_words = get_stopwords(nlp, langs=["en", "es", "pt"])

# ---------------------- Gazetteer cache I/O ----------------------
# Location of the JSON cache; its parent directory is created up front.
CACHE_PATH = Path("outputs/geonames_cache.json")
CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)

def _load_gazetteer_cache(path: Path) -> Dict[str, Dict]:
    if not path.exists():
        return {}
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
        if isinstance(data, list):
            return {row["name"].lower(): {k: v for k, v in row.items() if k != "name"} for row in data if "name" in row}
        if isinstance(data, dict):
            return {k.lower(): v for k, v in data.items()}
    except Exception:
        pass
    return {}

def _save_gazetteer_cache(g: Dict[str, Dict], path: Path) -> None:
    try:
        rows = [{"name": k, **v} for k, v in g.items()]
        path.write_text(json.dumps(rows, ensure_ascii=False, indent=2), encoding="utf-8")
        print(f"💾 Saved gazetteer cache: {path}")
    except Exception as e:
        print(f"Cache save failed: {e}")

# SAFETY: make sure 'gazetteer' exists and is a dict before it is used below.
try:
    gazetteer  # noqa: F821
    if not isinstance(gazetteer, dict):
        print(f"⚠️ Detected stale gazetteer of type {type(gazetteer).__name__}; resetting.")
        gazetteer = {}
except NameError:
    gazetteer = {}

# `or {}` also covers a config whose "gazetteer" key is present but empty/null.
gcfg = cfg.get("gazetteer") or {}
_cache = _load_gazetteer_cache(CACHE_PATH)
gazetteer_loaded_from_cache = False

if _cache:
    # A non-empty cache file wins over any gazetteer already in memory.
    gazetteer = _cache
    gazetteer_loaded_from_cache = True
    print(f"💾 Loaded gazetteer cache: {len(gazetteer)} names")
else:
    print("↺ Building gazetteer from source…")
    try:
        candidate = build_gazetteer(
            username=gcfg.get("username", ""),
            countries=gcfg.get("countries", []),
            max_rows=int(gcfg.get("max_rows", 1000))
        )
        # A build without any country information is treated as a failed build.
        if not candidate or not any(("country" in v or "country_code" in v) for v in candidate.values()):
            raise RuntimeError("GeoNames quota/throttle? Missing 'country'/'country_code' in build.")
        gazetteer = candidate
        _save_gazetteer_cache(gazetteer, CACHE_PATH)
    except Exception as e:
        print(f"⏭️ Skipping gazetteer rebuild due to API issue: {e}")
        # The cache was empty a moment ago, so falling back to `{}` here used
        # to throw away the gazetteer validated by the SAFETY step above.
        # Keep that in-memory gazetteer when the cache has nothing to offer.
        gazetteer = _load_gazetteer_cache(CACHE_PATH) or gazetteer
        print(f"Keeping in-memory gazetteer: {len(gazetteer)} names")
# ensure country fields exist (retrofill if only country_code present)
def _retrofill_country(g: Dict[str, Dict]) -> int:
    COUNTRY_NAME = {
        "AR":"Argentina","CL":"Chile","PE":"Peru","CO":"Colombia","VE":"Venezuela",
        "BO":"Bolivia","EC":"Ecuador","PA":"Panama","CR":"Costa Rica","GT":"Guatemala",
        "MX":"Mexico","CU":"Cuba","BR":"Brazil","GY":"Guyana","PY":"Paraguay",
        "SR":"Suriname","UY":"Uruguay","HN":"Honduras","SV":"El Salvador","NI":"Nicaragua",
        "GF":"French Guiana"
    }
    changed = 0
    for k, v in g.items():
        cc = v.get("country_code") or v.get("countryCode") or v.get("cc")
        if cc and "country" not in v:
            v["country"] = COUNTRY_NAME.get(cc, cc)
            changed += 1
    return changed

def _gaz_has_country(g) -> bool:
    if not isinstance(g, dict) or not g:
        return False
    ok = sum(("country" in v and "country_code" in v) for v in g.values())
    return (ok / max(len(g), 1)) >= 0.95

if not _gaz_has_country(gazetteer):
    _retrofill_country(gazetteer)

# ---------------------- Gazetteer regex filters ----------------------
try:
    from nltk.corpus import stopwords as nltk_stopwords
    _EN_STOP = set(nltk_stopwords.words("english"))
except Exception:
    _EN_STOP = set(w.lower() for w in stop_words)

BANNED_SINGLE_HEADS = {
    "hospital","station","school","airport","bridge","park","market",
    "university","college","city","province","region","mama","friends","best"
}
BANNED_FULL_NAMES = {"the best"}
_COUNTRY_NAMES_EN = {
    "argentina","chile","peru","colombia","venezuela","bolivia","ecuador",
    "panama","costa rica","guatemala","mexico","cuba","brazil","guyana",
    "paraguay","suriname","uruguay","honduras","el salvador","nicaragua"
}

def _looks_like_place_name(name: str) -> bool:
    t = (name or "").strip().lower()
    if not t: return False
    if t in BANNED_FULL_NAMES: return False
    toks = re.findall(r"[a-zà-ÿ]+", t)
    if not toks: return False
    if len(toks) == 1 and toks[0] in BANNED_SINGLE_HEADS: return False
    if all(tok in _EN_STOP for tok in toks): return False
    return True

_PLACES: Set[str] = {n for n in gazetteer_names(gazetteer) if _looks_like_place_name(n)}
GAZ_PATTERNS = build_gazetteer_patterns(_PLACES, _EN_STOP)

def match_gazetteer_safe(text: str) -> List[Tuple[str, str, int, int]]:
    """Run the precompiled gazetteer patterns over ``text`` and filter the hits.

    Single-word country names (they are handled explicitly later) and banned
    full names are dropped. Every surviving hit is returned as
    ``(surface, "GAZETTEER", start, end)``.
    """
    kept = []
    for surface, _, start, end in match_gazetteer_precompiled(text, GAZ_PATTERNS):
        bare_country = " " not in surface and surface.lower() in _COUNTRY_NAMES_EN
        if bare_country or surface.strip().lower() in BANNED_FULL_NAMES:
            continue
        kept.append((surface, "GAZETTEER", start, end))
    return kept

# ---------------------- Narrative cues / heuristics ----------------------
# NOTE: both SYMBOLIC_VERBS and MOVEMENT_VERBS rebind names that an earlier
# cell of this notebook built from WordNet; after this point the values below
# are the ones in effect.
# The symbolic verbs come from the project helper when it can be imported and
# called; otherwise a small fixed set is used.
try:
    from geoparser.nlp_helpers import get_symbolic_verb_synonyms
    SYMBOLIC_VERBS = get_symbolic_verb_synonyms()
except Exception:
    SYMBOLIC_VERBS = {"govern","rule","dominate","represent","symbolize","embody"}

# Verb lemmas treated as movement cues by heuristic_accept.
MOVEMENT_VERBS = {"arrive","depart","cross","enter","leave","visit",
                  "walk","ride","drive","sail","return","board","catch","take"}

# Lower-cased prepositions treated as locative cues by heuristic_accept.
PREPS = {"in","to","from","into","through","via","at","near","towards","toward","onto","across","over","along","past","around"}

def heuristic_accept(ent_text: str, doc) -> bool:
    """Decide heuristically whether ``ent_text`` is used like a place in ``doc``.

    The entity is located as a ``doc.ents`` span with the same lower-cased
    text or, failing that, as the first token containing it as a substring.
    It is accepted when it cannot be located at all, when it looks like a
    proper name (capitalised or multi-token, with a PROPN/NOUN head or a
    GPE/LOC/FAC label), or when a preposition from PREPS or a verb from
    MOVEMENT_VERBS occurs within three tokens of its head.
    """
    needle = ent_text.lower()

    span = None
    for ent in doc.ents:
        if ent.text.lower() == needle:
            span = ent
            break
    if span is None:
        for idx, tok in enumerate(doc):
            if needle in tok.text.lower():
                span = doc[idx:idx + 1]
                break
    if span is None:
        return True

    head = span.root if hasattr(span, "root") else span[0]
    if (span.text[:1].isupper() or len(span) > 1) and (
        head.pos_ in {"PROPN", "NOUN"} or span.label_ in {"GPE", "LOC", "FAC"}
    ):
        return True

    # Context window: three tokens on either side of the head token.
    lo, hi = max(0, head.i - 3), min(len(doc), head.i + 4)
    window = doc[lo:hi]
    if any(tok.pos_ == "ADP" and tok.lower_ in PREPS for tok in window):
        return True
    return any(
        tok.pos_ == "VERB" and tok.lemma_.lower() in MOVEMENT_VERBS for tok in window
    )

# ---------------------- Person / stoplike helpers ----------------------
try:
    from nltk.corpus import names
    _EN_FIRSTNAMES: Set[str] = {n.lower() for n in names.words()}
except Exception:
    _EN_FIRSTNAMES = set()

# small, high-frequency ES first names (diacritic-insensitive check)
_ES_FIRSTNAMES = {
    "jose","juan","luis","carlos","miguel","jorge","pedro","maria","ana","rosa",
    "teresa","marta","carmen","elena","laura","silvia","sofia","patricia","andrea","daniela",
    "benjamin","benjamín"
}

_MONTHS_EN = {"january","february","march","april","may","june","july","august","september","october","november","december"}
_MONTHS_EN_ABBR = {"jan","feb","mar","apr","may","jun","jul","aug","sep","sept","oct","nov","dec"}
_MONTHS_ES = {"enero","febrero","marzo","abril","mayo","junio","julio","agosto","septiembre","setiembre","octubre","noviembre","diciembre"}
_MONTHS_ES_ABBR = {"ene","feb","mar","abr","may","jun","jul","ago","sept","set","oct","nov","dic"}
_DAYS_EN   = {"monday","tuesday","wednesday","thursday","friday","saturday","sunday"}

def strip_diacritics(s: str) -> str:
    """Return ``s`` reduced to ASCII: NFKD-decompose, then drop every non-ASCII character.

    ``None`` and the empty string both give "".
    """
    decomposed = unicodedata.normalize("NFKD", s or "")
    ascii_bytes = decomposed.encode("ascii", "ignore")
    return ascii_bytes.decode("utf-8")

def _is_stoplike(tok: str) -> bool:
    t = tok.lower()
    return t in _EN_STOP or t in _MONTHS_EN or t in _DAYS_EN

def _looks_like_firstname(tok: str) -> bool:
    t = tok.lower()
    return (t in _EN_FIRSTNAMES) or (strip_diacritics(t) in _ES_FIRSTNAMES)

def _valid_toponym(span_text: str) -> bool:
    t = span_text.strip()
    toks = re.findall(r"[A-Za-zÀ-ÿ]+", t)
    if not toks:
        return False
    if len(toks) == 1:
        tok = toks[0]
        if t.islower(): return False
        if _is_stoplike(tok): return False
        if _looks_like_firstname(tok): return False
    return True

# PATCH: extend composite heads (helps grow one-word leads)
COMPOSITE_HEADS = {
    "villa","puerto","bahía","bahia","río","rio","cerro","san","santa","santo",
    "laguna","lago","isla","punta","playa","quebrada","arroyo","valle","valley"
}
AMBIGUOUS_SINGLETONS = {"sierra","villa","serra","rio"}

def _person_prefix_rule(sent_text: str, span_text: str) -> bool:
    head = re.findall(r"[A-Za-zÀ-ÿ]+", span_text.lower())[:1]
    if head and head[0] in COMPOSITE_HEADS:
        return False
    # allow diacritics & multi-token first names
    m = re.search(r"\b([A-ZÀ-Ý][a-zà-ÿ]+(?:\s+[A-ZÀ-Ý][a-zà-ÿ]+)?)\s+" + re.escape(span_text) + r"\b", sent_text)
    return bool(m and _looks_like_firstname(m.group(1)))

NOISE_NAMES   = {"la pedrosa"}  # 'la poderosa' handled as transport
VEHICLE_TERMS = {
    "motorcycle","motorbike","bike","bicycle","boat","ship","raft","car","truck",
    "jeep","bus","van","lorry","pickup","steamer","steamship","vessel","launch","lancha","barco","bote","ferry"
}

# Abbreviated titles followed by a literal dot. NOTE: the alternation is a
# *capturing* group, so any pattern that embeds this fragment gets the title
# as its group 1.
_HONORIFICS = r"(Mr|Mrs|Ms|Dr|Sr|Sra|Srta)\."
# Title + whitespace + one capitalised word. Because of the capturing group
# above, group(1) is the title (e.g. "Dr") and group(2) is the following word.
_HONORIFIC_NEAR_ENTITY = re.compile(rf"\b{_HONORIFICS}\s+([A-ZÀ-Ý][a-zà-ÿ]+)\b")

# Whole-string pattern for "the X" / "the X Y". With re.IGNORECASE the [A-Z]
# classes match lower-case letters as well.
_THE_TITLE = re.compile(r"^the\s+[A-Z][\wÀ-ÿ-]+(?:\s+[A-Z][\wÀ-ÿ-]+)?$", re.IGNORECASE)

_SHIP_MOVE_VERBS = {
    "board","embark","sail","moor","dock","berth","launch","load","unload","carry","hoist",
    "abordar","embarcar","zarpar","atracar","fondear","cargar","descargar"
}

def _norm_text(s: str) -> str:
    s = (s or "")
    s = unicodedata.normalize("NFKD", s).encode("ascii","ignore").decode("utf-8")
    return re.sub(r"\s+", " ", s.strip().lower())

def _norm_entity(s: str) -> str:
    s = (s or "").strip().lower()
    s = re.sub(r"[’']s\b", "", s)
    s = s.strip('"\'' "“”‘’")
    s = unicodedata.normalize("NFKD", s).encode("ascii","ignore").decode("utf-8")
    return re.sub(r"\s+", " ", s)

def is_named_object(entity_text: str, sentence: str) -> bool:
    """Tell whether an entity is more likely a named object or person than a place.

    The entity is flagged when any of the following holds:
      * its normalised form is listed in NOISE_NAMES;
      * it is the word that follows an honorific ("Dr. Lima") in the sentence;
      * it has the form "the X" / "the X Y" and the sentence contains a
        boarding/sailing/vessel word;
      * its normalised form occurs within six tokens of a vehicle term or a
        ship-handling verb.

    Args:
        entity_text: Surface text of the candidate entity.
        sentence: The sentence in which it occurs.

    Returns:
        True when the entity should not be treated as a place.
    """
    ent_n = _norm_entity(entity_text)
    if ent_n in NOISE_NAMES:
        return True

    ent_raw = (entity_text or "").strip()

    # honorific check. The name is the LAST group of the pattern; group(1) is
    # the title itself ("Dr"), which is what the entity used to be compared
    # with, so this rule could never fire. Every honorific is inspected.
    for m in _HONORIFIC_NEAR_ENTITY.finditer(sentence):
        if ent_raw == m.group(m.lastindex):
            return True

    # "The Modesta Victoria" with movement context
    if _THE_TITLE.fullmatch(ent_raw) and re.search(
        r"\b(board|embark|sail|ship|boat|barco|lancha)\b", sentence.lower()
    ):
        return True

    # transport proximity → probably not a place. This is the only check that
    # needs a parse, so it runs last and is skipped for an empty entity (an
    # empty entity can never match the window test below).
    if not ent_n:
        return False
    doc = nlp(sentence)
    for i, tok in enumerate(doc):
        lemma = tok.lemma_.lower()
        if lemma in VEHICLE_TERMS or lemma in _SHIP_MOVE_VERBS:
            L, R = max(0, i-6), min(len(doc), i+7)
            win_text = _norm_text(" ".join(t.text for t in doc[L:R]))
            if ent_n in win_text:
                return True
    return False

def is_probable_metonymy(entity_text: str, sentence: str, label: str) -> bool:
    """Tell whether a GPE/COUNTRY mention probably stands for an institution.

    Only labels "GPE" and "COUNTRY" are considered. The answer is True when a
    token containing the lower-cased entity text lies within five tokens of a
    cue noun (government, policy, military, …), compared by lemma.
    """
    if label not in {"GPE","COUNTRY"}:
        return False
    doc = nlp(sentence)
    needle = entity_text.lower()
    cue_nouns = {"government","policy","military","regime","parliament","industry","media","press","power"}

    # Positions of all cue nouns, computed once for the whole sentence.
    cue_positions = [pos for pos, tok in enumerate(doc) if tok.lemma_.lower() in cue_nouns]
    if not cue_positions:
        return False

    for pos, tok in enumerate(doc):
        if needle not in tok.text.lower():
            continue
        for cue_pos in cue_positions:
            if abs(cue_pos - pos) <= 5:
                return True
    return False

# ---------------------- Temporal + Transport (EN + ES) ----------------------
# Ordered (pattern, kind) pairs; _extract_first_date returns the result of the
# first pattern that matches, so the order is a priority list.
#
# The bare-year pattern used to sit in front of every Spanish pattern, which
# made the Spanish full-date patterns unreachable for any text containing a
# 19xx/20xx year. Spanish patterns that carry an explicit year now come before
# it; the remaining ones keep their original position after it, so text
# without a Spanish full date gives the same result as before.
DATE_PATTERNS = [
    # English (each yields the 4-digit year only)
    (re.compile(r"\b(?:%s)\s+\d{1,2},\s*(\d{4})\b" % "|".join(_MONTHS_EN), re.I), "year"),
    (re.compile(r"\b\d{1,2}\s+(?:%s)\s+(\d{4})\b" % "|".join(_MONTHS_EN), re.I), "year"),
    (re.compile(r"\b(?:%s)\s+(\d{4})\b" % "|".join(_MONTHS_EN), re.I), "year"),
    (re.compile(r"\b(?:%s)\.?\s+(\d{4})\b" % "|".join(_MONTHS_EN_ABBR), re.I), "year"),
    # Spanish, explicit year: "12 de enero de 1952"
    (re.compile(r"\b(\d{1,2})\s+de\s+(%s)\s+de\s+(\d{4})\b" % "|".join(_MONTHS_ES), re.I), "day-month-year?"),
    # Spanish, explicit year: "(en|del) enero de 1952" or "enero de 1952".
    # One pattern with an optional leading group keeps the layout fixed at
    # (prefix, month, year) whichever form matched.
    (re.compile(r"\b(?:(en|del)\s+)?(%s)\s+de\s+(\d{4})\b" % "|".join(_MONTHS_ES), re.I), "month-year"),
    # Bare 19xx/20xx year anywhere in the text
    (re.compile(r"\b(19|20)\d{2}\b"), "year"),
    # Spanish day + month, year optional
    (re.compile(r"\b(\d{1,2})\s+de\s+(%s)(?:\s+de\s+(\d{4}))?\b" % "|".join(_MONTHS_ES), re.I), "day-month-year?"),
    # Spanish month abbreviation followed by a year
    (re.compile(r"\b(?:%s)\.?\s+(?:de\s+)?(\d{4})\b" % "|".join(_MONTHS_ES_ABBR), re.I), "year"),
]

def _extract_first_date(text: str) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """
    Returns (date_norm, year, granularity).
    Robust 4-digit year capture; Spanish keeps month names for downstream normalization.
    """
    if not isinstance(text, str) or not text.strip():
        return None, None, None
    for rx, kind in DATE_PATTERNS:
        m = rx.search(text)
        if not m:
            continue
        if kind == "year":
            # prefer any captured 4-digit group; else find 4-digit in the full match
            y = None
            if m.lastindex:
                for i in range(1, m.lastindex + 1):
                    gi = m.group(i)
                    if gi and re.fullmatch(r"\d{4}", gi):
                        y = gi; break
            if not y:
                m_year = re.search(r"\b(19|20)\d{2}\b", m.group(0))
                y = m_year.group(0) if m_year else None
            return (y, y, "year") if y else (None, None, None)
        if kind == "day-month-year?":
            d, mon, y = m.group(1), m.group(2).lower(), m.group(3)
            if y:
                return (f"{y}-{mon}-{d}", y, "day-month-year")
            else:
                return (f"{mon}-{d}", None, "day-month")
        if kind in {"month-year"}:
            mon = (m.group(2) if m.lastindex and m.lastindex >= 2 else m.group(1)).lower()
            y   = (m.group(3) if m.lastindex and m.lastindex >= 3 else (m.group(2) if m.lastindex and m.lastindex >= 2 else None))
            if y:
                return (f"{y}-{mon}", y, "month-year")
    return None, None, None

# ---------------------- Transport (recall-boosted) ----------------------
TRANSPORT_MAP = {
    # EN
    "motorcycle":"motorcycle","motorbike":"motorcycle","bike":"motorcycle","moped":"motorcycle","scooter":"motorcycle",
    "bicycle":"bicycle","cycle":"bicycle",
    "car":"car","truck":"truck","jeep":"car","van":"car","pickup":"truck","lorry":"truck",
    "bus":"bus","coach":"bus","minibus":"bus",
    "train":"train","railway":"train",
    "boat":"boat","raft":"raft","canoe":"boat","kayak":"boat","kayac":"boat","yacht":"boat","sailboat":"boat","barge":"boat",
    "ship":"boat","ferry":"boat",
    "plane":"plane","airplane":"plane","aeroplane":"plane","aircraft":"plane",
    "horse":"horse","mule":"horse","donkey":"horse","foot":"foot","walk":"foot","walking":"foot","hike":"foot","trek":"foot",
    # ES
    "moto":"motorcycle","motocicleta":"motorcycle","mototaxi":"motorcycle","moto-taxi":"motorcycle",
    "bicicleta":"bicycle","bici":"bicycle",
    "auto":"car","coche":"car","camión":"truck","camion":"truck","camioneta":"truck","pick-up":"truck",
    "ómnibus":"bus","omnibus":"bus","autobús":"bus","autobus":"bus","colectivo":"bus","micro":"bus",
    "tren":"train","ferrocarril":"train",
    "balsa":"raft","bote":"boat","barco":"boat","lancha":"boat","barcaza":"boat","canoa":"boat","kayak":"boat","yate":"boat","velero":"boat",
    "avión":"plane","avion":"plane","aeronave":"plane",
    "caballo":"horse","mula":"horse","burro":"horse","a pie":"foot","a caballo":"horse",
    # diary-specific name
    "la poderosa":"motorcycle","la poderosa i":"motorcycle","la poderosa ii":"motorcycle",
}

VERB_TRIGGERS = {
    # EN lemmas
    "ride","take","board","catch","drive","sail","row","paddle","cross","walk","return",
    "carry","load","unload","hoist","travel","hike","trek","cycle","canoe","kayak",
    # ES lemmas
    "tomar","subir","montar","viajar","cruzar","llegar","salir","partir",
    "embarcar","descender","avanzar","cabalgar","navegar","abordar","cargar","descargar","remar","caminar","ir"
}

# broader set of prepositions/intros that can precede vehicles
PREP_TRIGGERS = {"by","on","in","onto","into","aboard","over","via","en","a","al","sobre","por","con"}

def _normalize_transport(tok: str) -> Optional[str]:
    t = tok.lower().strip()
    return TRANSPORT_MAP.get(t)

# Regex fallback (POS-free) with "a bordo de" / "on board (of)" and broader preps.
_TRANSPORT_FALLBACK = re.compile(
    r"\b(?:a\s+bordo\s+de|on\s+board(?:\s+of)?|by|on|aboard|in|en|a|al|sobre|por|con)\s+"
    r"(?:el|la|los|las)?\s*"
    r"(motorcycle|motorbike|bike|bicycle|cycle|bus|coach|minibus|train|truck|car|boat|raft|ship|ferry|yacht|sailboat|barge|canoe|kayak|plane|airplane|aeroplane|aircraft|moped|scooter|"
    r"moto|motocicleta|mototaxi|moto-taxi|bicicleta|bici|camión|camion|camioneta|pick-up|ómnibus|omnibus|autobús|autobus|colectivo|micro|tren|ferrocarril|balsa|bote|barco|lancha|barcaza|canoa|kayak|yate|velero|avión|avion|aeronave|caballo|mula|burro|a pie|a caballo)\b"
    r"|la\s+poderosa\s*(?:ii|i)?",
    flags=re.IGNORECASE
)

def extract_transport_spans(sentence: str) -> Tuple[Optional[str], bool]:
    """Detect the transport mode mentioned in a sentence.

    Strategies are tried in order and the first hit wins:
      1. the diary-specific vehicle name ("la poderosa", optionally i/ii);
      2. a preposition from PREP_TRIGGERS followed (after determiners, adjectives,
         pronouns or further adpositions) by a vehicle token, with a special case
         for "a bordo de ..." / "on board (of) ...";
      3. a verb from VERB_TRIGGERS followed by a vehicle token;
      4. any vehicle token in the sentence, if a trigger verb is present;
      5. the POS-free regex ``_TRANSPORT_FALLBACK``.

    Args:
        sentence: raw sentence text; non-strings and blank strings are rejected.

    Returns:
        ``(mode, flag)``. ``mode`` is the normalized transport label or None.
        ``flag`` is True for every hit except the generic preposition pattern,
        which reports whether a trigger verb is present; when nothing is found it
        also reports whether a trigger verb is present.
    """
    if not isinstance(sentence, str) or not sentence.strip():
        return None, False
    doc = nlp(sentence)

    # lemma-based movement cue
    verbs = {t.lemma_.lower() for t in doc if t.pos_ == "VERB"}
    has_move = bool(verbs & VERB_TRIGGERS)

    toks = [t.text for t in doc]
    lemmas = [t.lemma_.lower() for t in doc]
    pos = [t.pos_ for t in doc]
    lowers = [t.text.lower() for t in doc]
    n = len(toks)

    # POS tags that may sit between the cue word and the vehicle noun
    skippable = {"DET","ADJ","PRON","ADP"}

    # diary-specific name first (longest variant first so "ii" is not read as "i")
    joined = " ".join(lowers)
    for pname in ("la poderosa ii","la poderosa i","la poderosa"):
        if pname in joined:
            return TRANSPORT_MAP[pname], True

    # PREP(+optional stuff) + VEHICLE  (add special-case for "a bordo de …")
    for i in range(n):
        if not (lemmas[i] in PREP_TRIGGERS and pos[i] == "ADP"):
            continue
        j = i + 1

        # special-case: "a bordo de" / "on board (of)"
        is_a_bordo = (
            lowers[i] == "a" and j + 1 < n
            and lowers[j] == "bordo" and pos[j + 1] == "ADP"
        )
        is_on_board = lowers[i] == "on" and j < n and lowers[j] == "board"
        if is_a_bordo or is_on_board:
            if is_a_bordo:
                # skip "bordo" and the adposition after it ("de")
                j += 2
            else:
                # skip only "board"; the token after it may already be the vehicle
                # ("on board ship"), so it must not be skipped blindly
                j += 1
                if j < n and lowers[j] == "of":
                    j += 1
            # skip optional determiners after that
            while j < n and pos[j] in skippable:
                j += 1
            if j < n:
                norm = _normalize_transport(toks[j])
                if norm:
                    return norm, True

        # generic skip of det/adj/pron/adp before the candidate vehicle
        while j < n and pos[j] in skippable:
            j += 1
        if j < n:
            norm = _normalize_transport(toks[j])
            if norm:
                return norm, has_move

    # VERB(trigger) + * + VEHICLE
    for i, (w, p) in enumerate(zip(lemmas, pos)):
        if p == "VERB" and w in VERB_TRIGGERS:
            j = i + 1
            while j < n and pos[j] in skippable:
                j += 1
            if j < n:
                norm = _normalize_transport(toks[j])
                if norm:
                    return norm, True

    # bare vehicle token if a movement verb exists
    if has_move:
        for w in toks:
            norm = _normalize_transport(w)
            if norm:
                return norm, True

    # regex fallback (covers a bordo de / on board / broader preps)
    m = _TRANSPORT_FALLBACK.search(sentence)
    if m:
        s = m.group(0).lower()
        if "poderosa" in s:
            return "motorcycle", True
        for g in m.groups():
            if g:
                g = g.lower()
                # unknown surface forms fall back to themselves ("motorbike" is aliased)
                return TRANSPORT_MAP.get(g, {"motorbike":"motorcycle"}.get(g, g)), True

    return None, has_move

# ---------------------- Sentence docs & context ----------------------
# Parse every (sentence_id, text) pair once; later steps index DOCS by sentence id.
DOCS = {sid: nlp(text) for sid, text in SENTENCES_FOR_NER}  # noqa: F821

# sentence_id -> surface texts of the PERSON entities found in that sentence
persons_by_sentence = {
    sid: [ent.text for ent in DOCS[sid].ents if ent.label_ == "PERSON"]
    for sid, _ in SENTENCES_FOR_NER
}

# every PERSON mention across all sentences, lower-cased and trimmed
GLOBAL_PERSONS: Set[str] = {p.lower().strip() for plist in persons_by_sentence.values() for p in plist}
# last whitespace-separated word of each person mention
GLOBAL_PERSON_HEADS: Set[str] = {p.split()[-1] for p in GLOBAL_PERSONS if p}
# lower-cased copy of the heads (GLOBAL_PERSONS is already lower-case)
GLOBAL_PERSON_HEADS_L: Set[str] = {h.lower() for h in GLOBAL_PERSON_HEADS}
# names that combine_ner_gazetteer always rejects when they appear as gazetteer hits
PERSON_BLACKLIST: Set[str] = {"alberto granado","granado","ernesto","ernesto guevara","che","guevara"}

def _looks_like_person_here(ent_text: str, sid: int) -> bool:
    """Tell whether ``ent_text`` matches a PERSON mention of sentence ``sid``.

    A match is either an exact lower-case equality or a fuzzy
    ``token_set_ratio`` of at least 90 against any person in that sentence.
    """
    needle = ent_text.lower().strip()
    local_people = [name.lower() for name in persons_by_sentence.get(sid, [])]
    if needle in local_people:
        return True
    for name in local_people:
        if fuzz.token_set_ratio(needle, name) >= 90:
            return True
    return False

# country lexicon (EN; extended with common variants)
# Maps a lower-case country surface form to its two-letter country code.
COUNTRY_ISO = {
    "argentina":"AR","chile":"CL","peru":"PE","colombia":"CO","venezuela":"VE",
    "bolivia":"BO","ecuador":"EC","panama":"PA","costa rica":"CR","guatemala":"GT",
    "mexico":"MX","cuba":"CU","brazil":"BR","guyana":"GY","paraguay":"PY",
    "suriname":"SR","uruguay":"UY","honduras":"HN","el salvador":"SV","nicaragua":"NI",
    "french guiana":"GF",
    # extended
    "united states":"US","the united states":"US","usa":"US","u.s.":"US","u.s.a.":"US",
    "haiti":"HT","dominican republic":"DO","the dominican republic":"DO","bahamas":"BS",
    "trinidad and tobago":"TT","jamaica":"JM"
}

# sentence_id -> lower-cased tokens of that sentence that are COUNTRY_ISO keys.
# NOTE(review): the lookup is per token, so multi-word keys such as "costa rica"
# presumably never match here — confirm against the tokenizer's output.
countries_by_sentence = {
    sid: [t.text.lower() for t in DOCS[sid] if t.text.lower() in COUNTRY_ISO]
    for sid, _ in SENTENCES_FOR_NER
}

def _is_bare_country(ent_text: str) -> bool:
    """Return True when the trimmed, lower-cased entity is a key of COUNTRY_ISO."""
    normalized = ent_text.lower().strip()
    return normalized in COUNTRY_ISO

# ---------------------- Main extraction (NER + Gazetteer) ----------------------
def _effective_label(label: str, ent_text: str) -> str:
    if label == "GAZETTEER":
        return "COUNTRY" if ent_text.lower() in COUNTRY_ISO else "GPE"
    if label in {"CITY","STATE_OR_PROVINCE","COUNTRY"}:
        return "GPE" if label != "COUNTRY" else "COUNTRY"
    return label

def combine_ner_gazetteer(sentences: List[Tuple[int, str]], gazetteer_set: Set[str]) -> List[Dict]:
    """Merge NER spans and gazetteer spans per sentence and filter them.

    For each ``(sid, text)`` pair, spans are collected from the cached spaCy doc
    (``DOCS[sid]``), optionally from Stanza, and from ``match_gazetteer_safe``.
    Each span then passes a cascade of filters (label whitelist, toponym validity,
    bare-country drop, narrative cue gate, person-related suppression, named-object
    and metonymy checks). Surviving spans are reduced to the longest
    non-overlapping ones, with near-duplicate texts removed.

    Args:
        sentences: ``(sentence_id, text)`` pairs; every id must be a key of DOCS.
        gazetteer_set: set of names; used only by the singleton-head ambiguity rule.

    Returns:
        One dict per kept span with keys ``sentence_id``, ``entity``,
        ``entity_norm``, ``label``, ``start_char``, ``end_char``, ``sentence``
        and ``persons_in_sentence``.
    """
    allowed = {"GPE","LOC","GAZETTEER","FAC","CITY","STATE_OR_PROVINCE","COUNTRY"}
    results: List[Dict] = []

    for sid, text in tqdm(sentences, desc="NER + Gazetteer"):
        # candidate spans as (text, label, start_char, end_char)
        ents: List[Tuple[str,str,int,int]] = []
        # NER (spaCy)
        ents += [(ent.text, ent.label_, ent.start_char, ent.end_char) for ent in DOCS[sid].ents]
        # Optional Stanza
        if USE_STANZA and stanza_pipeline is not None:
            try:
                for sent in stanza_pipeline(text).sentences:
                    for ent in getattr(sent, "ents", []):
                        ents.append((ent.text, ent.type, ent.start_char, ent.end_char))
            except Exception:
                # best-effort: any Stanza failure is ignored and spaCy/gazetteer spans are kept
                pass
        # Gazetteer spans
        ents += match_gazetteer_safe(text)

        keep = []
        for ent_text, label, start, end in ents:
            if label not in allowed:
                continue
            if not _valid_toponym(ent_text):
                continue

            eff_label = _effective_label(label, ent_text)

            # drop pure country-name mentions entirely
            if eff_label == "COUNTRY" and _is_bare_country(ent_text):
                continue

            # NER spans get narrative cue gate; gazetteer spans skip here to preserve recall
            if label != "GAZETTEER" and not heuristic_accept(ent_text, DOCS[sid]):
                continue

            # person prefixes (e.g., "Dr. Montoya")
            if _person_prefix_rule(text, ent_text):
                continue

            low = ent_text.lower().strip()

            # suppress gazetteer hits that are actually people
            if label == "GAZETTEER":
                if _looks_like_person_here(ent_text, sid):
                    continue
                if low in GLOBAL_PERSONS or low in GLOBAL_PERSON_HEADS or low in PERSON_BLACKLIST:
                    continue

            # near-PERSON similarity: only suppress for gazetteer; keep NER
            # (this comparison is case-sensitive, unlike _looks_like_person_here)
            if any(fuzz.token_set_ratio(ent_text, p) >= 90 for p in persons_by_sentence.get(sid, [])):
                if label == "GAZETTEER":
                    continue

            # drop single-token NER spans that match a global person head (surname)
            if label != "GAZETTEER":
                toks = re.findall(r"[A-Za-zÀ-ÿ]+", ent_text.strip())
                if len(toks) == 1 and toks[0].lower() in GLOBAL_PERSON_HEADS_L:
                    continue

            if is_named_object(ent_text, text):
                continue

            # singleton ambiguity rule (gazetteer only)
            if label == "GAZETTEER":
                toks = re.findall(r"[A-Za-zÀ-ÿ]+", ent_text.strip())
                if len(toks) == 1:
                    head = toks[0].lower()
                    if head in {"sierra","villa","serra","rio"} and (head not in gazetteer_set):
                        continue

            # metonymy filter now applies with effective label
            if is_probable_metonymy(ent_text, text, eff_label):
                continue

            keep.append((ent_text, eff_label, start, end))

        # longest-nonoverlapping spans + de-dup by near-duplicate text (diacritics-aware)
        spans = []
        if keep:
            # longest spans first, so a shorter overlapping span is rejected later
            dfk = pd.DataFrame([{"t":t, "l":l, "s":s, "e":e, "len": e-s} for t,l,s,e in keep]).sort_values("len", ascending=False)
            for r in dfk.itertuples():
                # accept only if the span overlaps none of the already accepted spans
                if not any(not (r.e <= s or r.s >= e) for _,_,s,e in spans):
                    t_norm = strip_diacritics(r.t).lower()
                    if any(fuzz.token_set_ratio(t_norm, strip_diacritics(t0).lower()) >= 95 for t0,_,_,_ in spans):
                        continue
                    spans.append((r.t, r.l, r.s, r.e))

        for ent_text, eff_label, start, end in spans:
            results.append({
                "sentence_id": sid,
                "entity": ent_text,
                "entity_norm": ent_text.lower().strip(),
                "label": eff_label,
                "start_char": start,
                "end_char": end,
                "sentence": text,
                "persons_in_sentence": persons_by_sentence.get(sid, []),
            })
    return results

# Run the combined extraction over all sentences and tabulate one row per kept span.
print("🔎 Running ensemble NER + gazetteer matching (with metonymy & filters)…")
entities_combined = combine_ner_gazetteer(SENTENCES_FOR_NER, _PLACES)  # noqa: F821
df_combined = pd.DataFrame(entities_combined)

# --- keep schema even if empty; avoid ValueError down-pipeline ---
REQUIRED_COLS = ["sentence_id","entity","entity_norm","label","start_char","end_char","sentence","persons_in_sentence"]
for c in REQUIRED_COLS:
    if c not in df_combined.columns:
        # empty object-typed column so later column accesses do not fail
        df_combined[c] = pd.Series(dtype=object)

# Overlap filtering is only attempted when there is at least one row.
if not df_combined.empty:
    df_combined = remove_overlapping_shorter(df_combined)
else:
    print("ℹ️ No entities to overlap-filter; continuing with empty frame.")

# ---------------------- Composite-head expansion & aliases ----------------------
# extend expansion set with laguna/lago/isla + coastal/geomorph heads
COMPOSITE_HEADS_EXPAND = {
    "sierra","rio","río","cerro","san","santa","santo","villa","puerto",
    "bahía","bahia","laguna","lago","isla","punta","playa","quebrada","arroyo","valle","valley"
}

def _grow_composite_token(ent_text: str, sentence: str) -> str:
    """
    If entity is a single composite head, try to grow to next capitalized chunk(s).
    Handles 'Sierra Maestra', 'Río de la Plata', 'San Martín', etc.
    """
    if not isinstance(ent_text, str) or not ent_text.strip():
        return ent_text
    head = ent_text.strip().lower()
    if " " in ent_text or head not in COMPOSITE_HEADS_EXPAND:
        return ent_text
    m = re.search(
        rf"\b{re.escape(ent_text)}\s+((?:[A-ZÀ-Ý][\wÀ-ÿ-]+)(?:\s+(?:de|del|la|las|los)\s+[A-ZÀ-Ý][\wÀ-ÿ-]+)?)",
        sentence
    )
    return f"{ent_text} {m.group(1)}" if m else ent_text

# Apply composite-head growth row by row and overwrite "entity" only where the
# grown form differs (case-insensitively) from the original.
if not df_combined.empty:
    df_combined["entity_expanded"] = df_combined.apply(
        lambda r: _grow_composite_token(r["entity"], r["sentence"]), axis=1
    )
    mask_changed = df_combined["entity_expanded"].str.lower() != df_combined["entity"].str.lower()
    df_combined.loc[mask_changed, "entity"] = df_combined.loc[mask_changed, "entity_expanded"]
    # the helper column is temporary
    df_combined.drop(columns=["entity_expanded"], inplace=True, errors="ignore")
else:
    print("ℹ️ Empty df_combined; skipping composite expansion.")

# --- Normalization: squash whitespace, EN→ES surface rewrites, aliases ---
def _squash_ws(s: str) -> str:
    return re.sub(r"\s+", " ", s or "").strip()

def _en_to_es_surface(s: str) -> str:
    t = s.lower()
    t = re.sub(r"^lake\s+", "lago ", t)
    t = re.sub(r"^river\s+", "río ", t)
    t = re.sub(r"^(mount|mt\.?)\s+", "cerro ", t)
    return t

def _es_to_en_surface(s: str) -> str:
    t = s.lower()
    t = re.sub(r"^lago\s+", "lake ", t)
    t = re.sub(r"^(río|rio)\s+", "river ", t)
    t = re.sub(r"^cerro\s+", "mount ", t)
    return t

# Alternative surface forms (lower-case) mapped to the form used for matching.
ALIASES = {
    "cuzco":"cusco",
    "easter island":"isla de pascua",
    "easter-island":"isla de pascua",
    "rapa nui":"isla de pascua",
}

# Normalized matching key: whitespace-squashed, EN->ES head rewrite, lower-case, alias-resolved.
df_combined["entity_lower"] = (
    df_combined["entity"].map(_squash_ws).map(_en_to_es_surface).str.lower().map(lambda x: ALIASES.get(x, x))
)

# remove "Poderosa" as an entity (keep as transport signal only)
df_combined = df_combined[~df_combined["entity_lower"].str.contains("poderosa", na=False)].copy()

# --- Remove country-only mentions defensively (in case any slipped through) ---
before_c = len(df_combined)
df_combined = df_combined[~df_combined["entity_lower"].isin(set(COUNTRY_ISO.keys()))].copy()
print(f"↪️ dropped country-only mentions (defensive): {before_c - len(df_combined)}")

# --- Region/ocean pruning (keep only explicit '... Ocean') ---
OCEAN_BASINS = {"atlantic","pacific","indian","arctic","southern"}
NON_TOPONYMS = {"americas","south america","north america","europe","africa","mediterranean"}
# rows whose whole key is a bare basin name; the "ocean" test cannot fail for
# these keys, since none of the OCEAN_BASINS entries contains "ocean"
mask_ocean_bare = df_combined["entity_lower"].isin(OCEAN_BASINS) & ~df_combined["entity_lower"].str.contains("ocean", na=False)
mask_regions = df_combined["entity_lower"].isin(NON_TOPONYMS)
before_r = len(df_combined)
df_combined = df_combined[~(mask_ocean_bare | mask_regions)].copy()
print(f"↪️ pruned oceans/regions: {before_r - len(df_combined)}")

# ---------------------- Sentence-level time & transport (+/- 1 borrowing) ----------------------
# One row per distinct (sentence_id, sentence) pair.
_sent_df = pd.DataFrame(SENTENCES_FOR_NER, columns=["sentence_id","sentence"]).drop_duplicates()

# sentence_id -> the three values returned by _extract_first_date, unpacked here
# as (dnorm, y, gran) and stored as date / year / granularity
_sid_to_date, _sid_to_year, _sid_to_gran = {}, {}, {}
for sid, sent in _sent_df.itertuples(index=False):
    dnorm, y, gran = _extract_first_date(sent)
    _sid_to_date[int(sid)] = dnorm
    _sid_to_year[int(sid)] = y
    _sid_to_gran[int(sid)] = gran

# sentence_id -> normalized transport mode / movement-verb flag
_sid_to_transport, _sid_to_moveverb = {}, {}
for sid, sent in _sent_df.itertuples(index=False):
    tnorm, has_moveverb = extract_transport_spans(sent)
    if not tnorm:
        # POS-free safety net (regex)
        # NOTE(review): extract_transport_spans already ends with this same regex
        # search, so this branch looks redundant for string input — confirm.
        m = _TRANSPORT_FALLBACK.search(sent or "")
        if m:
            s = m.group(0).lower()
            if "poderosa" in s:
                tnorm = "motorcycle"
            else:
                for g in m.groups():
                    if g:
                        g = g.lower()
                        tnorm = TRANSPORT_MAP.get(g, {"motorbike":"motorcycle"}.get(g, g))
                        break
    _sid_to_transport[int(sid)] = tnorm
    _sid_to_moveverb[int(sid)] = bool(has_moveverb)

def _nearest_sentence_value(sid: int, mapping: dict[int, object]) -> object|None:
    return mapping.get(sid) or mapping.get(sid-1) or mapping.get(sid+1)

# Attach date/transport context to each entity row, borrowing from the previous or
# next sentence when the entity's own sentence has no (truthy) value.
df_combined["date_norm"] = df_combined["sentence_id"].apply(lambda s: _nearest_sentence_value(int(s), _sid_to_date))
df_combined["year"] = df_combined["sentence_id"].apply(lambda s: _nearest_sentence_value(int(s), _sid_to_year))
df_combined["date_granularity"] = df_combined["sentence_id"].apply(lambda s: _nearest_sentence_value(int(s), _sid_to_gran))
df_combined["transport"] = df_combined["sentence_id"].apply(lambda s: _nearest_sentence_value(int(s), _sid_to_transport))
# True if the sentence itself or either neighbour has a movement verb
df_combined["movement_verb_present"] = df_combined["sentence_id"].apply(lambda s: bool(_nearest_sentence_value(int(s), _sid_to_moveverb)))

# ---------------------- Enrichment with disambiguation scoring ----------------------
# flatten gazetteer into a candidates table (DO NOT dedupe by name)
# Each gazetteer item (key n, record geo) becomes one row; absent fields are None.
gaz_rows = []
for n, geo in gazetteer.items():
    gaz_rows.append({
        "name_lower":          n,  # key used as-is; presumably already lower-case — TODO confirm
        "name_stripped":       strip_diacritics(n),
        "lat":                 geo.get("lat"),
        "lon":                 geo.get("lon"),
        "country":             geo.get("country"),
        "country_code":        geo.get("country_code"),
        "feature_class":       geo.get("feature_class"),
        "feature_code":        geo.get("feature_code"),
        "admin1":              geo.get("admin1"),
        "admin2":              geo.get("admin2"),
    })
gaz_df = pd.DataFrame(gaz_rows)

# name cleaning to help island/parenthetical variants
def _clean_name(s: str) -> str:
    s = re.sub(r"\(.*?\)", " ", s or "")
    s = re.sub(r"[^A-Za-zÀ-ÿ\s-]", " ", s)
    return re.sub(r"\s+", " ", s).strip().lower()

# cleaned variant of every gazetteer name (see _clean_name)
gaz_df["name_clean"] = gaz_df["name_lower"].map(_clean_name)

# build quick indices
# each maps a name variant to the list of gaz_df index labels carrying that variant
idx_by_name_lower = defaultdict(list)
idx_by_name_stripped = defaultdict(list)
idx_by_name_clean = defaultdict(list)
for i, r in gaz_df.iterrows():
    idx_by_name_lower[r["name_lower"]].append(i)
    idx_by_name_stripped[r["name_stripped"]].append(i)
    idx_by_name_clean[r["name_clean"]].append(i)
# name lists used as the choice pools for fuzzy matching
gaz_names_all = list(idx_by_name_lower.keys())
gaz_names_clean = list(gaz_df["name_clean"].unique())

# context hint: ±2 sentence window
_COUNTRY_HINT_RE = None  # compiled lazily on first use from the COUNTRY_ISO keys

def _country_hint_regex():
    """Return a compiled regex matching any COUNTRY_ISO key as a whole phrase.

    Longer names are listed first so "the united states" wins over
    "united states". Name parts may be separated by any whitespace run.
    Boundaries are "not a letter" on both sides, which also lets dotted keys
    such as "u.s." match.
    """
    global _COUNTRY_HINT_RE
    if _COUNTRY_HINT_RE is None:
        names = sorted(COUNTRY_ISO, key=len, reverse=True)
        alternatives = "|".join(
            r"\s+".join(re.escape(part) for part in name.split()) for name in names
        )
        _COUNTRY_HINT_RE = re.compile(
            r"(?<![A-Za-zÀ-ÿ])(?:" + alternatives + r")(?![A-Za-zÀ-ÿ])"
        )
    return _COUNTRY_HINT_RE

def _sentence_country_hint_window(sid: int) -> Optional[str]:
    """Return the country code of the first country named near sentence ``sid``.

    Sentences ``sid - 2`` .. ``sid + 2`` are scanned in ascending id order; within
    a sentence the left-most country mention wins. Multi-word and dotted
    COUNTRY_ISO keys ("costa rica", "u.s.") are recognised as well.

    Returns:
        The matching COUNTRY_ISO value, or None if no sentence in the window
        names a known country.
    """
    if not COUNTRY_ISO:
        return None
    pattern = _country_hint_regex()
    center = int(sid)
    for s in range(center - 2, center + 3):
        row = _sent_df.loc[_sent_df["sentence_id"] == s]
        if row.empty:
            continue
        m = pattern.search(str(row.iloc[0]["sentence"]).lower())
        if m:
            # normalise inner whitespace so the match is a valid COUNTRY_ISO key
            return COUNTRY_ISO[" ".join(m.group(0).split())]
    return None

# add a few hydro/relief tokens for better surface hints
_WATER_TOKENS = {"lake","lago","river","río","laguna","lagoon","canal","arroyo","quebrada"}
_RELIEF_TOKENS = {"cerro","sierra","cordillera","monte","valle","valley","punta"}

def _candidate_score(row, cand) -> float:
    """
    Score a gazetteer candidate 'cand' (Series) for entity row.
    Higher is better. Combines exactness, similarity, feature class, and context.

    Components:
      * +50 exact match on the lower-case name, +35 on the diacritics-stripped name;
      * 0.20 x WRatio string similarity;
      * feature-class preference for GPE/LOC rows (P: +12, A: +7, T/ISL: +9);
      * +10 when the entity contains a whole hydro word and the candidate is
        class H, or a whole relief word and the candidate is class T;
      * +35 when the candidate's country matches the ±2-sentence country hint;
      * +5 when the candidate's country is among the configured countries
        (all COUNTRY_ISO codes if none are configured).

    Args:
        row: mapping with at least ``entity_lower``, ``label`` and ``sentence_id``.
        cand: gazetteer row supporting ``[]`` and ``.get``.
    """
    score = 0.0
    ent = row["entity_lower"]
    ent_stripped = strip_diacritics(ent)
    name_l = cand["name_lower"]
    name_s = cand["name_stripped"]

    if ent == name_l: score += 50
    if ent_stripped == name_s: score += 35

    # string similarity
    score += 0.20 * fuzz.WRatio(ent, name_l)

    # prefer populated/admin for GPE; allow islands (T/ISL) to compete
    if row["label"] in {"GPE","LOC"}:
        if cand.get("feature_class") == "P": score += 12
        elif cand.get("feature_class") == "A": score += 7
        elif cand.get("feature_class") == "T" and cand.get("feature_code") == "ISL": score += 9

    # hydro/relief boosts when hinted by surface form
    # Compare whole words, not substrings: "monte" must not fire for "montevideo".
    ent_tokens = set(re.findall(r"[A-Za-zÀ-ÿ]+", ent))
    if cand.get("feature_class") == "H" and ent_tokens & _WATER_TOKENS:   # hydro
        score += 10
    if cand.get("feature_class") == "T" and ent_tokens & _RELIEF_TOKENS:  # terrain
        score += 10

    # ±2 sentence context boost (rebalanced to reduce score saturation)
    hint_cc = _sentence_country_hint_window(row["sentence_id"])
    if hint_cc and cand.get("country_code") == hint_cc:
        score += 35  # tuned from +45

    # if gazetteer lacks feature metadata (city-only build), don't penalize:
    # no term above subtracts points, so nothing needs to be compensated here

    # light preference for configured countries
    allowed = set(gcfg.get("countries", [])) or set(COUNTRY_ISO.values())
    if cand.get("country_code") in allowed:
        score += 5

    return score

def _best_non_country_row(row) -> Dict[str, Optional[str]]:
    """
    Resolve an entity row to its best-scoring gazetteer candidate.

    Candidates are gathered by direct lookup (lower / diacritics-stripped /
    cleaned name) for the entity and its EN<->ES surface variants. Fuzzy name
    matching is used only when no direct candidate exists. Candidates are then
    ranked with _candidate_score; the first one reaching the highest score wins.

    Returns a dict with country_code, country, lat, lon, feature_class,
    feature_code, match_source and disamb_score; all location fields are None and
    match_source is "none" when nothing matched.
    """
    surface = _squash_ws(row["entity"]).lower()
    variants = {surface, _en_to_es_surface(surface), _es_to_en_surface(surface)}

    # direct index lookups
    matches: Set[int] = set()
    for variant in variants:
        stripped_variant = strip_diacritics(variant)
        cleaned_variant = _clean_name(variant)
        matches.update(idx_by_name_lower.get(variant, []))
        matches.update(idx_by_name_stripped.get(stripped_variant, []))
        matches.update(idx_by_name_clean.get(cleaned_variant, []))

    # if no direct candidates, try fuzzy names
    if not matches:
        for variant in variants:
            for name, sim, _ in process.extract(variant, gaz_names_all, scorer=fuzz.WRatio, limit=5):
                if sim >= 88:
                    matches.update(idx_by_name_lower.get(name, []))
            cleaned_variant = _clean_name(variant)
            for name, sim, _ in process.extract(cleaned_variant, gaz_names_clean, scorer=fuzz.WRatio, limit=5):
                if sim >= 82:
                    matches.update(idx_by_name_clean.get(name, []))

    if not matches:
        return {"country_code": None, "country": None, "lat": None, "lon": None,
                "feature_class": None, "feature_code": None, "match_source": "none", "disamb_score": 0.0}

    # keep the first candidate that reaches the highest score
    top_candidate = None
    top_score = float("-inf")
    for idx in matches:
        candidate = gaz_df.loc[idx]
        candidate_score = _candidate_score(row, candidate)
        if candidate_score > top_score:
            top_candidate, top_score = candidate, candidate_score

    copied_fields = ("country_code", "country", "lat", "lon", "feature_class", "feature_code")
    resolved = {field: top_candidate.get(field) for field in copied_fields}
    resolved["match_source"] = "exact/stripped/fuzzy"
    resolved["disamb_score"] = float(top_score)
    return resolved

def enrich_rows(df: pd.DataFrame) -> pd.DataFrame:
    """Add gazetteer resolution columns to ``df`` and return it.

    Every row is passed (as a plain dict keyed by column name) to
    _best_non_country_row; the resulting fields are written as the columns
    country_code, country, lat, lon, feature_class, feature_code, match_source
    and disamb_score.

    Note:
        ``df`` is modified in place and also returned; pass a copy to keep the
        original untouched.
    """
    cols = ["country_code","country","lat","lon","feature_class","feature_code","match_source","disamb_score"]
    # Row dicts keep the real column names; namedtuples from itertuples() rename
    # columns that are not valid identifiers.
    records = df.to_dict("records")
    infos = [
        _best_non_country_row(record)  # country-only rows were dropped upstream
        for record in tqdm(records, total=len(records), desc="Enrich + disambiguate")
    ]
    for c in cols:
        df[c] = [info.get(c) for info in infos]
    return df

# --- Composite cleanup applied; now enrich ---
# enrich_rows mutates its argument, so a copy is passed to leave df_combined as is
df_enriched = enrich_rows(df_combined.copy())

# serialize persons_in_sentence as JSON for CSV safety
# (values that already are strings starting with "[" are left untouched)
df_enriched["persons_in_sentence"] = df_enriched["persons_in_sentence"].apply(
    lambda xs: xs if isinstance(xs, str) and xs.startswith("[") else json.dumps(xs, ensure_ascii=False)
)

# ---------------------- Phrase stoplist + score floor cleanup ----------------------
# normalized surface forms that are dropped outright
PHRASE_STOPLIST = {
    "latin america","north america","america","the americas","americas","south america",
    "amazon","rising sun","two friends","alliance","north","europe","africa","mediterranean"
}
SCORE_FLOOR = 58  # slightly lower to recover legit hydro/relief matches

mask_phrase = df_enriched["entity_lower"].isin(PHRASE_STOPLIST)
rows_before = len(df_enriched)
df_enriched = df_enriched[~mask_phrase].copy()
dropped_phrase = rows_before - len(df_enriched)

# Rows below the floor are kept, but their coordinates/country are blanked and
# match_source records the rejection.
mask_low = (df_enriched["disamb_score"].fillna(0) < SCORE_FLOOR)
df_enriched.loc[mask_low, ["lat","lon","country","country_code"]] = [None, None, None, None]
df_enriched.loc[mask_low, "match_source"] = "rejected_low_evidence"

# --- Facility head cleanup (guarded) ---
# non-capturing group avoids pandas "match groups" warning
_FAC_HEADS = re.compile(r"\b(?:hospital|station|university|college|bridge|airport)\b", re.I)
# facility-like entities are dropped only when they were not resolved with
# sufficient evidence (no match, rejected match, or score below the floor)
mask_fac_bad = (
    df_enriched["entity"].astype(str).str.contains(_FAC_HEADS, na=False)
    & (
        (df_enriched["match_source"].isin({"none","rejected_low_evidence"}))
        | (df_enriched["disamb_score"].fillna(0) < SCORE_FLOOR)
    )
)
dropped_fac = int(mask_fac_bad.sum())
if dropped_fac:
    df_enriched = df_enriched[~mask_fac_bad].copy()
    print(f"↪️ dropped unresolved facilities: {dropped_fac}")

# OPTIONAL: SA viz flag (non-destructive)
SOUTH_AM = {"AR","CL","PE","CO","VE","BO","EC","BR","PY","UY","GY","SR","GF"}
df_enriched["in_south_america"] = df_enriched["country_code"].isin(SOUTH_AM)

# Export
Path("outputs").mkdir(parents=True, exist_ok=True)
out_csv = Path("outputs/geoparsing_ner_ensemble.csv")
df_enriched.to_csv(out_csv, index=False)
# mask_low was computed before the facility cleanup, so its count may include
# rows that were dropped afterwards
print(f"Saved: {out_csv}  ({len(df_enriched)} rows) | pruned phrases: {dropped_phrase} | low-evidence nulled: {int(mask_low.sum())}")


spaCy pipeline: ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
💾 Loaded gazetteer cache: 15632 names
🔎 Running ensemble NER + gazetteer matching (with metonymy & filters)…


NER + Gazetteer: 100%|██████████| 1971/1971 [00:48<00:00, 40.53it/s]


↪️ dropped country-only mentions (defensive): 98
↪️ pruned oceans/regions: 13


Enrich + disambiguate: 100%|██████████| 258/258 [00:04<00:00, 57.26it/s]

↪️ dropped unresolved facilities: 1
Saved: outputs/geoparsing_ner_ensemble.csv  (231 rows) | pruned phrases: 26 | low-evidence nulled: 81





In [10]:
# === Block 8: Symbolic Enrichment (modular, using helpers) ===
from __future__ import annotations

from pathlib import Path
import pandas as pd
import spacy

from geoparser.nlp_helpers import init_nlp
from geoparser.enrichment_helpers import (
    load_gazetteer_compat,
    normalize_entity_surface,
    gaz_lookup_latlon,
    safe_person_list,
    is_named_object_context,
    movement_verb_present,
    symbolic_context,
    metonymy_flag,
    build_sid_to_transport,
    infer_transport_for_row,
    extract_year_regex,
    final_label_decision,
    reorder_columns,
)

# ---------------------- NLP ----------------------
# Reuse the pipeline from an earlier cell if the kernel still has it; otherwise
# initialise one through the project helper.
try:
    nlp  # noqa: F821
except NameError:
    nlp, _ = init_nlp(lang="en", prefer=["en_core_web_md", "en_core_web_sm"])

# ---------------------- Paths ----------------------
outputs = Path("outputs")
in_path = outputs / "geoparsing_ner_ensemble.csv"    # Block 7 output
gaz_path = outputs / "geonames_cache.json"

# ---------------------- Load data ----------------------
df = pd.read_csv(in_path)

# persons_in_sentence parsing (robust)
if "persons_in_sentence" in df.columns:
    df["persons_in_sentence"] = df["persons_in_sentence"].apply(safe_person_list)
else:
    df["persons_in_sentence"] = [[] for _ in range(len(df))]

# Normalized surface (match Block 7)
# NOTE(review): if the "entity" column were missing, df.get would return the ""
# default, which has no .astype — this line assumes the column exists.
df["entity_lower"] = df.get("entity", "").astype(str).map(normalize_entity_surface)

# Gazetteer cache (Block-7 compatible)
rows, by_lower, by_stripped, by_clean, gaz_set = load_gazetteer_compat(gaz_path)

# Fill lat/lon if missing (non-clobbering)
if "lat" not in df.columns: df["lat"] = pd.NA
if "lon" not in df.columns: df["lon"] = pd.NA

# Look each entity up once; the helper's result is indexed as [0] = lat, [1] = lon.
_latlon_fill = df["entity_lower"].map(lambda k: gaz_lookup_latlon(k, by_lower, by_stripped, by_clean))
lat_fill = _latlon_fill.map(lambda pair: pair[0])
lon_fill = _latlon_fill.map(lambda pair: pair[1])

# keep existing coordinates; only missing ones take the gazetteer value
df["lat"] = df["lat"].where(df["lat"].notna(), lat_fill)
df["lon"] = df["lon"].where(df["lon"].notna(), lon_fill)

# country_valid: prefer Block 7 signal; else gaz membership/coords
if "country_code" in df.columns:
    df["country_valid"] = df["country_code"].notna()
else:
    df["country_valid"] = df["entity_lower"].isin(gaz_set) | df["lat"].notna()

# ---------------------- Enrichment flags ----------------------
# named-object (boats etc.)
df["named_object_flag"] = df.apply(lambda r: is_named_object_context(nlp, r.get("entity",""), r.get("sentence","")), axis=1)

# movement / symbolic (coalesce with existing columns if present)
if "movement_verb_present" not in df.columns:
    df["movement_verb_present"] = df.apply(lambda r: movement_verb_present(nlp, r.get("sentence",""), r.get("entity",""), r.get("persons_in_sentence",[])), axis=1)
else:
    # NOTE(review): astype(bool) turns missing values into True — verify the CSV
    # column never contains NaN.
    df["movement_verb_present"] = df["movement_verb_present"].astype(bool)

df["symbolic_context"] = df.apply(lambda r: symbolic_context(nlp, r.get("sentence",""), r.get("entity",""), r.get("persons_in_sentence",[])), axis=1)

# metonymy (keep if present)
if "metonymy_flagged" not in df.columns:
    df["metonymy_flagged"] = df.apply(lambda r: metonymy_flag(nlp, r.get("entity",""), r.get("sentence","")), axis=1)
else:
    df["metonymy_flagged"] = df["metonymy_flagged"].astype(bool)

# ---------------------- Dates ----------------------
# Keep Block 7's date_norm/year/date_granularity if present; else regex year fallback
if "year" not in df.columns:
    df["year"] = df.get("sentence","").apply(extract_year_regex)

# ---------------------- Transport (WordNet/regex + context backfill) ----------------------
if "transport" not in df.columns:
    df["transport"] = pd.NA

sid2t = build_sid_to_transport(df)
# count filled transport values before and after inference to report the gain
prev_t = int(df["transport"].notna().sum())
df["transport"] = df.apply(lambda r: infer_transport_for_row(r, sid2t, enable_wordnet=True), axis=1)
now_t = int(df["transport"].notna().sum())
print(f"Transport filled (helpers): +{now_t - prev_t} (now {now_t}/{len(df)})")

# ---------------------- Final label ----------------------
df["final_label"] = df.apply(final_label_decision, axis=1)

# ---------------------- People summary ----------------------
# comma-separated names, or None when the sentence has no persons
df["people_involved"] = df["persons_in_sentence"].apply(lambda xs: ", ".join(xs) if xs else None)

# ---------------------- Column order ----------------------
preferred_cols = [
    "sentence_id","entity","entity_norm","entity_lower","label",
    "lat","lon","country","country_code","country_valid",
    "symbolic_context","movement_verb_present","metonymy_flagged","named_object_flag",
    "final_label","date_norm","date_granularity","year","transport",
    "start_char","end_char","sentence","persons_in_sentence","people_involved",
    "feature_class","feature_code","disamb_score","match_source"
]
df = reorder_columns(df, preferred_cols)

# ---------------------- Save ----------------------
out_path = outputs / "geoparsing_final_enriched.csv"
out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path, index=False)
print(f"Enriched data saved → {out_path}  ({len(df)} rows)")


Transport filled (helpers): +18 (now 57/231)
Enriched data saved → outputs/geoparsing_final_enriched.csv  (231 rows)


In [11]:
# === Block 10: ML Classification (balanced LR + NOISE-threshold calibration) ===
from __future__ import annotations

import json, re, unicodedata
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support

# ---------------------------------------------------------
# 0) Load data (binary LITERAL vs NOISE; keep negatives; no country filter)
# ---------------------------------------------------------
df = pd.read_csv("outputs/geoparsing_final_enriched.csv")
# keep only the two classes the classifier distinguishes
df = df[df["final_label"].isin(["LITERAL", "NOISE"])].copy()
# Guard: LITERAL must have coords
df = df[~((df["final_label"]=="LITERAL") & (df["lat"].isna() | df["lon"].isna()))].copy()
print("Class counts BEFORE split:\n", df["final_label"].value_counts())

# Persons column may be serialized -> normalize to list (not used here, but safe)
def _as_list(x):
    if isinstance(x, list): return x
    if isinstance(x, str) and x.strip().startswith("["):
        import ast
        try: return [str(t) for t in ast.literal_eval(x)]
        except Exception: return []
    return [] if pd.isna(x) else [str(x)]
# Normalize the column only when the CSV actually carries it.
if "persons_in_sentence" in df.columns:
    df["persons_in_sentence"] = df["persons_in_sentence"].apply(_as_list)

# ---------------------------------------------------------
# 1) Gazetteer (optional cue; robust load)
# ---------------------------------------------------------
try:
    with open("outputs/geonames_cache.json","r") as f:
        _gj = json.load(f)
    gazetteer = {r["name"].lower() for r in _gj if isinstance(r, dict) and "name" in r}
except Exception:
    gazetteer = set()

# ---------------------------------------------------------
# 2) Lexicons / regex
# ---------------------------------------------------------
# lower-case English month names
MONTHS = {"january","february","march","april","may","june","july","august","september","october","november","december"}
# a standalone four-digit year in the range 1800-2099
YEAR_RE = re.compile(r"\b(1[89]\d{2}|20\d{2})\b")

def _nlp_has_vectors(nlp_obj) -> bool:
    try:
        return hasattr(nlp_obj, "vocab") and any(w.has_vector for w in nlp_obj.vocab)
    except Exception:
        return False
# Probed once here and read by extract_features_row; `nlp` is expected to exist
# already in the kernel (it is not assigned in the statements around this line).
_HAS_VECS = _nlp_has_vectors(nlp)  # uses `nlp` initialized earlier

# ---------------------------------------------------------
# 3) Feature extractor (interpretable, leak-free)
# ---------------------------------------------------------
def extract_features_row(row) -> dict:
    """Build the feature dict for one candidate entity mention.

    Parameters
    ----------
    row : mapping-like
        Must support ``row["entity"]``, ``row["sentence"]`` and ``row.get``
        (e.g. a row yielded by ``DataFrame.iterrows()``). ``"start_char"`` is
        optional; an absent or missing (NaN/None) value counts as offset 0.

    Returns
    -------
    dict
        Surface features of the stripped entity string, light sentence-context
        features, a gazetteer-membership flag, and ``entity_sentence_sim``
        (0.0 when the pipeline has no vectors or the similarity call fails).
    """
    entity   = str(row["entity"])
    sentence = str(row["sentence"])

    # A missing offset must not abort feature extraction for the whole frame:
    # int(NaN) raises, so treat it like an absent column.
    raw_start = row.get("start_char", 0)
    start = 0 if pd.isna(raw_start) else int(raw_start)

    ent = entity.strip()
    L   = len(ent)

    # Whole-word month lookup. A substring test would also fire on "maybe",
    # "marched" or "Augustus". The bare word "may" still counts as a month.
    sentence_words = set(re.findall(r"[a-z]+", sentence.lower()))

    feats = {
        # surface
        "entity_len": L,
        "token_count": len(ent.split()),
        "entity_capital_ratio": sum(c.isupper() for c in ent) / (L or 1),
        "starts_with_cap": ent[:1].isupper(),
        "has_digits": any(c.isdigit() for c in ent),
        "has_hyphen": "-" in ent,
        "has_apos": ("'" in ent) or ("’" in ent),
        "is_ascii": ent.isascii(),
        # context-lite
        "entity_position_ratio": start / (len(sentence) or 1),
        "mentions_year": int(bool(YEAR_RE.search(sentence))),
        "mentions_month": int(not MONTHS.isdisjoint(sentence_words)),
        # optional gazetteer cue
        "gazetteer_match": int(ent.lower() in gazetteer),
    }

    # vectors-safe similarity; the sentence is only parsed when the result is used
    if _HAS_VECS:
        doc = nlp(sentence)  # `nlp` comes from earlier blocks
        try:
            feats["entity_sentence_sim"] = doc.similarity(nlp(ent))
        except Exception:
            feats["entity_sentence_sim"] = 0.0
    else:
        feats["entity_sentence_sim"] = 0.0

    return feats

# One feature dict per row, in the row order of `df`.
X_dict = [extract_features_row(r) for _, r in df.iterrows()]
# Labels re-indexed to 0..n-1; alignment with X_dict is positional.
y      = df["final_label"].reset_index(drop=True)

# The vectorizer is fitted on all rows, i.e. before the train/val/test split.
vec = DictVectorizer(sparse=True)
X   = vec.fit_transform(X_dict)

# ---------------------------------------------------------
# 4) Split → Train/Val/Test (stratified)
#    We’ll tune threshold on a validation split, then evaluate on test.
# ---------------------------------------------------------
# main split (train+val vs test); `df` is split alongside so the test rows
# keep their original columns for the export further down
X_trv, X_test, y_trv, y_test, df_trv, df_test = train_test_split(
    X, y, df, test_size=0.20, stratify=y, random_state=42
)
# train vs val
X_train, X_val, y_train, y_val = train_test_split(
    X_trv, y_trv, test_size=0.25, stratify=y_trv, random_state=42
)  # 0.25 of 0.8 = 0.2 → so final: 60/20/20 split

# Encoder is fitted on the training labels only; val/test labels are transformed with it.
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_val_enc   = le.transform(y_val)
y_test_enc  = le.transform(y_test)
classes     = list(le.classes_)
assert set(classes) == {"LITERAL","NOISE"}, f"Expecting binary classes; got {classes}"
# NOTE(review): the thresholding code further down maps `proba >= t` to code 1 and
# decodes it with `le`, so it relies on NOISE being encoded as 1 — confirm
# NOISE_IDX == 1 if the label set ever changes.
NOISE_IDX   = list(classes).index("NOISE")  # index in predict_proba columns

# model
clf = LogisticRegression(
    class_weight="balanced",
    solver="liblinear",
    max_iter=400,
    random_state=42
)
# First fit uses the training portion only, so validation stays unseen for threshold tuning.
clf.fit(X_train, y_train_enc)

# ---------------------------------------------------------
# 5) Threshold calibration on validation set (optimize NOISE F1)
# ---------------------------------------------------------
# Probability of the NOISE class for every validation row.
proba_val = clf.predict_proba(X_val)[:, NOISE_IDX]

def _f1_for_threshold(y_true_enc, proba_pos, thresh: float):
    """Score the NOISE class for a single probability cut-off.

    Rows whose ``proba_pos`` is at least ``thresh`` are assigned code 1, all
    others code 0; both the true and the predicted codes are decoded with the
    module-level ``le`` before scoring.

    Returns
    -------
    tuple
        ``(precision, recall, f1)`` for the ``"NOISE"`` label, with undefined
        values reported as 0.
    """
    predicted_codes = (proba_pos >= thresh).astype(int)  # 1 = NOISE
    precision, recall, f1, _support = precision_recall_fscore_support(
        le.inverse_transform(y_true_enc),
        le.inverse_transform(predicted_codes),
        labels=["NOISE"],
        zero_division=0,
    )
    return precision[0], recall[0], f1[0]

# 99 candidate cut-offs: 0.01, 0.02, ..., 0.99
grid = np.linspace(0.01, 0.99, 99)
# Each entry is (threshold, precision, recall, f1) for the NOISE class on validation.
scores = [(_t, *_f1_for_threshold(y_val_enc, proba_val, _t)) for _t in grid]
# max() keeps the first maximal entry, so among tied F1 values the lowest threshold wins.
best_t, best_p, best_r, best_f1 = max(scores, key=lambda x: x[3])

print(f"Best NOISE threshold on val (by F1): {best_t:.3f}  |  P={best_p:.2f} R={best_r:.2f} F1={best_f1:.2f}")

# retrain on full train+val with same hyperparams
# (the threshold above was tuned on the train-only fit and is reused unchanged)
clf.fit(X_trv, le.transform(y_trv))

# ---------------------------------------------------------
# 6) Evaluate on held-out test with calibrated threshold
# ---------------------------------------------------------
# NOTE(review): the recorded run reports P = R = F1 = 1.00 on both validation and
# test; worth checking whether a feature (e.g. `gazetteer_match`) effectively
# encodes the label before trusting these numbers.
proba_test = clf.predict_proba(X_test)[:, NOISE_IDX]
pred_test_enc = (proba_test >= best_t).astype(int)  # 1 = NOISE, 0 = LITERAL
y_pred = le.inverse_transform(pred_test_enc)

print("\nClassification Report (thresholded):")
print(classification_report(y_test, y_pred))
print("Confusion Matrix (thresholded):")
print(confusion_matrix(y_test, y_pred))

# ---------------------------------------------------------
# 7) Save test predictions for visualization
# ---------------------------------------------------------
# Only the held-out test rows are exported, with two added columns.
df_out = df_test.copy()
df_out["proba_noise"] = proba_test
df_out["ml_prediction"] = y_pred
out_path = "outputs/geoparser_ml_predictions_thresholded.csv"
# to_csv does not create directories; "outputs/" must already exist.
df_out.to_csv(out_path, index=False)
print(f" Saved test predictions → {out_path}")


Class counts BEFORE split:
 final_label
LITERAL    150
NOISE       80
Name: count, dtype: int64
Best NOISE threshold on val (by F1): 0.180  |  P=1.00 R=1.00 F1=1.00

Classification Report (thresholded):
              precision    recall  f1-score   support

     LITERAL       1.00      1.00      1.00        30
       NOISE       1.00      1.00      1.00        16

    accuracy                           1.00        46
   macro avg       1.00      1.00      1.00        46
weighted avg       1.00      1.00      1.00        46

Confusion Matrix (thresholded):
[[30  0]
 [ 0 16]]
 Saved test predictions → outputs/geoparser_ml_predictions_thresholded.csv


In [12]:
# === Block 12: Interactive Map (thresholded predictions; robust & pretty) ===
import pandas as pd
import folium
from folium.plugins import AntPath
from pathlib import Path

# ---------------------------------------------------------
# 0) Load predictions
# ---------------------------------------------------------
pred_path = Path("outputs/geoparser_ml_predictions_thresholded.csv")
if not pred_path.exists():
    raise FileNotFoundError(f"Missing {pred_path}. Run Block 10 first.")

df = pd.read_csv(pred_path)

# defensively fill expected metadata columns, if absent
for col in ["people_involved","transport","year","sentence_id","entity","lat","lon","ml_prediction"]:
    if col not in df.columns:
        df[col] = None

# keep only literal points with coordinates
df_map = df[(df["ml_prediction"]=="LITERAL") & df["lat"].notna() & df["lon"].notna()].copy()
if df_map.empty:
    raise ValueError("No LITERAL rows with coordinates to plot. Check thresholds or upstream steps.")

# order: if you have sentence_id keep its order, else keep current index.
# The rows come from a shuffled split, so places that share a sentence need a
# deterministic tie-break: use the entity offset when the column is available,
# and a stable algorithm so remaining ties keep their file order.
if "sentence_id" in df_map.columns and df_map["sentence_id"].notna().any():
    _sort_keys = ["sentence_id"]
    if "start_char" in df_map.columns:
        _sort_keys.append("start_char")
    df_map = df_map.sort_values(by=_sort_keys, kind="mergesort").reset_index(drop=True)
else:
    df_map = df_map.reset_index(drop=True)

# ---------------------------------------------------------
# 1) Hover text builder
# ---------------------------------------------------------
def _hover_field(value) -> str:
    """Return the display text for one metadata value, or "" when it is missing."""
    # Cells that were empty in the CSV arrive as NaN, which is truthy: it must be
    # tested explicitly or it ends up in the popup as the text "nan".
    if value is None or (isinstance(value, float) and value != value):
        return ""
    # A numeric column containing gaps is read as float; show 1923.0 as "1923".
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    if not value:
        return ""
    text = str(value).strip()
    return "" if text.lower() in ("nan", "none") else text


def generate_hover_info(row):
    """Build the popup HTML for one map point.

    Parameters
    ----------
    row : mapping-like
        Anything with ``.get`` (a DataFrame row or a dict). The keys
        ``"people_involved"``, ``"transport"`` and ``"year"`` are read; each one
        may be absent or missing.

    Returns
    -------
    str
        The available entries joined with ``<br>``, or ``"ℹ️ No metadata"``
        when none of the three fields has a value.
    """
    parts=[]
    p = _hover_field(row.get("people_involved"))
    t = _hover_field(row.get("transport"))
    y = _hover_field(row.get("year"))
    if p: parts.append(f"👥 {p}")
    if t: parts.append(f"🚗 {t}")
    if y: parts.append(f"📅 {y}")
    return "<br>".join(parts) if parts else "ℹ️ No metadata"

# Popup HTML for every point, computed row by row.
df_map["hover_text"] = df_map.apply(generate_hover_info, axis=1)

# ---------------------------------------------------------
# 2) Build map
# ---------------------------------------------------------
# [lat, lon] pairs in plotting order; the first pair centres the map.
route_coords = df_map[["lat", "lon"]].to_numpy().tolist()
start_lat, start_lon = route_coords[0]

m = folium.Map(
    location=[start_lat, start_lon],
    zoom_start=4,
    tiles="CartoDB positron",
)

# A single point has no path to animate.
if len(route_coords) > 1:
    _route = AntPath(route_coords, color="red", weight=3, delay=1000)
    _route.add_to(m)

# One circle per place: tooltip shows the 1-based position and entity text,
# popup shows the metadata HTML built above.
_marker_style = {"radius": 6, "color": "blue", "fill": True, "fill_opacity": 0.85}
for i, row in df_map.iterrows():
    _marker = folium.CircleMarker(
        location=[row["lat"], row["lon"]],
        popup=folium.Popup(row["hover_text"], max_width=320),
        tooltip=f"#{i+1}: {row.get('entity','(unknown)')}",
        **_marker_style,
    )
    _marker.add_to(m)

# ---------------------------------------------------------
# 3) Save
# ---------------------------------------------------------
out_html = Path("outputs/interactive_geoparsing_map.html")
# Create the target folder on demand so the save cannot fail on a fresh checkout.
out_html.parent.mkdir(parents=True, exist_ok=True)
m.save(str(out_html))
print(f"Interactive map saved → {out_html}")


Interactive map saved → outputs/interactive_geoparsing_map.html


In [13]:
from graphviz import Digraph

# ---------- helpers ----------
def stage_node(g, node_id, title, subtitle, emoji, colors):
    """Add one rounded, gradient-filled stage box to a graph.

    Parameters
    ----------
    g : object with a ``node`` method (a graph or subgraph)
    node_id : identifier passed positionally to ``g.node``
    title, subtitle, emoji : text interpolated into the HTML-like label
    colors : pair ``(start, end)`` joined as ``"start:end"`` for the fill
    """
    gradient_from, gradient_to = colors

    # Row 1: emoji followed by the bold title. Row 2: grey subtitle.
    heading_row = (
        "  <TR><TD ALIGN='LEFT'>"
        f"<FONT POINT-SIZE='20'>{emoji}</FONT>  "
        f"<B><FONT FACE='Inter,Helvetica' POINT-SIZE='14'>{title}</FONT></B>"
        "</TD></TR>"
    )
    detail_row = (
        "  <TR><TD ALIGN='LEFT'>"
        f"<FONT FACE='Inter,Helvetica' POINT-SIZE='11' COLOR='#374151'>{subtitle}</FONT>"
        "</TD></TR>"
    )
    html_label = "\n".join([
        "<<TABLE BORDER='0' CELLBORDER='0' CELLSPACING='0'>",
        heading_row,
        detail_row,
        "</TABLE>>",
    ])

    box_attrs = {
        "shape": 'box',
        "style": 'rounded,filled',
        "gradientangle": '90',
        "fillcolor": f"{gradient_from}:{gradient_to}",
        "color": '#111827',
        "penwidth": '1.6',
    }
    g.node(node_id, label=html_label, **box_attrs)

# ---------- diagram ----------
flow = Digraph('GeoPipelinePretty', format='svg')
flow.attr(rankdir='TB', nodesep='0.7', ranksep='0.9', splines='spline')
flow.attr('edge', color='#9CA3AF', penwidth='1.4', arrowsize='0.9')

# palette: (gradient start, gradient end) per stage colour
CLOUD = ('#F3F4F6', '#E5E7EB')
BLUE = ('#DBEAFE', '#BFDBFE')
AMBER = ('#FDE68A', '#FCD34D')
GREEN = ('#BBF7D0', '#86EFAC')
ROSE = ('#FECACA', '#FDA4AF')
VIOLET = ('#E9D5FF', '#C4B5FD')
PINK = ('#F5D0FE', '#FBCFE8')

# Declarative layout, top to bottom:
# (subgraph name, cluster label, [(node id, title, subtitle, emoji, colours), ...])
_clusters = [
    ('cluster_input', '📥 Input', [
        ('input', 'PDF Input', '(source documents)', '📄', CLOUD),
    ]),
    ('cluster_processing', '⚙️ Processing', [
        ('pre', 'Preprocessing', 'cleaning, OCR fixes, stopwords, sentence splitting', '🧹', BLUE),
        ('nlp', 'NLP (NER)', 'spaCy / Stanza for location extraction', '🔍', AMBER),
        ('gaz', 'Gazetteer Matching', 'GeoNames + OSM + fuzzy matching', '🌍', GREEN),
        ('enrich', 'Enrichment', 'add coordinates, country codes, metadata', '📌', ROSE),
        ('route', 'Route Reconstruction', 'context window, transport keywords, symbolic filtering', '🛣️', VIOLET),
    ]),
    ('cluster_output', '📤 Output', [
        ('vis', 'Visualization', 'map output + GeoJSON export', '🗺️', PINK),
    ]),
]

# One dashed cluster per group, one styled box per stage inside it.
for _cluster_name, _cluster_label, _cluster_stages in _clusters:
    with flow.subgraph(name=_cluster_name) as c:
        c.attr(label=_cluster_label, style='dashed', color='#D1D5DB')
        for _stage in _cluster_stages:
            stage_node(c, *_stage)

# edges: the pipeline is a single chain, so link each stage to the next one
_stage_ids = [stage[0] for _n, _l, stages in _clusters for stage in stages]
edges = list(zip(_stage_ids, _stage_ids[1:]))
for a, b in edges:
    flow.edge(a, b)

# export directly in both formats (avoid Digraph.from_dot_data, which doesn't exist)
svg_path = flow.render('geo_pipeline_flowchart_pretty_svg', format='svg', cleanup=True)
png_path = flow.render('geo_pipeline_flowchart_pretty_png', format='png', cleanup=True)

print('SVG saved to:', svg_path)
print('PNG saved to:', png_path)


SVG saved to: geo_pipeline_flowchart_pretty_svg.svg
PNG saved to: geo_pipeline_flowchart_pretty_png.png
