In [None]:
# Pre-Block: Downloads & Setup
# Fetch the NLTK resources requested below (corpora, tokenizer, tagger and chunker data).
# nltk.download() skips packages that are already up to date, so re-running this cell is cheap.
import nltk

nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("framenet_v17")
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('names')

# Install third-party dependencies into the running kernel's environment.
# NOTE(review): versions are unpinned, and the recorded output of this cell ends in a
# pip ERROR while downloading pyogrio (a geopandas dependency) — confirm the install
# succeeds before relying on the geo imports in Block 1.
%pip install geopandas
%pip install geopy
%pip install stanza
%pip install geodatasets
%pip install folium
%pip install xgboost
%pip install scikit-learn
%pip install spacy
%pip install fuzzywuzzy
%pip install contractions
%pip install rapidfuzz
%pip install pycountry



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alicja/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/alicja/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/alicja/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package framenet_v17 to
[nltk_data]     /Users/alicja/nltk_data...
[nltk_data]   Package framenet_v17 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/alicja/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/alicja/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/alicja/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already 

Collecting geopandas
  Downloading geopandas-1.1.1-py3-none-any.whl.metadata (2.3 kB)
Collecting pyogrio>=0.7.2 (from geopandas)
  Downloading pyogrio-0.11.1-cp313-cp313-macosx_12_0_arm64.whl.metadata (5.3 kB)
Collecting pyproj>=3.5.0 (from geopandas)
  Downloading pyproj-3.7.1-cp313-cp313-macosx_14_0_arm64.whl.metadata (31 kB)
Collecting shapely>=2.0.0 (from geopandas)
  Downloading shapely-2.1.1-cp313-cp313-macosx_11_0_arm64.whl.metadata (6.8 kB)
Downloading geopandas-1.1.1-py3-none-any.whl (338 kB)
Downloading pyogrio-0.11.1-cp313-cp313-macosx_12_0_arm64.whl (19.4 MB)
[2K   [91m━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m10.2/19.4 MB[0m [31m89.3 kB/s[0m eta [36m0:01:44[0m
[?25h
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[31mERROR: Exception:
Trace

In [None]:
# Block 1: Imports 


import os, re, time, json, unicodedata
from pathlib import Path


import yaml
import fitz  # PyMuPDF
import requests

# Data / ML
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

# Geo / viz
import folium
from folium.plugins import AntPath
from shapely.geometry import Point
from geopy.distance import geodesic

from typing import List, Tuple, Dict, Set, Union
from tqdm import tqdm
from rapidfuzz import fuzz




# NLP / NLTK
import nltk
from nltk.corpus import stopwords, names, wordnet as wn, framenet as fn
from nltk import ne_chunk, pos_tag, word_tokenize

# Project helpers
from geoparser import nlp_helpers as nh
import importlib; importlib.reload(nh)


# Build the shared spaCy pipeline (and, optionally, a Stanza pipeline) via the project helper.
# Stanza is explicitly disabled here (with_stanza=False).
nlp, stanza_pipeline = nh.init_nlp(lang="es", with_stanza=False)

# NOTE(review): the recorded output of this cell is ['sentencizer'] only. With such a
# pipeline doc.ents would be empty and token.lemma_ unset, which later cells rely on — confirm.
print("spaCy pipeline:", nlp.pipe_names)

# stopwords for mixed ES/PT/EN corpora:
stopset = nh.get_stopwords(nlp, langs=["es", "pt", "en"])






spaCy pipeline: ['sentencizer']


In [None]:

# Block 2: Extract text from PDF using PyMuPDF; pages 28-148

def extract_text_from_pdf(pdf_path: str, start_page: int, end_page: int) -> str:
    """Extract the text of a page range from a PDF with PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to open.
        start_page: Index of the first page to read (slice start).
        end_page: Index one past the last page to read (slice stop).

    Returns:
        The text of the selected pages, joined with newlines.
    """
    # The context manager closes the document even if extraction raises;
    # previously the document (and its file handle) was left open.
    with fitz.open(pdf_path) as doc:
        pages = doc[start_page:end_page]
        # join() consumes the generator while the document is still open.
        return "\n".join(page.get_text() for page in pages)



#Load configuration from YAML file
def load_config(config_path: str = "config.yaml") -> dict:
    """Load a YAML configuration file and return it as a dictionary.

    Args:
        config_path: Path of the YAML file; defaults to ``config.yaml`` in the
            working directory.

    Returns:
        The parsed configuration. An empty file yields ``{}`` rather than
        ``None`` so the result matches the annotated return type.
    """
    # Explicit UTF-8 keeps non-ASCII config values from depending on the
    # platform's default encoding.
    with open(config_path, "r", encoding="utf-8") as file:
        return yaml.safe_load(file) or {}
    
# Read the PDF location and page range from the "pdf" section of config.yaml,
# then extract the raw text that every later block works from.
config = load_config()
pdf_conf = config["pdf"]
raw_text = extract_text_from_pdf(pdf_conf["path"], pdf_conf["start_page"], pdf_conf["end_page"])




In [None]:
#Block 3: Text Preprocessing Functions

from geoparser.nlp_helpers import (
    init_nlp,
    get_stopwords,
    tag_named_entities,
    extract_text_from_pdf,
    load_config,
    normalize_punctuation,
    clean_light,
    preprocess_text,
    segment_sentences,
    clean_heavy,
    load_filters,
    normalize_diacritics,
    apply_ocr_replacements,
    is_caption_line,
    filter_raw_sentences
    
)

In [None]:
# Block 4: Clean and Save Tagged + NLP Versions 

# Load config
config = load_config()
pdf_conf = config["pdf"]
gaz_conf = config["gazetteer"]

# Output filenames are derived from the input PDF's stem
base_name = Path(pdf_conf["path"]).stem
tagged_path = Path("outputs") / f"cleaned_{base_name}_geoparsing.txt"
heavy_path = Path("outputs") / f"cleaned_{base_name}_nlp.txt"

# Create outputs dir if needed
os.makedirs("outputs", exist_ok=True)

# Light cleaning feeds sentence segmentation; each sentence gets a 1-based [SENT n] tag
light_cleaned = clean_light(raw_text)
sentences = segment_sentences(light_cleaned, nlp)
sentences_with_tags = [f"[SENT {i+1}] {s}" for i, s in enumerate(sentences)]

# Heavy cleaning uses the helper's default stopword list for this pipeline
all_stops = get_stopwords(nlp)
heavy_cleaned = clean_heavy(light_cleaned, nlp, all_stops)

# Save both cleaned versions.
# TODO: the heavy (NLP) file appears to be unused by later blocks and could be dropped.
with open(tagged_path, "w", encoding="utf-8") as f:
    f.write("\n".join(sentences_with_tags))

with open(heavy_path, "w", encoding="utf-8") as f:
    f.write(heavy_cleaned)

print("Cleaned outputs saved:")
print(f"- Geoparsing (light): {tagged_path}")
print(f"- NLP prep (heavy): {heavy_path}")

# Prepare sentence_data for Block 7 as (sentence_id, text) tuples.
# Note: these ids are 0-based, whereas the [SENT n] tags written above are 1-based.
sentence_data = [(i, sent.strip()) for i, sent in enumerate(sentences)]
print(f"sentence_data prepared with {len(sentence_data)} narrative sentences.")



✅ Cleaned outputs saved:
- Geoparsing (light): outputs/cleaned_MotorcycleDiaries_geoparsing.txt
- NLP prep (heavy): outputs/cleaned_MotorcycleDiaries_nlp.txt
✅ sentence_data prepared with 1975 narrative sentences.


In [None]:
# Build symbolic verb lexicon from WordNet (only once)


def get_symbolic_verb_synonyms():
    """Expand a seed list of "symbolic" verbs with their WordNet verb lemmas.

    Returns:
        A set of lower-cased lemma names (underscores replaced by spaces)
        gathered from every verb synset of every seed word.
    """
    seed_verbs = [
        "dream", "hope", "struggle", "escape", "resist", "believe", "follow",
        "ride", "rebel", "fight", "flee", "live", "return", "envision", "imagine",
        "create", "inspire", "transform", "change", "grow",
    ]
    return {
        lemma.name().lower().replace("_", " ")
        for seed in seed_verbs
        for synset in wordnet.synsets(seed, pos=wordnet.VERB)
        for lemma in synset.lemmas()
    }

from nltk.corpus import wordnet # WordNet reader used by the two lexicon builders in this cell
# Build the expanded symbolic verb set once so later cells can reuse it
SYMBOLIC_VERBS = get_symbolic_verb_synonyms()
print(f"Loaded {len(SYMBOLIC_VERBS)} symbolic verb forms from WordNet.")

# Build movement verb list using WordNet

def get_movement_verbs():
    """Expand a seed list of movement verbs with their WordNet verb lemmas.

    Returns:
        A set of lower-cased lemma names (underscores replaced by spaces)
        gathered from every verb synset of every seed word.
    """
    seed_verbs = [
        "go", "move", "travel", "walk", "drive", "ride", "arrive", "depart",
        "leave", "return", "cross", "fly", "sail", "swim", "follow", "hike",
    ]
    return {
        lemma.name().lower().replace("_", " ")
        for seed in seed_verbs
        for synset in wordnet.synsets(seed, pos=wordnet.VERB)
        for lemma in synset.lemmas()
    }

# Build the expanded movement verb set once and report its size
MOVEMENT_VERBS = get_movement_verbs()
print(f"Loaded {len(MOVEMENT_VERBS)} movement verb forms from WordNet.")

🧠 Loaded 227 symbolic verb forms from WordNet.
🛣️ Loaded 238 movement verb forms from WordNet.


In [None]:
# Block 6: Build Gazetteer


# JSON 
def make_json_safe(obj):
    """``json.dump(default=...)`` hook: turn sets into lists, reject anything else.

    Raises:
        TypeError: If ``obj`` is not a ``set``.
    """
    if not isinstance(obj, set):
        raise TypeError(f"Not JSON serializable: {type(obj)}")
    return list(obj)

# Load config.yaml and pick out the gazetteer settings
with open("config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)
gaz_conf = config["gazetteer"]

# Gazetteer path
gazetteer_path = Path("outputs/gazetteer_cities.json")

# Build or load gazetteer: reuse the JSON file when present, otherwise build and save it
if gazetteer_path.exists():
    print("Found existing gazetteer file, loading...")
    with open(gazetteer_path, "r", encoding="utf-8") as f:
        gazetteer = json.load(f)
else:
    print("No gazetteer file found, building...")
    # build_gazetteer is otherwise only imported in the later Block 7 cell, so on a
    # fresh kernel this branch raised NameError; import it where it is first needed.
    from geoparser.gazetteer_helpers import build_gazetteer
    gazetteer = build_gazetteer(
        username=gaz_conf["username"],
        countries=gaz_conf["countries"],
        max_rows=gaz_conf["max_rows"]
    )
    # Make sure the target directory exists even if earlier cells were skipped
    gazetteer_path.parent.mkdir(parents=True, exist_ok=True)
    with open(gazetteer_path, "w", encoding="utf-8") as f:
        json.dump(gazetteer, f, indent=2, default=make_json_safe)
    print("Gazetteer built and saved with coordinates.")

📂 Found existing gazetteer file, loading...


In [None]:
# Load the filter settings (OCR replacements, caption/footnote rules) from config.yaml
cfg = load_config("config.yaml")
filters = load_filters(cfg)

# Apply OCR fixes to the original text (case is preserved).
# sentence_data is normally a list of (sentence_id, text) tuples; the else branch
# handles a plain list of strings.
if isinstance(sentence_data, list) and sentence_data and isinstance(sentence_data[0], tuple):
    sentence_data_fixed = []
    for sid, txt in sentence_data:
        txt2 = apply_ocr_replacements(txt, filters["ocr_replacements"])
        sentence_data_fixed.append((sid, txt2))
else:
    sentence_data_fixed = [
        apply_ocr_replacements(txt, filters["ocr_replacements"]) for txt in sentence_data
    ]

# Drop footnotes/captions/sections based on config; these were an early source of noise
sentence_data_prefiltered = filter_raw_sentences(sentence_data_fixed, filters)

print(f"Pre-filtered sentences: kept {len(sentence_data_prefiltered)} / {len(sentence_data)}")
# Block 7 reads its input from this name
SENTENCES_FOR_NER = sentence_data_prefiltered


🧼 Pre-filtered sentences: kept 1933 / 1975


In [None]:
# Block 7: NER + Gazetteer with Metonymy Filtering (Country-Aware) 


from geoparser.gazetteer_helpers import (
    build_gazetteer, gazetteer_names,
    build_gazetteer_patterns, match_gazetteer_precompiled,
    remove_overlapping_shorter
)

# --- Ensure NLP is loaded (nlp, stanza_pipeline come from the project helpers) ---
# The bare-name probes below only re-initialise when an earlier cell was not run.
try:
    nlp  # noqa: F821
except NameError:
    nlp, stanza_pipeline = init_nlp()  # noqa: F821

# Stanza is only used if a truthy stanza_pipeline global exists
USE_STANZA = bool(globals().get("stanza_pipeline"))

# Show the spaCy components so the reader can check whether an NER component is present
try:
    print("spaCy pipeline:", getattr(nlp, "pipe_names", []))
except Exception:
    pass

# Stopwords and config
stop_words = get_stopwords(nlp)   # noqa: F821
try:
    cfg  # noqa: F821
except NameError:
    cfg = load_config("config.yaml")  # noqa: F821

# Gazetteer: try cache -> then build if needed
# Location of the on-disk gazetteer cache, and a flag recording whether it was used
CACHE_PATH = Path("outputs/geonames_cache.json")
gazetteer_loaded_from_cache = False

def _load_gazetteer_cache(path: Path) -> Dict[str, Dict]:
    if not path.exists():
        return {}
    try:
        df = pd.read_json(path)
        # stored as index = name; orient=index
        if "name" in df.columns:
            df = df.set_index("name")
        return df.to_dict(orient="index")
    except Exception:
        return {}

def _save_gazetteer_cache(g: Dict[str, Dict], path: Path) -> None:
    try:
        Path(path).parent.mkdir(parents=True, exist_ok=True)
        df = pd.DataFrame.from_dict(g, orient="index").reset_index().rename(columns={"index":"name"})
        df.to_json(path, orient="records")
        print(f"💾 Saved gazetteer cache: {path}")
    except Exception:
        pass

# Build gazetteer (must include coords + country info).
# Only runs when no `gazetteer` global exists yet (e.g. Block 6 was skipped):
# first try the cache file, then fall back to building from config and caching the result.
try:
    gazetteer  # noqa: F821
except NameError:
    gazetteer = _load_gazetteer_cache(CACHE_PATH)
    if gazetteer:
        gazetteer_loaded_from_cache = True
        print(f"💾 Loaded gazetteer cache: {len(gazetteer)} names")
    else:
        gcfg = cfg.get("gazetteer", {})
        gazetteer = build_gazetteer(
            username=gcfg.get("username", ""),
            countries=gcfg.get("countries", []),
            max_rows=int(gcfg.get("max_rows", 1000))
        )
        _save_gazetteer_cache(gazetteer, CACHE_PATH)

# Ensure gazetteer has country fields; rebuild or retrofit if needed
def _gaz_has_country(g: Dict[str, Dict]) -> bool:
    return bool(g) and all(("country" in v and "country_code" in v) for v in g.values())

def _retrofill_country(g: Dict[str, Dict]) -> int:
    COUNTRY_NAME = {
        "AR":"Argentina","CL":"Chile","PE":"Peru","CO":"Colombia","VE":"Venezuela",
        "BO":"Bolivia","EC":"Ecuador","PA":"Panama","CR":"Costa Rica","GT":"Guatemala",
        "MX":"Mexico","CU":"Cuba","BR":"Brazil","GY":"Guyana","PY":"Paraguay",
        "SR":"Suriname","UY":"Uruguay","HN":"Honduras","SV":"El Salvador","NI":"Nicaragua"
    }
    changed = 0
    for k, v in g.items():
        cc = v.get("country_code") or v.get("countryCode") or v.get("cc")
        if cc and "country" not in v:
            v["country"] = COUNTRY_NAME.get(cc, cc)
            changed += 1
    return changed

# If any entry lacks country fields, first try to derive them in place; only when that
# is not enough fall back to a rebuild (skipped for cache-loaded data).
if not _gaz_has_country(gazetteer):
    fixed = _retrofill_country(gazetteer)
    if not _gaz_has_country(gazetteer):
        # only rebuild if we didn't load from cache (avoid rate limit loops)
        if gazetteer_loaded_from_cache:
            print("Gazetteer cache lacks country fields; rebuild later when API quota allows.")
        else:
            print("↺ Rebuilding gazetteer with country fields (old object lacked them)...")
            gcfg = cfg.get("gazetteer", {})
            gazetteer = build_gazetteer(
                username=gcfg.get("username", ""),
                countries=gcfg.get("countries", []),
                max_rows=int(gcfg.get("max_rows", 1000))
            )
            _save_gazetteer_cache(gazetteer, CACHE_PATH)
            # A rebuild that still lacks the fields is treated as a hard error
            if not _gaz_has_country(gazetteer):
                raise RuntimeError("Rebuilt gazetteer still lacks 'country'/'country_code'. Check GeoNames username and network.")
    else:
        print(f"Retrofilled 'country' names for {fixed} entries from existing 'country_code'.")

# Country names (English, lower-case). match_gazetteer_safe drops gazetteer hits that are
# a single word from this set; the multi-word entries never match that single-word test.
_COUNTRY_NAMES_EN = {
    "argentina","chile","peru","colombia","venezuela","bolivia","ecuador",
    "panama","costa rica","guatemala","mexico","cuba","brazil","guyana",
    "paraguay","suriname","uruguay","honduras","el salvador","nicaragua"
}

# Safer gazetteer name gate BEFORE building regex
_EN_STOP = set(stopwords.words("english"))

# Head-words that are too generic on their own; banned as single-word names
# (these were outliers that could not be classified otherwise)
BANNED_SINGLE_HEADS = {
    "hospital","station","school","airport","bridge","park","market",
    "university","college","city","province","region","mama","friends","best"
}

# Full names / bigrams that slipped through earlier; hard-coded as a last resort
# because the regex matching was too permissive
BANNED_FULL_NAMES = {"the best"}

def _looks_like_place_name(name: str) -> bool:
    """Gate a gazetteer name before it is compiled into a matching pattern.

    Rejects empty names, explicitly banned full names, names without any
    letter run, single banned head-words, and names made only of English
    stopwords (e.g. "the best").
    """
    candidate = (name or "").strip().lower()
    if not candidate or candidate in BANNED_FULL_NAMES:
        return False
    words = re.findall(r"[a-zà-ÿ]+", candidate)
    if not words:
        return False
    banned_singleton = len(words) == 1 and words[0] in BANNED_SINGLE_HEADS
    return not (banned_singleton or all(word in _EN_STOP for word in words))

# Keep only gazetteer names that pass the gate, then precompile the matching patterns once
_PLACES: Set[str] = {n for n in gazetteer_names(gazetteer) if _looks_like_place_name(n)}
GAZ_PATTERNS = build_gazetteer_patterns(_PLACES, _EN_STOP)

def match_gazetteer_safe(text: str) -> List[Tuple[str, str, int, int]]:
    """Match gazetteer patterns in ``text`` and drop risky hits.

    Single-word country names and banned full names are discarded; every
    surviving hit is returned as ``(surface, "GAZETTEER", start, end)``.
    """
    accepted = []
    for surface, _label, begin, finish in match_gazetteer_precompiled(text, GAZ_PATTERNS):
        bare_country = " " not in surface and surface.lower() in _COUNTRY_NAMES_EN
        if bare_country or surface.strip().lower() in BANNED_FULL_NAMES:
            continue
        accepted.append((surface, "GAZETTEER", begin, finish))
    return accepted

# English verb lexicons (for metonymy / movement cues).
# The builder functions come from the earlier WordNet cell; calling them again here
# recomputes the same sets so this cell does not depend on the earlier globals.
SYMBOLIC_VERBS = get_symbolic_verb_synonyms()
MOVEMENT_VERBS = get_movement_verbs()

# --- NER extractors (spaCy + optional Stanza) ---
def extract_entities_spacy(text: str) -> List[Tuple[str, str, int, int]]:
    """Run spaCy on ``text`` and return ``(text, label, start_char, end_char)`` per entity."""
    found = []
    for span in nlp(text).ents:
        found.append((span.text, span.label_, span.start_char, span.end_char))
    return found

def extract_entities_stanza(text: str) -> List[Tuple[str, str, int, int]]:
    """Return Stanza entities as ``(text, type, start_char, end_char)`` tuples.

    Yields an empty list when Stanza is disabled, not initialised, or the
    pipeline call raises.
    """
    if not (USE_STANZA and stanza_pipeline is not None):
        return []
    try:
        parsed = stanza_pipeline(text)
    except Exception:
        return []
    return [
        (ent.text, ent.type, ent.start_char, ent.end_char)
        for sentence in parsed.sentences
        for ent in getattr(sentence, "ents", [])
    ]

# Lexical filters & helpers
# Lower-cased first names from the NLTK `names` corpus, plus English month and day names;
# all three are used to reject single-token candidates that are not places.
_EN_FIRSTNAMES: Set[str] = {n.lower() for n in names.words()}
_MONTHS = {"january","february","march","april","may","june","july","august","september","october","november","december"}
_DAYS   = {"monday","tuesday","wednesday","thursday","friday","saturday","sunday"}

def _is_stoplike(tok: str) -> bool:
    """True when the lower-cased token is an English stopword, month name or day name."""
    lowered = tok.lower()
    for vocabulary in (_EN_STOP, _MONTHS, _DAYS):
        if lowered in vocabulary:
            return True
    return False

def _looks_like_firstname(tok: str) -> bool:
    """True when the lower-cased token appears in the first-name list."""
    lowered = tok.lower()
    return lowered in _EN_FIRSTNAMES

def _valid_toponym(span_text: str) -> bool:
    """Light sanity check on a candidate place span.

    Spans without any letters are rejected. Multi-word spans always pass.
    A single-word span is rejected when the span is all lower-case, or the
    word is stop-like or looks like a first name.
    """
    stripped = span_text.strip()
    words = re.findall(r"[A-Za-zÀ-ÿ]+", stripped)
    if not words:
        return False
    if len(words) != 1:
        return True
    only = words[0]
    rejected = stripped.islower() or _is_stoplike(only) or _looks_like_firstname(only)
    return not rejected

# Leading words of composite place names; spans starting with one of these are
# exempt from the person-prefix rule.
COMPOSITE_HEADS = {"villa","puerto","bahía","bahia","río","rio","cerro","san","santa","santo"}
# Single words that are only accepted as gazetteer hits when the word itself is a gazetteer name
AMBIGUOUS_SINGLETONS = {"sierra","villa","serra","rio"}

def _person_prefix_rule(sent_text: str, span_text: str) -> bool:
    """True when the span is directly preceded by a capitalised first name.

    Spans whose first word is a composite head (e.g. "San", "Puerto") are
    never treated as person surnames.
    """
    leading = re.findall(r"[A-Za-zÀ-ÿ]+", span_text.lower())
    if leading and leading[0] in COMPOSITE_HEADS:
        return False
    pattern = r"\b([A-Z][a-z]+)\s+" + re.escape(span_text) + r"\b"
    preceding = re.search(pattern, sent_text)
    if not preceding:
        return False
    return preceding.group(1).lower() in _EN_FIRSTNAMES

def _ambiguous_singleton_ok(ent_text: str, sentence: str, gazetteer_names: Set[str]) -> bool:
    """Accept an entity unless it is a lone ambiguous word missing from the gazetteer names.

    ``sentence`` is accepted for interface compatibility but not consulted.
    """
    words = re.findall(r"[A-Za-zÀ-ÿ]+", ent_text.strip())
    if len(words) != 1:
        return True
    word = words[0].lower()
    return word not in AMBIGUOUS_SINGLETONS or word in gazetteer_names

def _norm_entity(s: str) -> str:
    s = s.strip().lower()
    s = re.sub(r"[’']s\b", "", s)
    s = s.strip('"\'' "“”‘’")
    s = unicodedata.normalize("NFKD", s).encode("ascii","ignore").decode("utf-8")
    return re.sub(r"\s+", " ", s)

# Normalised names that is_named_object always treats as objects rather than places
NOISE_NAMES   = {"la poderosa","la poderosa i","la poderosa ii","la pedrosa"}
# Vehicle lemmas; an entity token found close to one of these is treated as a named object
VEHICLE_TERMS = {"motorcycle","motorbike","bike","bicycle","boat","ship","raft","car","truck","jeep","bus","van"}

def is_named_object(entity_text: str, sentence: str) -> bool:
    """True when the entity looks like a named object (e.g. a vehicle), not a place.

    Either its normalised form is a known noise name, or a token equal to it
    lies within six tokens of a vehicle term in the sentence.
    """
    target = _norm_entity(entity_text)
    if target in NOISE_NAMES:
        return True
    doc = nlp(sentence)
    n_tokens = len(doc)
    for pos, token in enumerate(doc):
        if token.lemma_.lower() not in VEHICLE_TERMS:
            continue
        lo, hi = max(0, pos-6), min(n_tokens, pos+7)
        for neighbour in doc[lo:hi]:
            if _norm_entity(neighbour.text) == target:
                return True
    return False

def is_probable_metonymy(entity_text: str, sentence: str, label: str) -> bool:
    """Flag a GPE/COUNTRY mention that sits close to an institutional cue noun.

    Returns True when some token containing the lower-cased entity text is
    within five tokens of a token whose lemma is a cue noun.

    NOTE(review): the entity is compared against individual token texts, so a
    multi-word entity presumably never matches — confirm whether intended.
    """
    if label not in {"GPE","COUNTRY"}:
        return False
    doc = nlp(sentence)
    cue_nouns = {"government","policy","military","regime","parliament","industry","media","press","power"}
    cue_positions = [j for j, tok in enumerate(doc) if tok.lemma_.lower() in cue_nouns]
    if not cue_positions:
        return False
    needle = entity_text.lower()
    for i, tok in enumerate(doc):
        if needle in tok.text.lower() and any(abs(j - i) <= 5 for j in cue_positions):
            return True
    return False

# Cache one spaCy Doc per sentence, keyed by sentence id, so NER runs once per sentence
DOCS = {sid: nlp(text) for sid, text in SENTENCES_FOR_NER}  
# PERSON entity texts found in each sentence
persons_by_sentence = {
    sid: [ent.text for ent in DOCS[sid].ents if ent.label_ == "PERSON"]
    for sid, _ in SENTENCES_FOR_NER  
}

# Global + manual person blacklist (suppresses e.g. "Granado" even when no PERSON is
# detected in the same sentence). HEADS holds the last word of each person mention.
GLOBAL_PERSONS: Set[str] = {p.lower().strip() for plist in persons_by_sentence.values() for p in plist}
GLOBAL_PERSON_HEADS: Set[str] = {p.split()[-1] for p in GLOBAL_PERSONS if p}

# Corpus-specific safety net
PERSON_BLACKLIST: Set[str] = {
    "alberto granado","granado","ernesto","ernesto guevara","che","guevara"
}

def _looks_like_person_here(ent_text: str, sid: int) -> bool:
    """True when the entity equals, or fuzzily matches (token-set ratio >= 90),
    a PERSON mention recorded for sentence ``sid``."""
    cand = ent_text.lower().strip()
    local_people = [name.lower() for name in persons_by_sentence.get(sid, [])]
    if cand in local_people:
        return True
    for person in local_people:
        if fuzz.token_set_ratio(cand, person) >= 90:
            return True
    return False

#  Main pipeline 
def combine_ner_gazetteer(sentences: List[Tuple[int, str]], gazetteer_set: Set[str]) -> List[Dict]:
    """Collect place mentions per sentence from spaCy, optional Stanza and the gazetteer.

    For every sentence the candidates are filtered (label whitelist, toponym
    sanity, person/object/metonymy rules), then reduced to non-overlapping
    spans with the longest span winning.

    Args:
        sentences: ``(sentence_id, text)`` pairs; each id must be a key of ``DOCS``.
        gazetteer_set: Gazetteer names used by the ambiguous-singleton check.

    Returns:
        One dict per kept span with keys ``sentence_id``, ``entity``,
        ``entity_norm``, ``label``, ``start_char``, ``end_char`` and ``sentence``.
    """
    allowed = {"GPE","LOC","GAZETTEER","FAC","CITY","STATE_OR_PROVINCE","COUNTRY"}
    results: List[Dict] = []

    for sid, text in tqdm(sentences, desc="NER + Gazetteer"):
        ents: List[Tuple[str,str,int,int]] = []
        # NER first (keep these unless they fail the *light* checks); uses the cached Doc
        ents += [(ent.text, ent.label_, ent.start_char, ent.end_char) for ent in DOCS[sid].ents]
        if USE_STANZA and stanza_pipeline is not None:
            try:
                for sent in stanza_pipeline(text).sentences:
                    for ent in getattr(sent, "ents", []):
                        ents.append((ent.text, ent.type, ent.start_char, ent.end_char))
            except Exception:
                # Stanza failures are ignored; spaCy and gazetteer hits still apply
                pass
        # Gazetteer hits
        ents += match_gazetteer_safe(text)

        keep = []
        for ent_text, label, start, end in ents:
            if label not in allowed:
                continue
            if not _valid_toponym(ent_text):
                continue
            if _person_prefix_rule(text, ent_text):
                continue

            low = ent_text.lower().strip()

            # suppress gazetteer hits that are actually persons (local + global + manual)
            if label == "GAZETTEER":
                if _looks_like_person_here(ent_text, sid):
                    continue
                if low in GLOBAL_PERSONS or low in GLOBAL_PERSON_HEADS or low in PERSON_BLACKLIST:
                    continue

            # if similar to a PERSON mention, only suppress for gazetteer; keep NER
            # (compares the original-case strings, unlike the lower-cased check above)
            if any(fuzz.token_set_ratio(ent_text, p) >= 90 for p in persons_by_sentence.get(sid, [])):
                if label == "GAZETTEER":
                    continue

            # NOTE: both helpers below call nlp(sentence) again for every candidate
            if is_named_object(ent_text, text):
                continue
            if label == "GAZETTEER" and not _ambiguous_singleton_ok(ent_text, text, gazetteer_set):
                continue
            # CITY / STATE_OR_PROVINCE / COUNTRY labels are treated as GPE for the metonymy test
            if is_probable_metonymy(ent_text, text, "GPE" if label in {"CITY","STATE_OR_PROVINCE","COUNTRY"} else label):
                continue

            keep.append((ent_text, label, start, end))

        # Per-sentence longest-match + strict dedup (exact + near-dup)
        spans = []
        if keep:
            # Sort candidates by character length so longer spans claim their range first
            dfk = pd.DataFrame(
                [{"t":t, "l":l, "s":s, "e":e, "len": e-s} for t,l,s,e in keep]
            ).sort_values("len", ascending=False)
            for r in dfk.itertuples():
                # accept only if the candidate overlaps no span accepted so far
                if not any(not (r.e <= s or r.s >= e) for _,_,s,e in spans):
                    # avoid adding duplicates that are nearly the same string in same sentence
                    if any(fuzz.token_set_ratio(r.t, t0) >= 95 for t0,_,_,_ in spans):
                        continue
                    spans.append((r.t, r.l, r.s, r.e))

        for ent_text, label, start, end in spans:
            results.append({
                "sentence_id": sid,
                "entity": ent_text,
                "entity_norm": ent_text.lower().strip(),
                # collapse the fine-grained location labels into GPE
                "label": ("GPE" if label in {"CITY","STATE_OR_PROVINCE","COUNTRY"} else label),
                "start_char": start,
                "end_char": end,
                "sentence": text,
            })
    return results

# Run pipeline + prune overlaps
print("🔎 Running ensemble NER + gazetteer matching (with metonymy & filters)...")
entities_combined = combine_ner_gazetteer(SENTENCES_FOR_NER, _PLACES)  # noqa: F821
# NOTE(review): if no entity survives, this frame has no columns and the enrichment
# below would fail on df_combined["entity"] — confirm whether that case needs a guard.
df_combined = pd.DataFrame(entities_combined)
df_combined = remove_overlapping_shorter(df_combined)

# Enrich with coordinates + country (diacritics-safe) 
def strip_diacritics(s: str) -> str:
    """Fold a string to ASCII by decomposing it (NFKD) and dropping non-ASCII
    code points; a falsy input yields ``""``."""
    decomposed = unicodedata.normalize("NFKD", s or "")
    ascii_bytes = decomposed.encode("ascii","ignore")
    return ascii_bytes.decode("utf-8")

# Flatten the gazetteer into a lookup table keyed by name and by diacritics-stripped name
gaz_rows = [{
    "name_lower":   n,
    "name_stripped":strip_diacritics(n),
    "lat":          geo.get("lat"),
    "lon":          geo.get("lon"),
    "country":      geo.get("country"),
    "country_code": geo.get("country_code"),
} for n, geo in gazetteer.items()]
# Explicit columns keep drop_duplicates() from raising KeyError on an empty gazetteer
gaz_df = pd.DataFrame(
    gaz_rows,
    columns=["name_lower","name_stripped","lat","lon","country","country_code"],
).drop_duplicates(subset=["name_lower"])

df_combined["entity_lower"]    = df_combined["entity"].str.lower()
df_combined["entity_stripped"] = df_combined["entity_lower"].map(strip_diacritics)

# First pass: exact match on the lower-cased entity
df_enriched = df_combined.merge(gaz_df, left_on="entity_lower", right_on="name_lower", how="left")

# Second pass for rows still without coordinates: match on the diacritics-stripped name.
# The lookup is deduplicated and applied with an index-aligned map, so it cannot multiply
# rows or depend on df_combined and df_enriched sharing the same index (the previous
# positional `.values` assignment broke in both of those cases).
missing = df_enriched["lat"].isna()
if missing.any():
    gaz_by_stripped = gaz_df.drop_duplicates(subset=["name_stripped"]).set_index("name_stripped")
    stripped_keys = df_enriched.loc[missing, "entity_stripped"]
    for col in ["lat","lon","country","country_code"]:
        df_enriched.loc[missing, col] = stripped_keys.map(gaz_by_stripped[col])

#  restrict to South America countries (prevents MX/Central America leakage) ---
SOUTH_AM = {"AR","CL","PE","CO","VE","BO","EC","BR","PY","UY","GY","SR"}
if "country_code" in df_enriched.columns:
    before_sa = len(df_enriched)
    # rows without a gazetteer match have no country_code and are dropped here too
    df_enriched = df_enriched[df_enriched["country_code"].isin(SOUTH_AM)].copy()
    print(f"🌎 Restricted to South America: {before_sa} -> {len(df_enriched)} rows")


if df_enriched["country"].isna().all():
    print(" Country enrichment missing for all rows — check gazetteer build & GeoNames credentials.")

# Persist the result for Block 8 (helper join columns are not exported)
Path("outputs").mkdir(parents=True, exist_ok=True)
out_csv = "outputs/geoparsing_ner_ensemble.csv"
df_enriched.drop(columns=["name_lower","name_stripped"], errors="ignore").to_csv(out_csv, index=False)
print(f"Saved: {out_csv}")


spaCy pipeline: ['sentencizer']
🔎 Running ensemble NER + gazetteer matching (with metonymy & filters)...


NER + Gazetteer: 100%|██████████| 1933/1933 [02:18<00:00, 14.00it/s]

🌎 Restricted to South America: 391 -> 332 rows
✅ Saved: outputs/geoparsing_ner_ensemble.csv





In [None]:
# Block 8: Symbolic Enrichment (with Enhanced Metadata Extraction) ===




# 0) Resolve inputs (config + presence)

CFG_PATH = Path("config.yaml")
cfg = {}
if CFG_PATH.exists():
    with open(CFG_PATH, encoding="utf-8") as f:
        cfg = yaml.safe_load(f) or {}

outputs = Path("outputs")
full_csv   = outputs / "geoparsing_ner_ensemble.csv"

# The sample/full switch (geoparsing_ner_sample_test.csv, cfg["testing"]["sample_only"])
# is disabled, and with it the only assignment of `in_path` had been commented out, so
# the read below raised NameError on a fresh kernel. Always use the full Block 7 output.
in_path = full_csv
if not in_path.exists():
    raise FileNotFoundError(f"Block 7 output not found: {in_path} — run Block 7 first.")
print(f"Using Block 7 output: {in_path}")


# 1) Load gazetteer

gaz_path = outputs / "gazetteer_cities.json"
with open(gaz_path, "r", encoding="utf-8") as f:
    gazetteer = json.load(f)  # keys are lowercase
gazetteer_set = set(gazetteer.keys())


# 2) Load Block 7 result

df = pd.read_csv(in_path)

# persons_in_sentence is a serialized list -> parse safely.
# `ast` is not imported anywhere else in the notebook; without this import the NameError
# was swallowed by the broad except and every serialized list silently became [].
import ast


def _safe_list(x):
    """Coerce a CSV cell into a list of person names.

    Lists pass through, strings that look like a list literal are parsed
    (``[]`` when malformed), missing values become ``[]`` and any other
    value is wrapped as a single-item list of its string form.
    """
    if isinstance(x, list):
        return x
    if isinstance(x, str) and x.strip().startswith("["):
        try:
            return ast.literal_eval(x.strip())
        except (ValueError, TypeError, SyntaxError):
            # malformed literal: treat as "no persons" rather than failing the run
            return []
    return [] if pd.isna(x) else [str(x)]


if "persons_in_sentence" in df.columns:
    df["persons_in_sentence"] = df["persons_in_sentence"].apply(_safe_list)
else:
    # Block 7 did not export the column: use an empty list per row
    df["persons_in_sentence"] = [[] for _ in range(len(df))]

# 3) Helpers

def _norm(text: str) -> str:
    if not isinstance(text, str): return ""
    text = text.lower()
    text = unicodedata.normalize("NFKD", text).encode("ASCII", "ignore").decode("utf-8")
    return re.sub(r"[^\w\s]", "", text).strip()

def get_lat(entity):
    """Latitude stored in the gazetteer for the normalised entity, or None."""
    record = gazetteer.get(_norm(entity), {})
    return record.get("lat")


def get_lon(entity):
    """Longitude stored in the gazetteer for the normalised entity, or None."""
    record = gazetteer.get(_norm(entity), {})
    return record.get("lon")


def country_valid(entity):
    """True when the normalised entity is a key of the gazetteer."""
    key = _norm(entity)
    return key in gazetteer_set

# Named-object (vehicle/boat/etc.) signals from an optional log file.
# When outputs/object_filtered.csv exists and has an "entity" column, its lower-cased
# values are treated as known named objects; any read error leaves the set empty.
obj_log = outputs / "object_filtered.csv"
named_obj = set()
if obj_log.exists():
    try:
        _obj = pd.read_csv(obj_log)
        if "entity" in _obj.columns:
            named_obj = set(_obj["entity"].dropna().str.lower().unique())
    except Exception:
        pass

# Vehicle vocabulary (English + Spanish) used by is_named_object_context.
# Note: this rebinds the VEHICLE_TERMS name that Block 7 also defines.
VEHICLE_TERMS = {
    "motorcycle","motorbike","bike","bicycle","moto","motocicleta",
    "boat","ship","barco","lancha","car","truck","jeep","bus","camion","train","plane","avion"
}

def is_named_object_context(entity: str, sentence: str) -> bool:
    """True when the entity is a known named object or appears near a vehicle term.

    "Near" means a token containing the lower-cased entity lies within six
    tokens of a token whose lemma or text is a vehicle term.
    """
    needle = entity.lower().strip()
    if needle in named_obj:
        return True
    doc = nlp(sentence)
    total = len(doc)
    for pos, tok in enumerate(doc):
        is_vehicle = tok.lemma_.lower() in VEHICLE_TERMS or tok.text.lower() in VEHICLE_TERMS
        if not is_vehicle:
            continue
        window = doc[max(0, pos-6):min(total, pos+7)]
        for near in window:
            if needle in near.text.lower():
                return True
    return False

# Movement / symbolic cues (lightweight, mixed English + Spanish).
# Note: these assignments replace the WordNet-derived MOVEMENT_VERBS / SYMBOLIC_VERBS
# sets built in earlier cells with small hand-written lists.
MOVEMENT_VERBS = {
    "travel","go","arrive","leave","depart","walk","ride","sail","drive","cross","reach","head","return",
    "ir","llegar","salir","partir","caminar","andar","montar","navegar","conducir","cruzar","alcanzar","volver"
}
SYMBOLIC_VERBS = {"govern","rule","dominate","represent","symbolize","embody","gobernar","dominar","representar","simbolizar","encarnar"}

def movement_verb_present(sentence: str, entity: str, persons: list[str]) -> bool:
    """True when a movement verb is attached to the entity or to a listed person.

    For each token whose lemma is a movement verb, the verb itself, its
    children and its head are checked for text containing the entity or any
    person name (all compared lower-cased). Non-string sentences give False.
    """
    if not isinstance(sentence, str):
        return False
    doc = nlp(sentence)
    needle = entity.lower()
    people = [name.lower() for name in (persons or [])]

    def _mentions(token) -> bool:
        lowered = token.text.lower()
        return needle in lowered or any(name in lowered for name in people)

    for verb in doc:
        if verb.lemma_.lower() not in MOVEMENT_VERBS:
            continue
        neighbourhood = [verb, *verb.children, verb.head]
        if any(_mentions(tok) for tok in neighbourhood):
            return True
    return False

def symbolic_context(sentence: str, entity: str, persons: list[str]) -> bool:
    """True when a symbolic verb is attached to the entity or to a listed person.

    For each token whose lemma is a symbolic verb, the verb itself, its
    children and its head are checked for text containing the entity or any
    person name (all compared lower-cased). Non-string sentences give False.
    """
    if not isinstance(sentence, str):
        return False
    doc = nlp(sentence)
    needle = entity.lower()
    people = [name.lower() for name in (persons or [])]
    for verb in doc:
        if verb.lemma_.lower() not in SYMBOLIC_VERBS:
            continue
        for tok in [verb, *verb.children, verb.head]:
            lowered = tok.text.lower()
            if needle in lowered or any(name in lowered for name in people):
                return True
    return False

# Reuse a metonymy flag exported by Block 7 if the CSV has one; otherwise define a local detector.
# Note: the local function takes two arguments and rebinds the three-argument
# is_probable_metonymy defined in Block 7.
if "metonymy_flagged" in df.columns:
    _has_block7_meta = True
else:
    _has_block7_meta = False
    # Nouns that suggest a place name stands for an institution
    CUE_WORDS = {
        "government","policy","military","regime","parliament","industry",
        "media","revolution","capital","press","organization","power"
    }
    def is_probable_metonymy(entity_text: str, sentence: str) -> bool:
        """True when a cue noun lies within 10 tokens of a token containing the entity text."""
        doc = nlp(sentence)
        ent_l = entity_text.lower()
        entity_tokens = [t for t in doc if ent_l in t.text.lower()]
        for token in doc:
            # relies on token.pos_ being set by the pipeline
            if token.text.lower() in CUE_WORDS and token.pos_ == "NOUN":
                for ent_token in entity_tokens:
                    if abs(token.i - ent_token.i) <= 10:
                        return True
        return False


# 4) Compute enrichment columns

# Coordinates and gazetteer membership are looked up again from the normalised entity;
# this overwrites any lat/lon columns already present in the loaded CSV.
df["lat"] = df["entity"].apply(get_lat)
df["lon"] = df["entity"].apply(get_lon)
df["country_valid"] = df["entity"].apply(country_valid)

# named object context flag (helps keep “La Poderosa/Pedrosa” out downstream)
df["named_object_flag"] = df.apply(lambda r: is_named_object_context(r["entity"], r["sentence"]), axis=1)

# Row-wise context cues; each call parses the sentence again
df["movement_verb_present"] = df.apply(lambda r: movement_verb_present(r["sentence"], r["entity"], r["persons_in_sentence"]), axis=1)
df["symbolic_context"] = df.apply(lambda r: symbolic_context(r["sentence"], r["entity"], r["persons_in_sentence"]), axis=1)

if _has_block7_meta:
    # Already set by Block 7
    df["metonymy_flagged"] = df["metonymy_flagged"].astype(bool)
else:
    df["metonymy_flagged"] = df.apply(lambda r: is_probable_metonymy(r["entity"], r["sentence"]), axis=1)


# 5) Lightweight final label (no leakage into ML features)

# Note: this is for analysis & downstream use; ML in Block 10 will not use it as a feature.
def final_label(row):
    """Map a row's rule-based flags to "LITERAL", "SYMBOLIC" or "NOISE".

    Precedence, highest first:
      1. named-object context                          -> "NOISE"
      2. symbolic cue, or movement verb with metonymy  -> "SYMBOLIC"
      3. gazetteer-valid and not metonymic             -> "LITERAL"
      4. anything else                                 -> "NOISE"
    """
    if not row["named_object_flag"]:
        is_symbolic = row["symbolic_context"] or (
            row["movement_verb_present"] and row["metonymy_flagged"]
        )
        if is_symbolic:
            return "SYMBOLIC"
        if row["country_valid"] and not row["metonymy_flagged"]:
            return "LITERAL"
    # Named objects and everything unmatched fall through to here.
    return "NOISE"

# Label every row with the rule-based class; axis=1 passes each row to final_label.
df["final_label"] = df.apply(final_label, axis=1)


# 6) Metadata: year & transport

def extract_year(sentence: str):
    """Return the first standalone four-digit year (19xx or 20xx) in `sentence`.

    The year is returned as a string. None is returned when `sentence` is not
    a string or contains no such year.
    """
    if isinstance(sentence, str):
        found = re.search(r"\b(19\d{2}|20\d{2})\b", sentence)
        if found:
            return found.group(0)
    return None

# Lemmas recognised as modes of transport (English plus unaccented Spanish).
# NOTE(review): "a pie" spans two words, so it presumably never equals a single
# token lemma in extract_transport; confirm whether phrase matching is wanted.
TRANSPORT = {
    # English
    "motorcycle", "motorbike", "bike", "bicycle", "bus", "truck", "car",
    "jeep", "boat", "ship", "raft", "train", "plane", "foot",
    # Spanish ("bus" and "jeep" are already listed above)
    "moto", "bicicleta", "camion", "coche", "auto", "barco", "lancha",
    "tren", "avion", "a pie",
}


def extract_transport(sentence: str):
    """List the transport nouns mentioned in `sentence`.

    A token counts when it is tagged NOUN and its lowercase lemma is in
    TRANSPORT. The lowercase surface forms are de-duplicated, sorted and
    joined with ", ". Returns None for non-string input or when nothing
    matches.
    """
    if not isinstance(sentence, str):
        return None

    mentions = set()
    for tok in nlp(sentence):
        if tok.pos_ == "NOUN" and tok.lemma_.lower() in TRANSPORT:
            mentions.add(tok.text.lower())

    if not mentions:
        return None
    return ", ".join(sorted(mentions))

df["year"] = df["sentence"].apply(extract_year)
df["transport"] = df["sentence"].apply(extract_transport)
df["people_involved"] = df["persons_in_sentence"].apply(lambda xs: ", ".join(xs) if xs else None)

# Keep columns tidy if present.
# - "people_involved" is computed just above and read by the map cell (Block 12),
#   so it must be whitelisted; otherwise the selection below silently drops it.
# - "entity_is_valid" is kept when an earlier block provides it, because
#   Block 10 filters on that column after reloading this CSV.
preferred_cols = [
    "sentence_id","entity","entity_norm","label","lat","lon","country_valid","entity_is_valid",
    "symbolic_context","movement_verb_present","metonymy_flagged","named_object_flag",
    "final_label","year","transport","people_involved",
    "start_char","end_char","sentence","persons_in_sentence"
]
df = df[[c for c in preferred_cols if c in df.columns]]


# 7) Save

out_path = outputs / "geoparsing_final_enriched.csv"
out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path, index=False)
print(f"Enriched data saved → {out_path}  ({len(df)} rows)")


📥 Using Block 7 output: outputs/geoparsing_ner_ensemble.csv  (SAMPLE=False)
✅ Enriched data saved → outputs/geoparsing_final_enriched.csv  (406 rows)


In [None]:


# Block 10: ML Classification (Clean, Symbolic-aware, Person-aware) ===

# Load data
df = pd.read_csv("outputs/geoparsing_final_enriched.csv")
df = df[df["final_label"].isin(["LITERAL", "SYMBOLIC", "NOISE"])].copy()

# The enriched CSV does not always carry `entity_is_valid`; indexing it
# unconditionally raised KeyError. Apply the validity filter only when the
# column is available, and say so when it is skipped.
if "entity_is_valid" in df.columns:
    df = df[df["entity_is_valid"] == True].copy()
else:
    print("⚠️ Column 'entity_is_valid' not found; skipping validity filter")

# A LITERAL row without coordinates cannot be placed on the map; drop it.
df = df[~((df["final_label"] == "LITERAL") & (df["lat"].isna() | df["lon"].isna()))].copy()

# Gazetteer: only the keys (city names) of the JSON object are used.
# The encoding is explicit so non-ASCII place names load the same on every platform.
with open("outputs/gazetteer_cities.json", "r", encoding="utf-8") as f:
    gazetteer = set(json.load(f).keys())

# Load SpaCy model with vectors.
# spaCy raises OSError when the model package is missing; catching only that
# (instead of a bare `except:`) lets unrelated errors and KeyboardInterrupt
# propagate unchanged, and `from err` keeps the original cause in the traceback.
try:
    nlp = spacy.load("en_core_web_md")
except OSError as err:
    raise ValueError(
        "Model 'en_core_web_md' not found. Run: python -m spacy download en_core_web_md"
    ) from err

# Regex helpers: both patterns are applied to lowercased text.
month_regex = r"\b(january|february|march|april|may|june|july|august|september|october|november|december)\b"
year_regex = r"\b(19|20)\d{2}\b"


def contains_date_mention(text: str) -> bool:
    """Return True if `text` contains an English month name or a 19xx/20xx year."""
    lowered = text.lower()
    for pattern in (month_regex, year_regex):
        if re.search(pattern, lowered):
            return True
    return False

# Lowercase names of regions (as opposed to point-like places).
REGION_LIKE = {
    "andes",
    "amazon",
    "patagonia",
    "altiplano",
    "la plata",
    "pampas",
    "amazonas",
    "chaco",
    "conosur",
}


def is_region_like(entity):
    """Return True if `entity`, lowercased and stripped, is a known region name."""
    normalized = entity.lower().strip()
    return normalized in REGION_LIKE

# Substrings whose presence in a sentence marks a historical/political context.
historical_keywords = {
    "revolution",
    "regime",
    "independence",
    "liberation",
    "martyr",
    "battle",
    "hero",
    "military",
    "colonial",
    "freedom",
    "leader",
    "movement",
}


def verb_density(sentence):
    """Return the share of tokens tagged VERB in `sentence` (0 for an empty parse)."""
    doc = nlp(sentence)
    n_verbs = len([tok for tok in doc if tok.pos_ == "VERB"])
    # `or 1` guards the division when the parse has no tokens.
    return n_verbs / (len(doc) or 1)


def name_is_person(entity):
    """Return True if the NER pipeline tags any span of `entity` as PERSON."""
    for span in nlp(entity).ents:
        if span.label_ == "PERSON":
            return True
    return False

# Feature Extractor
def _parse_person_names(value):
    """Coerce a `persons_in_sentence` cell into a list of name strings.

    After a CSV round trip the column holds the textual repr of a list
    (e.g. "['Ernesto', 'Alberto']"), not a list, so it is parsed back with
    ast.literal_eval. Missing cells (NaN), unparsable text and non-list values
    give an empty list; non-string elements are dropped.
    """
    import ast  # local import keeps this cell self-contained

    if isinstance(value, str):
        try:
            value = ast.literal_eval(value)
        except (ValueError, SyntaxError, TypeError):
            return []
    if not isinstance(value, (list, tuple, set)):
        return []
    return [name for name in value if isinstance(name, str)]


def extract_final_features(row):
    """Build the feature dict for one candidate entity.

    Args:
        row: Mapping/Series with at least "entity", "sentence",
            "persons_in_sentence" and "start_char".

    Returns:
        Dict of surface, context and similarity features for DictVectorizer.

    Raises:
        Any exception caused by malformed cells (e.g. an empty entity or a
        non-string sentence); the calling loop catches these and skips the row.
    """
    entity = row["entity"]
    sentence = row["sentence"]
    # Previously only real lists were accepted, so rows loaded from CSV always
    # produced an empty name list and `person_like_fuzzy` was constantly False.
    person_names = _parse_person_names(row["persons_in_sentence"])

    doc = nlp(sentence)
    ent_doc = nlp(entity)
    # similarity() needs non-zero vectors on both sides; fall back to 0.0 otherwise.
    sim = doc.similarity(ent_doc) if doc.vector_norm and ent_doc.vector_norm else 0.0

    return {
        "entity_len": len(entity),
        "sentence_len": len(sentence),
        "entity_capital_ratio": sum(c.isupper() for c in entity) / (len(entity) or 1),
        "starts_with_cap": entity[0].isupper(),
        "has_digits": any(c.isdigit() for c in entity),
        "in_quotes": '"' in sentence or "'" in sentence,
        "person_like_fuzzy": any(fuzz.token_set_ratio(entity.lower(), p.lower()) > 85 for p in person_names),
        "mentions_date": contains_date_mention(sentence),
        "gazetteer_match": int(entity.lower() in gazetteer),
        "entity_position_ratio": row["start_char"] / (len(sentence) or 1),
        "entity_sentence_sim": sim,
        "historical_context": any(word in sentence.lower() for word in historical_keywords),
        "is_region_like": is_region_like(entity),
        "verb_density": verb_density(sentence),
        "name_is_person": name_is_person(entity)
    }

# Feature Extraction: one feature dict per row; rows that fail are reported and dropped.
X_dict_list, valid_rows = [], []

for i, row in df.iterrows():
    try:
        row_features = extract_final_features(row)
    except Exception as e:
        print(f"⚠️ Skipped row {i}: {e}")
    else:
        X_dict_list.append(row_features)
        valid_rows.append(i)

# Keep only the rows that produced features, renumbered from 0 so positions
# line up with X_dict_list.
df = df.loc[valid_rows].reset_index(drop=True)
y = df["final_label"].reset_index(drop=True)

# Vectorize features
vec = DictVectorizer(sparse=False)
X = vec.fit_transform(X_dict_list)

# Outlier filtering
# NOTE(review): the forest is fitted on all rows, i.e. before the train/test
# split, so held-out rows influence which rows are kept; confirm this is intended.
iso = IsolationForest(contamination=0.05, random_state=42)
mask = iso.fit_predict(X) == 1
X = X[mask]
df = df.loc[mask].reset_index(drop=True)

print(f" Outlier removal: Kept {len(X)} rows")

# Encode labels AFTER outlier removal. Encoding first and masking afterwards
# leaves a gap in the class ids (e.g. {0, 2}) whenever the filter removes every
# row of one class, and XGBClassifier requires ids 0..n_classes-1. When all
# classes survive, the resulting codes are identical to encoding first.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df["final_label"])

# Save indices to track test rows
df["original_index"] = df.index

# Train/test split (stratified on the encoded label)
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X, y_encoded, df["original_index"].values, test_size=0.2, stratify=y_encoded, random_state=42
)

# Train XGBoost model on the training split and predict the held-out split.
clf = XGBClassifier(
    n_estimators=120,
    use_label_encoder=False,
    eval_metric="mlogloss",
    random_state=42,
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Decode predictions: map integer class ids back to the string labels.
_decode = label_encoder.inverse_transform
y_test_labels = _decode(y_test)
y_pred_labels = _decode(y_pred)

# Evaluation on the held-out split
print("Classification Report:")
print(classification_report(y_test_labels, y_pred_labels))
print("Confusion Matrix:")
print(confusion_matrix(y_test_labels, y_pred_labels))

# Feature Importances: (feature name, importance) pairs, most important first.
importances = sorted(
    zip(vec.get_feature_names_out(), clf.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)

_feature_names = [name for name, _ in importances]
_feature_scores = [score for _, score in importances]

plt.figure(figsize=(9, 6))
plt.barh(_feature_names, _feature_scores)
plt.title("Feature Importances (Final Model)")
# barh draws the first item at the bottom; invert so the top feature is on top.
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

# Save test predictions: the held-out rows plus the model's decoded label.
df_test = df.loc[idx_test].assign(ml_prediction=y_pred_labels)
df_test.to_csv("outputs/geoparser_ml_predictions_leakfree.csv", index=False)
print(" Saved final predictions to outputs/geoparser_ml_predictions_leakfree.csv")


KeyError: 'entity_is_valid'

In [None]:
# Block 12: Interactive Map with Hover Metadata
def generate_hover_info(row):
    """Build the HTML snippet shown in a marker popup.

    Args:
        row: Mapping/Series that may contain "people_involved", "transport"
            and "year".

    Returns:
        The available fields joined with "<br>", or a "No metadata"
        placeholder when none is present.
    """
    def _display_value(value):
        # Empty CSV cells come back from pd.read_csv as float NaN, which is
        # truthy and used to be rendered as the text "nan".
        if value is None or (isinstance(value, float) and value != value):
            return ""
        # A column mixing years and NaN is read as float; show 1952, not 1952.0.
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        return str(value).strip() if value else ""

    people = _display_value(row.get("people_involved", ""))
    transport = _display_value(row.get("transport", ""))
    year = _display_value(row.get("year", ""))

    parts = []
    if people: parts.append(f"👥 People: {people}")
    if transport: parts.append(f"🚗 Transport: {transport}")
    if year: parts.append(f"📅 Year: {year}")
    return "<br>".join(parts) if parts else "ℹ️ No metadata"

df = pd.read_csv("outputs/geoparser_ml_predictions_leakfree.csv")
df = df[(df["ml_prediction"] == "LITERAL") & df["lat"].notna() & df["lon"].notna()].copy()

# The predictions file is written in the shuffled order of the test split.
# Restore the pre-split row order so the route and the "#n" tooltips follow a
# stable sequence instead of a random one.
# NOTE(review): assumes the pre-split order is the intended route order; confirm.
if "original_index" in df.columns:
    df = df.sort_values("original_index", kind="stable")
df = df.reset_index(drop=True)

if df.empty:
    # df.iloc[0] below would raise IndexError on an empty frame.
    print("⚠️ No LITERAL predictions with coordinates; interactive map not generated")
else:
    df["hover_text"] = df.apply(generate_hover_info, axis=1)

    # Centre the map on the first stop.
    start_lat, start_lon = df.iloc[0]["lat"], df.iloc[0]["lon"]
    m = folium.Map(location=[start_lat, start_lon], zoom_start=4, tiles="CartoDB positron")

    # Animated polyline through all stops, in row order.
    route_coords = df[["lat", "lon"]].values.tolist()
    AntPath(route_coords, color="red", weight=3, delay=1000).add_to(m)

    for i, row in df.iterrows():
        folium.CircleMarker(
            location=[row["lat"], row["lon"]],
            radius=6,
            color="blue",
            fill=True,
            fill_opacity=0.8,
            popup=folium.Popup(row["hover_text"], max_width=300),
            tooltip=f"#{i+1}: {row['entity']}"
        ).add_to(m)

    m.save("outputs/interactive_geoparsing_map.html")
    print("Interactive map saved to outputs/interactive_geoparsing_map.html")


FileNotFoundError: [Errno 2] No such file or directory: 'outputs/geoparser_ml_predictions_leakfree.csv'