# [Setup]
Block 1 (light preprocessing)
Block 2 (gazetteer + spacy/stanza)
Block 3 (NER functions)
Block 7 (WordNet vague terms)
Block 8 (motion/transport terms)
Block 9 (sentence scoring prep)

# [Pipeline]
Block 1 (continue with PDF preprocessing)
Block 4 (load cleaned sentences)
Block 5 (NER + ML filtering)
Block 6 (boosting small towns)
Block 10 (GeoNames + Wikidata)
Block 11 (Wikidata enrichment)
Block 12 (clustering)


In [4]:
# === Pre-Block: Downloads & Setup ===
import nltk

nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("framenet_v17")


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alicja/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/alicja/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/alicja/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package framenet_v17 to
[nltk_data]     /Users/alicja/nltk_data...
[nltk_data]   Package framenet_v17 is already up-to-date!


True

In [43]:
# === Block 1: Imports ===

# Standard
import os
import re
import time
import unicodedata
import yaml
import json
# Third-party
import requests
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from rapidfuzz import fuzz
from fuzzywuzzy import fuzz
import contractions
import fitz  # PyMuPDF


# NLP
from nltk.corpus import stopwords, wordnet as wn, framenet as fn

# Helpers
from nlp_helpers import (
    init_nlp,
    get_stopwords,
    tag_named_entities,
    extract_text_from_pdf,
    load_config,
    normalize_punctuation,  
    clean_light,
    preprocess_text,
    segment_sentences,
    clean_heavy
)
from gazetteer_helpers import build_gazetteer
from typing import List, Tuple, Dict, Set, Union
from tqdm import tqdm
  # Preview output
from IPython.display import display
from geopy.distance import geodesic

# === NLP Initialization ===
nlp, stanza_pipeline = init_nlp()

import spacy
nlp = spacy.load("en_core_web_sm")



2025-08-01 12:16:17 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-08-01 12:16:17 INFO: Downloaded file to /Users/alicja/stanza_resources/resources.json
2025-08-01 12:16:18 INFO: Loading these models for language: en (English):
| Processor | Package                   |
-----------------------------------------
| tokenize  | combined                  |
| mwt       | combined                  |
| ner       | ontonotes-ww-multi_charlm |

2025-08-01 12:16:18 INFO: Using device: cpu
2025-08-01 12:16:18 INFO: Loading: tokenize
2025-08-01 12:16:18 INFO: Loading: mwt
2025-08-01 12:16:18 INFO: Loading: ner
2025-08-01 12:16:22 INFO: Done loading processors!


In [2]:

# Block 2: Extract text from PDF using PyMuPDF.  pages 28-148

def extract_text_from_pdf(pdf_path: str, start_page: int, end_page: int) -> str:
    """Extracts and returns text from a PDF given a path and page range."""
    doc = fitz.open(pdf_path)
    pages = doc[start_page:end_page]
    return "\n".join(page.get_text() for page in pages)



#Load configuration from YAML file
def load_config(config_path: str = "config.yaml") -> dict:
    """Loads YAML configuration file and returns it as a dictionary."""
    with open(config_path, "r") as file:
        
        return yaml.safe_load(file)
    
config = load_config()
pdf_conf = config["pdf"]
raw_text = extract_text_from_pdf(pdf_conf["path"], pdf_conf["start_page"], pdf_conf["end_page"])




In [3]:
# === Block 3: Text Preprocessing Functions ===

from nlp_helpers import (
    init_nlp,
    get_stopwords,
    tag_named_entities,
    extract_text_from_pdf,
    load_config,
    normalize_punctuation,
    clean_light,
    preprocess_text,
    segment_sentences,
    clean_heavy
)

In [31]:
# === Block 4: Clean and Save Tagged + NLP Versions ===

# Load config
config = load_config()
pdf_conf = config["pdf"]
gaz_conf = config["gazetteer"]

# Output filenames based on input PDF
base_name = Path(pdf_conf["path"]).stem
tagged_path = Path("outputs") / f"cleaned_{base_name}_geoparsing.txt"
heavy_path = Path("outputs") / f"cleaned_{base_name}_nlp.txt"

# Create outputs dir if needed
os.makedirs("outputs", exist_ok=True)

# 🧹 Clean raw text (light and heavy)
light_cleaned = clean_light(raw_text)
sentences = segment_sentences(light_cleaned, nlp)
sentences_with_tags = [f"[SENT {i+1}] {s}" for i, s in enumerate(sentences)]

all_stops = get_stopwords(nlp)
heavy_cleaned = clean_heavy(light_cleaned, nlp, all_stops)

# 💾 Save both cleaned versions
with open(tagged_path, "w", encoding="utf-8") as f:
    f.write("\n".join(sentences_with_tags))

with open(heavy_path, "w", encoding="utf-8") as f:
    f.write(heavy_cleaned)

print("✅ Cleaned outputs saved:")
print(f"- Geoparsing (light): {tagged_path}")
print(f"- NLP prep (heavy): {heavy_path}")

# ✅ Prepare sentence_data for Block 7
sentence_data = [(i, sent.strip()) for i, sent in enumerate(sentences)]
print(f"✅ sentence_data prepared with {len(sentence_data)} narrative sentences.")



✅ Cleaned outputs saved:
- Geoparsing (light): outputs/cleaned_MotorcycleDiaries_geoparsing.txt
- NLP prep (heavy): outputs/cleaned_MotorcycleDiaries_nlp.txt
✅ sentence_data prepared with 2407 narrative sentences.


In [41]:
# === Build symbolic verb lexicon from WordNet (only once)


def get_symbolic_verb_synonyms():
    base_words = [
        "dream", "hope", "struggle", "escape", "resist", "believe", "follow", 
        "ride", "rebel", "fight", "flee", "live", "return"
    ]
    synonyms = set()
    for word in base_words:
        for syn in wordnet.synsets(word, pos=wordnet.VERB):
            for lemma in syn.lemmas():
                synonyms.add(lemma.name().lower().replace("_", " "))
    return synonyms

# Save the expanded verb set to reuse
SYMBOLIC_VERBS = get_symbolic_verb_synonyms()
print(f"🧠 Loaded {len(SYMBOLIC_VERBS)} symbolic verb forms from WordNet.")

# === Build movement verb list using WordNet


def get_movement_verbs():
    base = ["go", "move", "travel", "walk", "drive", "ride", "arrive", "depart", "leave", "return", "cross", "fly", "sail"]
    move_verbs = set()
    for word in base:
        for syn in wordnet.synsets(word, pos=wordnet.VERB):
            for lemma in syn.lemmas():
                move_verbs.add(lemma.name().lower().replace("_", " "))
    return move_verbs

MOVEMENT_VERBS = get_movement_verbs()
print(f"🛣️ Loaded {len(MOVEMENT_VERBS)} movement verb forms from WordNet.")

🧠 Loaded 159 symbolic verb forms from WordNet.
🛣️ Loaded 205 movement verb forms from WordNet.


In [6]:
# === Block 6: Build Gazetteer ===


# 🔐 JSON safety patch
def make_json_safe(obj):
    if isinstance(obj, set):
        return list(obj)
    raise TypeError(f"❌ Not JSON serializable: {type(obj)}")

# 📥 Load config.yaml
with open("config.yaml", "r") as f:
    config = yaml.safe_load(f)
gaz_conf = config["gazetteer"]

# 📁 Gazetteer path
gazetteer_path = Path("outputs/gazetteer_cities.json")

# ⚙️ Build or load gazetteer
if gazetteer_path.exists():
    print("📂 Found existing gazetteer file, loading...")
    with open(gazetteer_path, "r") as f:
        gazetteer = json.load(f)
else:
    print("🌍 No gazetteer file found, building...")
    gazetteer = build_gazetteer(
        username=gaz_conf["username"],
        countries=gaz_conf["countries"],
        max_rows=gaz_conf["max_rows"]
    )
    with open(gazetteer_path, "w") as f:
        json.dump(gazetteer, f, indent=2, default=make_json_safe)  # 🔐 here
    print("✅ Gazetteer built and saved with coordinates.")

📂 Found existing gazetteer file, loading...


In [32]:
# === Block 7: NER + Gazetteer with Metonymy Filtering (Thesis-Ready) ===
# 📚 Metonymy-aware NER inspired by Gritta et al. (2018)
# GitHub: https://github.com/milangritta/WhatsMissingInGeoparsing

USE_STANZA = True  # Toggle Stanza NER support

stop_words = get_stopwords(nlp)

def extract_entities_spacy(text: str) -> List[Tuple[str, str, int, int]]:
    doc = nlp(text)
    return [(ent.text, ent.label_, ent.start_char, ent.end_char) for ent in doc.ents]

def extract_entities_stanza(text: str) -> List[Tuple[str, str, int, int]]:
    doc = stanza_pipeline(text)
    results = []
    for sent in doc.sentences:
        for ent in sent.ents:
            results.append((ent.text, ent.type, ent.start_char, ent.end_char))
    return results

def match_gazetteer(text: str, known_places: set[str]) -> List[Tuple[str, str, int, int]]:
    text_lower = text.lower()
    matches = []
    known_places_sorted = sorted([p for p in known_places if len(p) > 3 and p not in stop_words], key=len, reverse=True)

    for place in known_places_sorted:
        pattern = r'\b{}\b'.format(re.escape(place.lower()))
        for m in re.finditer(pattern, text_lower):
            start, end = m.start(), m.end()
            match_text = text[start:end].strip(".,;:!?()[]{}0123456789 ")
            if match_text:
                matches.append((match_text, "GAZETTEER", start, end))
    return matches

# === Enhanced Metonymy Detection Function ===
def is_probable_metonymy(entity_text: str, sentence: str) -> bool:
    """
    Detect whether a location entity is used metonymically (non-literal).
    - Looks for cue words near the entity in the sentence
    - Checks that cue is a noun (e.g. 'government', 'industry')
    """
    doc = nlp(sentence)
    entity_tokens = [t for t in doc if entity_text.lower() in t.text.lower()]
    cue_words = {
        "government", "policy", "military", "regime", "parliament", "industry",
        "media", "revolution", "capital", "press", "organization", "power"
    }

    for i, token in enumerate(doc):
        if token.text.lower() in cue_words and token.pos_ == "NOUN":
            # Check distance to entity
            for ent_token in entity_tokens:
                if abs(token.i - ent_token.i) <= 10:
                    return True
    return False

# === Main Pipeline ===
def combine_ner_gazetteer(
    sentences: List[Tuple[int, str]], 
    gazetteer: set[str]
) -> List[Dict]:
    """
    Combines entities from spaCy, optional Stanza, and gazetteer pattern matching.
    Filters to location-type entities only: GPE, LOC, and GAZETTEER.
    Applies metonymy-aware filtering. Stores all useful metadata per entity.
    """
    allowed_labels = {"GPE", "LOC", "GAZETTEER"}
    results = []
    metonymy_filtered = []

    # Pre-tag persons per sentence
    persons_by_sentence = {
        sid: [ent.text for ent in nlp(text).ents if ent.label_ == "PERSON"]
        for sid, text in sentences
    }

    for sent_id, text in tqdm(sentences, desc="NER + Gazetteer"):
        try:
            ents_spacy = extract_entities_spacy(text)
            ents_stanza = extract_entities_stanza(text) if USE_STANZA else []
            ents_gazetteer = match_gazetteer(text, gazetteer)

            all_ents = []
            for ent_text, label, start, end in ents_spacy + ents_stanza + ents_gazetteer:
                if label in allowed_labels:
                    if not is_probable_metonymy(ent_text, text):
                        all_ents.append((ent_text, label, start, end))
                    else:
                        metonymy_filtered.append({
                            "sentence_id": sent_id,
                            "entity": ent_text,
                            "label": label,
                            "sentence": text
                        })

            # Deduplicate person list
            raw_persons = persons_by_sentence.get(sent_id, [])
            cleaned_persons = []
            seen_persons = set()
            for p in raw_persons:
                p_clean = re.sub(r"\d+$", "", p).strip()
                norm = p_clean.lower()
                if not any(fuzz.ratio(norm, s) > 90 for s in seen_persons):
                    seen_persons.add(norm)
                    cleaned_persons.append(p_clean)

            # Store results
            seen = set()
            for ent_text, label, start, end in all_ents:
                norm = ent_text.lower()
                if not any(fuzz.ratio(norm, s) > 90 for s in seen):
                    seen.add(norm)
                    results.append({
                        "sentence_id": sent_id,
                        "entity": ent_text,
                        "entity_norm": norm,
                        "label": label,
                        "start_char": start,
                        "end_char": end,
                        "sentence": text,
                        "persons_in_sentence": cleaned_persons
                    })
        except Exception as e:
            print(f"❌ Error in sentence {sent_id}: {e}")

    # Save metonymy-flagged entities for analysis
    if metonymy_filtered:
        pd.DataFrame(metonymy_filtered).to_csv("outputs/metonymy_filtered.csv", index=False)
        print("📤 Logged metonymy-filtered entities to: outputs/metonymy_filtered.csv")

    return results

def remove_overlapping_shorter(df: pd.DataFrame) -> pd.DataFrame:
    clean = []
    for sid in df["sentence_id"].unique():
        sent_df = df[df["sentence_id"] == sid].sort_values("start_char")
        to_keep = []
        last_end = -1
        for _, row in sent_df.iterrows():
            if row["start_char"] >= last_end:
                to_keep.append(row)
                last_end = row["end_char"]
        clean.append(pd.DataFrame(to_keep))
    return pd.concat(clean, ignore_index=True)

# === Run full pipeline ===
print("🔎 Running ensemble NER + gazetteer matching (with metonymy awareness)...")
entities_combined = combine_ner_gazetteer(sentence_data, gazetteer)

df_combined = pd.DataFrame(entities_combined)
df_combined = remove_overlapping_shorter(df_combined)

df_combined.to_csv("outputs/geoparsing_ner_ensemble.csv", index=False)
print("✅ Saved: outputs/geoparsing_ner_ensemble.csv")


🔎 Running ensemble NER + gazetteer matching (with metonymy awareness)...


NER + Gazetteer:   0%|          | 2/2407 [00:02<52:53,  1.32s/it]


KeyboardInterrupt: 

In [33]:
# === TEST BLOCK: Sample NER + Gazetteer Run ===

def test_ner_pipeline(n: int = 100):
    """
    Runs NER + gazetteer matching on a sample of n sentences.
    """
    sample_sentences = sentence_data[:n]
    print(f"🧪 Testing on first {n} sentences...")

    results = combine_ner_gazetteer(sample_sentences, gazetteer)
    df_test = pd.DataFrame(results)

    output_path = Path("outputs") / "geoparsing_ner_sample_test.csv"
    df_test.to_csv(output_path, index=False)
    print(f"✅ Test results saved to: {output_path}")

# ✅ Run this
test_ner_pipeline(n=100)


🧪 Testing on first 100 sentences...


NER + Gazetteer: 100%|██████████| 100/100 [02:00<00:00,  1.21s/it]

📤 Logged metonymy-filtered entities to: outputs/metonymy_filtered.csv
✅ Test results saved to: outputs/geoparsing_ner_sample_test.csv





In [46]:
#actual blok 8 rule-based filtering 

# Filter only relevant, valid entities using rule-based filtering
# === Block 8: Manual Gazetteer Enrichment ===
# ✅ Rule-Based Filtering of Enriched Gazetteer Output


## === Block 8: Rule-Based Filtering + Symbolic Enrichment (Final Thesis-Ready) ===

# Load gazetteer
with open("outputs/gazetteer_cities.json", "r") as f:
    gazetteer = json.load(f)
gazetteer_set = set(gazetteer.keys())

# Load Block 7 output
df_combined = pd.read_csv("outputs/geoparsing_ner_sample_test.csv")
df_combined["persons_in_sentence"] = df_combined["persons_in_sentence"].apply(eval)

# === Gazetteer info ===
def get_lat(entity): return gazetteer.get(entity.lower(), {}).get("lat")
def get_lon(entity): return gazetteer.get(entity.lower(), {}).get("lon")
def country_valid(entity): return entity.lower() in gazetteer_set
def symbolic_flagged(entity, persons):
    return any(fuzz.token_set_ratio(entity.lower(), p.lower()) > 85 for p in persons)

# === Label Fix (safe relabel)
def normalize(text):
    text = text.lower()
    text = unicodedata.normalize("NFKD", text).encode("ASCII", "ignore").decode("utf-8")
    return re.sub(r"[^\w\s]", "", text)

def relabel_as_person(row):
    if row["label"] not in {"GPE", "LOC", "GAZETTEER"}:
        return row["label"]
    ent = normalize(row["entity"])
    if ent.lower() in gazetteer_set:
        return row["label"]
    for p in row["persons_in_sentence"]:
        if fuzz.token_set_ratio(ent, normalize(p)) >= 85:
            return "PERSON"
    return row["label"]


def movement_verb_present(row):
    doc = nlp(row["sentence"])
    ent = row["entity"].lower()
    persons = [p.lower() for p in row["persons_in_sentence"]]
    for token in doc:
        if token.lemma_.lower() in MOVEMENT_VERBS:
            context = [token] + list(token.children) + [token.head]
            for t in context:
                t_text = t.text.lower()
                if ent in t_text or any(p in t_text for p in persons):
                    return True
    return False

# === Symbolic Context via WordNet-enhanced Verbs
def advanced_symbolic_context(row):
    doc = nlp(row["sentence"])
    ent_text = row["entity"].lower()
    persons = [p.lower() for p in row["persons_in_sentence"]]
    for token in doc:
        if token.lemma_.lower() in SYMBOLIC_VERBS:
            related_tokens = [token] + list(token.children) + [token.head]
            for t in related_tokens:
                t_lower = t.text.lower()
                if ent_text in t_lower:
                    return True
                for p in persons:
                    if p in t_lower:
                        return True
    return False

# === Metonymy filter
def is_probable_metonymy(entity_text: str, sentence: str) -> bool:
    cue_words = {
        "government", "policy", "military", "regime", "parliament", "industry",
        "media", "revolution", "capital", "press", "organization", "power"
    }
    doc = nlp(sentence)
    entity_tokens = [t for t in doc if entity_text.lower() in t.text.lower()]
    for token in doc:
        if token.text.lower() in cue_words and token.pos_ == "NOUN":
            for ent_token in entity_tokens:
                if abs(token.i - ent_token.i) <= 10:
                    return True
    return False

# === Apply enrichments
df_combined["lat"] = df_combined["entity"].apply(get_lat)
df_combined["lon"] = df_combined["entity"].apply(get_lon)
df_combined["country_valid"] = df_combined["entity"].apply(country_valid)
df_combined["symbolic_flagged"] = df_combined.apply(
    lambda row: symbolic_flagged(row["entity"], row["persons_in_sentence"]), axis=1
)
df_combined["label"] = df_combined.apply(relabel_as_person, axis=1)
df_combined["symbolic_context"] = df_combined.apply(advanced_symbolic_context, axis=1)
df_combined["metonymy_flagged"] = df_combined.apply(
    lambda row: is_probable_metonymy(row["entity"], row["sentence"]), axis=1
)
df_combined["movement_verb_present"] = df_combined.apply(movement_verb_present, axis=1)

# === Regional Consistency Tracking (3-sentence memory)

def is_regionally_consistent(current_row, history, max_distance_km=500):
    if not pd.notnull(current_row["lat"]) or not pd.notnull(current_row["lon"]):
        return False
    current_coords = (current_row["lat"], current_row["lon"])

    for past in history:
        if not pd.notnull(past["lat"]) or not pd.notnull(past["lon"]):
            continue
        past_coords = (past["lat"], past["lon"])
        try:
            if geodesic(current_coords, past_coords).km <= max_distance_km:
                return True
        except ValueError:
            continue
    return False


region_history = []
consistency_flags = []

for _, row in df_combined.iterrows():
    flag = is_regionally_consistent(row, region_history)
    consistency_flags.append(flag)
    if row["label"] in {"GPE", "LOC", "GAZETTEER"} and row["country_valid"]:
        region_history.append(row)
        if len(region_history) > 3:
            region_history.pop(0)

df_combined["regional_consistency_flag"] = consistency_flags

# === Final symbolic score
df_combined["symbolic_score"] = (
    df_combined["symbolic_flagged"].astype(int)
    + df_combined["symbolic_context"].astype(int)
    + df_combined["movement_verb_present"].astype(int)
    + df_combined["regional_consistency_flag"].astype(int)
    - df_combined["metonymy_flagged"].astype(int)
)

df_combined["symbolic_confidence"] = df_combined["symbolic_score"].apply(
    lambda x: "high" if x >= 2 else ("low" if x > 0 else "none")
)

# === Final label (for ML)
def final_label(row):
    if row["symbolic_score"] >= 1:
        return "SYMBOLIC"
    elif row["label"] in {"GPE", "LOC", "GAZETTEER"} and row["country_valid"]:
        return "LITERAL"
    else:
        return "NOISE"

df_combined["final_label"] = df_combined.apply(final_label, axis=1)

# === Save result
df_combined.to_csv("outputs/geoparsing_final_enriched.csv", index=False)
print("✅ Final enriched symbolic data saved to: outputs/geoparsing_final_enriched.csv")


✅ Final enriched symbolic data saved to: outputs/geoparsing_final_enriched.csv


In [None]:
## !!! Ignore this for now 
# Block x: Train ML model to filter real geographic entities
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

# Load ensemble NER output
df = pd.read_csv("outputs/geoparsing_ner_ensemble.csv")

# Create features
df["label_GPE"] = (df["label"] == "GPE").astype(int)
df["label_LOC"] = (df["label"] == "LOC").astype(int)

symbolic_keywords = ["freedom", "struggle", "liberation", "future", "dream", "cause", "revolution", "hope", "people"]
df["symbolic_flagged"] = df["sentence"].str.contains("|".join(symbolic_keywords), flags=re.IGNORECASE, na=False)

expected_countries = ["argentina", "chile", "peru", "bolivia", "colombia", "venezuela"]
df["country_valid"] = df["sentence"].str.lower().apply(
    lambda x: int(any(country in x for country in expected_countries))
)

df["fuzzy_score"] = df.apply(
    lambda row: fuzz.ratio(str(row["entity"]).lower(), str(row["entity_norm"]).lower()), axis=1
)
df["fuzzy_score_scaled"] = df["fuzzy_score"] / 100.0

df["auto_label"] = ((df["country_valid"] == 1) & (~df["symbolic_flagged"])).astype(int)

# Train/test split
features = df[["label_GPE", "label_LOC", "symbolic_flagged", "country_valid", "fuzzy_score_scaled"]]
target = df["auto_label"]

X_train, X_test, y_train, y_test = train_test_split(features, target, stratify=target, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

clf = LogisticRegression()
clf.fit(X_train_scaled, y_train)

y_pred = clf.predict(X_test_scaled)
print(" Classification Report:")
print(classification_report(y_test, y_pred))

# Predict on all
X_all_scaled = scaler.transform(features)
df["geo_confidence"] = clf.predict_proba(X_all_scaled)[:, 1]
df["filtered_out_ml"] = df["geo_confidence"] < 0.5

# Save outputs
df_filtered = df[~df["filtered_out_ml"]].copy()
df.to_csv("outputs/geoparsing_ensemble_flagged_with_ml.csv", index=False)
df_filtered.to_csv("outputs/geoparsing_ensemble_final_ml_filtered.csv", index=False)

print(" ML filtering complete")


 Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       198
           1       1.00      1.00      1.00        35

    accuracy                           1.00       233
   macro avg       1.00      1.00      1.00       233
weighted avg       1.00      1.00      1.00       233

 ML filtering complete


In [None]:
# Block 9: Boost confidence for small towns in South America
df = pd.read_csv("outputs/geoparsing_ensemble_final_ml_filtered.csv")

# Define target countries
target_countries = [
    "argentina", "chile", "peru", "bolivia", "colombia",
    "venezuela", "ecuador", "brazil", "uruguay", "paraguay"
]

# Prepare for matching
df["entity_norm_lower"] = df["entity_norm"].str.lower()

# Try to load population-enriched data
try:
    enriched = pd.read_csv("outputs/geoparsing_final_enriched.csv")
    enriched["entity_norm_lower"] = enriched["entity_norm"].str.lower()

    df = df.merge(
        enriched[["entity_norm_lower", "country", "population"]],
        on="entity_norm_lower", how="left"
    )

    # Mark small towns
    df["boost_small_town"] = (
        df["population"].fillna(0).lt(50000) &
        df["country"].str.lower().isin(target_countries)
    )
    print(f" Boosted {df['boost_small_town'].sum()} small towns")

except Exception as e:
    print(f" Could not load enriched data: {e}")
    df["boost_small_town"] = False

# Apply confidence boost
df["geo_confidence_boosted"] = df["geo_confidence"]
df.loc[df["boost_small_town"], "geo_confidence_boosted"] = 0.9

# Save result
df.to_csv("outputs/geoparsing_ensemble_final_ml_boosted.csv", index=False)
print("Saved: geoparsing_ensemble_final_ml_boosted.csv")


 Could not load enriched data: "['country'] not in index"
Saved: geoparsing_ensemble_final_ml_boosted.csv


In [None]:

# Block 10: vague term filtering (ML + WordNet) + South America region restriction

from nltk.corpus import wordnet as wn
import pandas as pd

df = pd.read_csv("outputs/geoparsing_ensemble_final_ml_boosted.csv")

#  Normalize entity name 
df["entity_norm"] = df["entity_norm"].astype(str).str.lower().str.strip()

# === STEP 1: ML-learned vague terms ===
term_stats = df.groupby("entity_norm").agg({
    "geo_confidence_boosted": "mean",
    "entity": "count"
}).rename(columns={
    "entity": "freq",
    "geo_confidence_boosted": "avg_conf"
}).reset_index()

learned_vague = term_stats[
    (term_stats["freq"] >= 3) &
    (term_stats["avg_conf"] < 0.35)
]["entity_norm"].tolist()
print(f" ML-learned vague terms: {len(learned_vague)}")

# === STEP 2: WordNet vague terms ===
location_synsets = [
    wn.synset("location.n.01"),
    wn.synset("region.n.01"),
    wn.synset("area.n.01"),
    wn.synset("place.n.01"),
    wn.synset("territory.n.01")
]

vague_terms_wordnet = set()
for syn in location_synsets:
    for hypo in syn.closure(lambda s: s.hyponyms()):
        for lemma in hypo.lemmas():
            vague_terms_wordnet.add(lemma.name().lower().replace("_", " "))
print(f" WordNet vague terms: {len(vague_terms_wordnet)}")

# === STEP 3: Combined vague term filtering ===
df["is_vague_combined"] = df.apply(
    lambda row: (
        row["entity_norm"] in learned_vague or
        (row["label"] == "LOC" and row["entity_norm"] in vague_terms_wordnet)
    ),
    axis=1
)

df_filtered = df[~df["is_vague_combined"]].copy()
print(f" Removed {df['is_vague_combined'].sum()} entities (ML or WordNet flagged)")

# === STEP 4: South America region restriction ===
sa_countries = {
    "argentina", "bolivia", "brazil", "chile", "colombia",
    "ecuador", "guyana", "paraguay", "peru", "suriname", "uruguay", "venezuela"
}

if "country" in df_filtered.columns:
    df_filtered["country"] = df_filtered["country"].astype(str).str.lower()
    df_filtered = df_filtered[df_filtered["country"].isin(sa_countries)].copy()
    print(f"🌎 After SA region filter: {len(df_filtered)} rows")
else:
    print(" 'country' column not found. Cannot apply region filter.")

# === Final Save ===
df_filtered.to_csv("outputs/geoparsing_ner_ensemble_filtered_southamerica.csv", index=False)
print(" Saved: geoparsing_ner_ensemble_filtered_southamerica.csv")



 ML-learned vague terms: 0
 WordNet vague terms: 1786
 Removed 0 entities (ML or WordNet flagged)
 'country' column not found. Cannot apply region filter.
 Saved: geoparsing_ner_ensemble_filtered_southamerica.csv


In [None]:
# Block 11 : Contextual Scoring with enhanced inputs
import spacy
import re
import dateparser.search
from tqdm import tqdm
from nltk.corpus import wordnet as wn, framenet as fn

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# === Load motion verbs ===
motion_frames = ['Motion', 'Travel', 'Self_motion', 'Arriving', 'Departing']
motion_verbs = set()
for frame in motion_frames:
    try:
        for lu in fn.frame_by_name(frame).lexUnit.values():
            if lu['name'].endswith('.v'):
                motion_verbs.add(lu['name'].split('.')[0].lower())
    except:
        continue

# === Load transport terms ===
vehicle_syn = wn.synset('vehicle.n.01')
transport_terms = set()
for syn in vehicle_syn.closure(lambda s: s.hyponyms()):
    for lemma in syn.lemmas():
        transport_terms.add(lemma.name().lower().replace('_', ' '))

# === Load filtered file after SA restriction ===
df = pd.read_csv("outputs/geoparsing_ner_ensemble_filtered_southamerica.csv")

# === Load sentence map ===
with open("outputs/cleaned_motorcycle_diaries_geoparsing.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()
sentence_map = {}
for line in lines:
    if line.strip().startswith("[SENT"):
        sid = int(line.split("]")[0].split()[1])
        sentence_map[sid] = line.split("]")[1].strip()

# === Scoring logic ===
scored_rows = []
for _, row in tqdm(df.iterrows(), total=len(df), desc="Scoring entities"):
    sid = row["sentence_id"]
    entity = row["entity"]
    norm = row["entity_norm"]
    label = row["label"]
    sentence = sentence_map.get(sid, "")

    doc = nlp(sentence)
    score = 0
    entity_token = None

    for ent in doc.ents:
        if ent.label_ in {"GPE", "LOC"} and ent.text.lower().strip() == entity.lower().strip():
            entity_token = ent.root
            break

    if not entity_token:
        continue

    sentence_lower = sentence.lower()
    entity_lower = entity.lower()

    if any(verb in sentence_lower for verb in motion_verbs):
        score += 1
    if any(term in sentence_lower for term in transport_terms):
        score += 1
    if re.search(r'\b(in|to|at)\s+' + re.escape(entity_lower) + r'\b', sentence_lower):
        score += 1
    if re.search(r"\b" + re.escape(entity_lower) + r"['’]s\b", sentence_lower):
        score -= 1
    if re.search(r"\bof\s+" + re.escape(entity_lower) + r"\b", sentence_lower):
        score -= 1
    if re.search(r"\bfor\s+" + re.escape(entity_lower) + r"\b", sentence_lower):
        score -= 1
    if dateparser.search.search_dates(sentence):
        score += 1

    scored_rows.append({
        "sentence_id": sid,
        "entity": entity,
        "entity_norm": norm,
        "label": label,
        "sentence": sentence,
        "score": score,
        "latitude": row.get("latitude"),
        "longitude": row.get("longitude"),
        "country": row.get("country")
    })

scored_df = pd.DataFrame(scored_rows)
scored_df = scored_df.sort_values(by=["score", "sentence_id"], ascending=[False, True])
scored_df.to_csv("outputs/geoparsing_scored_candidates.csv", index=False)
print("📄 Saved: outputs/geoparsing_scored_candidates.csv")




  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
Scoring entities: 100%|██████████| 176/176 [00:15<00:00, 11.13it/s]

📄 Saved: outputs/geoparsing_scored_candidates.csv





In [None]:
# Block 12: Enrich entities with coordinates via GeoNames → Wikidata → OSM
import time
import requests
import pandas as pd
from tqdm import tqdm

# === Load scored candidates ===
df = pd.read_csv("outputs/geoparsing_scored_candidates.csv")
places = df["entity_norm"].dropna().unique()
geonames_username = "alicjab"

geo_data = {}

# === Step 1: GeoNames ===
print("🌍 Querying GeoNames...")
for place in tqdm(places, desc="GeoNames queries"):
    try:
        params = {"q": place, "maxRows": 1, "username": geonames_username}
        r = requests.get("http://api.geonames.org/searchJSON", params=params, timeout=10).json()
        if not r.get("geonames"):
            raise ValueError("No result")
        g = r["geonames"][0]
        geo_data[place] = {
            "latitude": float(g["lat"]),
            "longitude": float(g["lng"]),
            "country": g.get("countryName"),
            "population": int(g.get("population", 0))
        }
    except Exception:
        geo_data[place] = {"latitude": None, "longitude": None, "country": None, "population": None}
    time.sleep(1)  # GeoNames rate limit

geo_df = pd.DataFrame.from_dict(geo_data, orient="index")
geo_df.index.name = "entity_norm"
geo_df.reset_index(inplace=True)

# === Merge GeoNames results ===
df_enriched = df.merge(geo_df, on="entity_norm", how="left")

# === Step 2: Wikidata fallback ===
print("🔁 Querying Wikidata for missing coordinates...")
if "latitude" not in df_enriched.columns:
    df_enriched["latitude"] = None
    df_enriched["longitude"] = None

missing_places = df_enriched[df_enriched["latitude"].isna()]["entity_norm"].dropna().unique()

def query_wikidata_coords(place):
    try:
        search_url = "https://www.wikidata.org/w/api.php"
        search_params = {
            "action": "wbsearchentities",
            "search": place,
            "language": "en",
            "format": "json"
        }
        r = requests.get(search_url, params=search_params, timeout=10).json()
        if not r["search"]:
            return {"entity_norm": place, "wikidata_lat": None, "wikidata_lon": None}
        qid = r["search"][0]["id"]
        entity = requests.get(f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json").json()
        coords = entity["entities"][qid]["claims"].get("P625", [{}])[0].get("mainsnak", {}).get("datavalue", {}).get("value", {})
        return {"entity_norm": place, "wikidata_lat": coords.get("latitude"), "wikidata_lon": coords.get("longitude")}
    except Exception:
        return {"entity_norm": place, "wikidata_lat": None, "wikidata_lon": None}

wikidata_results = pd.DataFrame([query_wikidata_coords(p) for p in tqdm(missing_places, desc="Wikidata queries")])
df_enriched = df_enriched.merge(wikidata_results, on="entity_norm", how="left")
df_enriched["latitude"] = df_enriched["latitude"].combine_first(df_enriched["wikidata_lat"])
df_enriched["longitude"] = df_enriched["longitude"].combine_first(df_enriched["wikidata_lon"])

# === Step 3: OSM fallback ===
print("🧭 Querying OpenStreetMap for unresolved locations...")
nominatim_places = df_enriched[df_enriched["latitude"].isna()]["entity_norm"].dropna().unique()

def query_osm(place):
    try:
        r = requests.get(
            "https://nominatim.openstreetmap.org/search",
            params={"q": place, "format": "json", "limit": 1},
            headers={"User-Agent": "Geoparser/1.0"},
            timeout=10
        ).json()
        if not r:
            return {"entity_norm": place, "osm_lat": None, "osm_lon": None}
        return {
            "entity_norm": place,
            "osm_lat": float(r[0]["lat"]),
            "osm_lon": float(r[0]["lon"])
        }
    except Exception:
        return {"entity_norm": place, "osm_lat": None, "osm_lon": None}

osm_results = pd.DataFrame([query_osm(p) for p in tqdm(nominatim_places, desc="OSM queries")])
df_enriched = df_enriched.merge(osm_results, on="entity_norm", how="left")
df_enriched["latitude"] = df_enriched["latitude"].combine_first(df_enriched["osm_lat"])
df_enriched["longitude"] = df_enriched["longitude"].combine_first(df_enriched["osm_lon"])


df_enriched.to_csv("outputs/geoparsing_final_enriched.csv", index=False)
print("✅ Final enriched file saved: geoparsing_final_enriched.csv")


🌍 Querying GeoNames...


GeoNames queries: 100%|██████████| 43/43 [00:55<00:00,  1.29s/it]


🔁 Querying Wikidata for missing coordinates...


Wikidata queries: 100%|██████████| 43/43 [00:35<00:00,  1.22it/s]
  df_enriched["latitude"] = df_enriched["latitude"].combine_first(df_enriched["wikidata_lat"])
  df_enriched["longitude"] = df_enriched["longitude"].combine_first(df_enriched["wikidata_lon"])


🧭 Querying OpenStreetMap for unresolved locations...


OSM queries: 100%|██████████| 16/16 [00:11<00:00,  1.38it/s]

✅ Final enriched file saved: geoparsing_final_enriched.csv





In [None]:
# Block 13: Geographic Outlier Filtering using DBSCAN (corrected input + centroid-aware reweighting + continent filtering)
from sklearn.cluster import DBSCAN
import numpy as np
import pandas as pd
from geopy.distance import geodesic
from tqdm import tqdm
import reverse_geocoder as rg


df = pd.read_csv("outputs/geoparsing_final_enriched.csv")

# Drop rows without coordinates
df_geo = df.dropna(subset=["latitude", "longitude"]).copy()
coords_rad = np.radians(df_geo[["latitude", "longitude"]].values)

# Cluster with DBSCAN using haversine distance (in radians)
clustering = DBSCAN(eps=0.5, min_samples=2, metric='haversine')
df_geo["geo_cluster"] = clustering.fit_predict(coords_rad)

# === Compute centroids of each valid cluster ===
centroids = (
    df_geo[df_geo["geo_cluster"] != -1]
    .groupby("geo_cluster")[["latitude", "longitude"]]
    .mean()
    .to_dict("index")
)

# === Reweight scores for outliers ===
outliers = df_geo[df_geo["geo_cluster"] == -1].copy()
non_outliers = df_geo[df_geo["geo_cluster"] != -1].copy()

reweighted = []
for _, row in tqdm(outliers.iterrows(), total=len(outliers), desc="↩️ Reweighting outliers"):
    min_dist_km = float("inf")
    entity_point = (row["latitude"], row["longitude"])

    for c in centroids.values():
        centroid_point = (c["latitude"], c["longitude"])
        dist = geodesic(entity_point, centroid_point).km
        if dist < min_dist_km:
            min_dist_km = dist

    # If outlier is <500 km from any cluster, keep it with downgraded score
    if min_dist_km < 500:
        row["geo_cluster"] = -2  # kept but marked as downgraded outlier
        row["geo_score_adjusted"] = row.get("score", 0) - 1
        reweighted.append(row)

# Combine cleaned + downgraded outliers
final_df = pd.concat([non_outliers, pd.DataFrame(reweighted)], ignore_index=True)
print(f" Final post-reweighting size: {len(final_df)} rows")

# === Filter by continent (keep only South America) ===
def get_continent(lat, lon):
    try:
        results = rg.search((lat, lon), mode=1)
        cc = results[0]['cc']
        # ISO country codes in South America
        south_america = {
            "AR", "BO", "BR", "CL", "CO", "EC", "GY", "PY", "PE", "SR", "UY", "VE",  # South America
             "MX", "GT", "HN", "SV", "NI", "CR", "PA"  # Central America / Mesoamerica
        }
        return 'South America' if cc in south_america else 'Other'
    except:
        return 'Unknown'

print("🌎 Filtering by continent...")
final_df["continent"] = final_df.apply(
    lambda row: get_continent(row["latitude"], row["longitude"]), axis=1
)

before_filter = len(final_df)
final_df = final_df[final_df["continent"] == "South America"].copy()
print(f"🌍 Removed {before_filter - len(final_df)} non-South American entries")

# Save final output
final_df.to_csv("outputs/geoparsing_final_scored_clustered.csv", index=False)
print("Saved final cleaned + reweighted + region-filtered version: geoparsing_final_scored_clustered.csv")

↩️ Reweighting outliers: 100%|██████████| 3/3 [00:00<00:00, 2712.42it/s]


 Final post-reweighting size: 107 rows
🌎 Filtering by continent...
Loading formatted geocoded file...
🌍 Removed 11 non-South American entries
Saved final cleaned + reweighted + region-filtered version: geoparsing_final_scored_clustered.csv


In [None]:
#Block 14: Visualize clustered results using Plotly

import pandas as pd
import plotly.express as px

# Load clustered data
df = pd.read_csv("outputs/geoparsing_final_scored_clustered.csv")

# Drop NaNs (should already be clean)
df = df.dropna(subset=["latitude", "longitude"])

# Rename cluster column for consistency with Plotly
df["cluster"] = df["geo_cluster"]

# Create hover label
df["hover"] = df["entity"] + " (cluster " + df["cluster"].astype(str) + ")"

# Basic scatter geo map
fig = px.scatter_geo(
    df,
    lat="latitude",
    lon="longitude",
    text="entity",
    hover_name="hover",
    color="cluster",
    title="Clustered Location Mentions from Text",
    projection="natural earth"
)

fig.update_traces(marker=dict(size=6))
fig.update_layout(legend_title_text='Cluster ID')
fig.show()

